### Preprocessing

Importing required libraries and loading the dataset:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

In [2]:
rawData = pd.read_csv("/Users/skhiearth/Desktop/Reddit-Flair-Detection/RedditData/Data/balanced.csv")

rawData.drop(['Url', 'Score', 'Publish Date'
             ,'Total No. of Comments', 'Body'], axis = 1, inplace=True)

rawData = rawData[rawData['flair'].notna()] # Dropping all submissions that don't have a tagged flair

rawData.head(2)

Unnamed: 0,Title,flair,text
0,TIL: The British named their emergency plans f...,Non-Political,TIL: The British named their emergency plans f...
1,[NSFW] is there decent desi porn online?,AskIndia,[NSFW] is there decent desi porn online? Shoul...


### Text Cleaning

Remove Reddit tags and other features that might be present in the flair to make it easily understandable for people not familiar with Reddit.

In [3]:
bad_chars = ['/r', 'r/', '[R]'] 
for i in bad_chars : 
    rawData['flair'] = rawData['flair'].str.replace(i, '')

Remove punctuations from the text:

In [4]:
def remove_punctuation(text):
    text = str(text)
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

rawData['Title'] = rawData['Title'].apply(lambda x:remove_punctuation(x))
rawData['text'] = rawData['text'].apply(lambda x:remove_punctuation(x))
rawData['flair'] = rawData['flair'].apply(lambda x:remove_punctuation(x))

In [5]:
# tokenizer = RegexpTokenizer(r'\w+')
# rawData['text'] = rawData['text'].apply(lambda x: tokenizer.tokenize(x.lower()))
# rawData['flair'] = rawData['flair'].apply(lambda x: tokenizer.tokenize(x.lower()))

# REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
# BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# STOPWORDS = set(stopwords.words('english'))
# ps = nltk.PorterStemmer() # Defining the Porter Stemmer
# wn = nltk.WordNetLemmatizer() # Defining the Word Net Lemmatizer

# def clean_text(text):
#     text = str(text)
#     text = text.lower() 
#     text = REPLACE_BY_SPACE_RE.sub(' ', text)
#     text = BAD_SYMBOLS_RE.sub('', text)
#     text = text.replace('x', '')
#     text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords from text
#     return text

# rawData['Title'] = rawData['Title'].apply(clean_text)
# rawData['text'] = rawData['text'].apply(clean_text)
# rawData['flair'] = rawData['flair'].apply(clean_text)

Remove stopwords, use lemmatizer to keep only the root of the word to clean the data.

In [6]:
stopword = nltk.corpus.stopwords.words('english') # Defining Stopwords
ps = nltk.PorterStemmer() # Defining the Porter Stemmer
wn = nltk.WordNetLemmatizer() # Defining the Word Net Lemmatizer

def clean_text(text):
    text_nopunct = "".join([char.lower() for char in text if char not in string.punctuation])
    token = re.split("\W+", text_nopunct)
    text_nostopword = [word for word in token if word not in stopword]
    #clean_text = ' '.join([ps.stem(word) for word in text_nostopword] )
    clean_text = ' '.join([wn.lemmatize(word) for word in text_nostopword])
    return clean_text

rawData['Title'] = rawData['Title'].apply(clean_text)
rawData['text'] = rawData['text'].apply(clean_text)

In [7]:
rawData['flair'].value_counts()
#rawData = rawData.groupby('flair').head(10000).reset_index(drop=True)
#rawData['flair'].value_counts()

NonPolitical       131731
Politics           117326
AskIndia            57239
PolicyEconomy       20580
BusinessFinance     19304
Name: flair, dtype: int64

### LSTM

Define the tokenizer and fit it on the data to create bag of words model. Split this bag of words to training and testing sets for model fitting and evaluation.

In [8]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(rawData['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = tokenizer.texts_to_sequences(rawData['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(rawData['flair']).values
print('Shape of label tensor:', Y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.10, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

Found 196174 unique tokens.
Shape of data tensor: (346180, 250)
Shape of label tensor: (346180, 5)
(311562, 250) (311562, 5)
(34618, 250) (34618, 5)


In [54]:
def a(val):
    print(Y[val])
    print(rawData['text'].values[val])
    print(rawData['flair'].values[val])
    
a(844)

[1 0 0 0 0]
ril row apart oil price play spoiler ongcs kg plan 
AskIndia


Define model architecture and fit model on training data.

In [None]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

In [None]:
epochs = 5
batch_size = 64

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [9]:
import pickle
# model.save('../Models/LSTM.h5') -- Save model to disk

# with open('../Models/LSTMTokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) -- save tokenizer to disk

In [10]:
import tensorflow as tf
model = tf.keras.models.load_model('../Models/LSTM.h5')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [11]:
y_pred = model.predict(X_test)

y_test_max = np.argmax(y_test, axis=1)
y_pred = np.argmax(y_pred, axis=1)

from sklearn.metrics import accuracy_score, classification_report
print('accuracy %s' % accuracy_score(y_pred, y_test_max))
print(classification_report(y_test_max, y_pred))

accuracy 0.6981628054769196
              precision    recall  f1-score   support

           0       0.68      0.65      0.66      5867
           1       0.59      0.53      0.56      1922
           2       0.69      0.71      0.70     13242
           3       0.51      0.43      0.47      2052
           4       0.76      0.78      0.77     11535

    accuracy                           0.70     34618
   macro avg       0.65      0.62      0.63     34618
weighted avg       0.70      0.70      0.70     34618



#### The model has an average accuracy of roughly 70% on the test set, 83% while training and 72% on the validation set.

#### After predicting labels using the test set, we have to above mentioned report.

For the class `BusinessFinance`, we have a recall of 0.78, which means that 78% of total relevant results are correctly classified by our model. It also has a precision of 0.76. The model does best on this class.

The model performs worst on the `PolicyEconomy` class, followed by the `Politics` class.

### Factorization

In [None]:
from io import StringIO

rawData['flair_id'] = rawData['flair'].factorize()[0]
category_id_df = rawData[['flair', 'flair_id']].drop_duplicates().sort_values('flair_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['flair', 'flair_id']].values)
rawData.head(2)

### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

X = rawData.text
y = rawData.flair_id
#y = pd.get_dummies(rawData['flair']).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 100)

### Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(stop_words='english').fit(X_train)

tfidf_train = vect.transform(X_train)
tfidf_test = vect.transform(X_test)

X_train_vect = pd.DataFrame(tfidf_train.todense(), columns=vect.get_feature_names())
X_test_vect = pd.DataFrame(tfidf_test.todense(), columns=vect.get_feature_names())

# Used for fitting on the whole model later
total_vect = TfidfVectorizer(stop_words='english').fit(rawData['text'])
tfidf = total_vect.transform(rawData['text'])
rawData_vect = pd.DataFrame(tfidf.todense(), columns=total_vect.get_feature_names())

In [None]:
print("X_train_vect has {} rows and {} columns and y_train also has {} rows."
      .format(X_train_vect.shape[0], X_train_vect.shape[1], y_train.shape[0]))

print("X_test_vect has {} rows and {} columns and y_train also has {} rows."
      .format(X_test_vect.shape[0], X_test_vect.shape[1], y_test.shape[0]))

### Classification

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

clf = MultinomialNB().fit(X_train_vect, y_train)

y_pred = clf.predict(X_test_vect)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score, classification_report

# searcher = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# # tolerance = [0.01, 0.001, 0.0001, 0.00001]
# # m_iter = [50, 100, 150, 200, 250]
# # c = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
# # sol = ["newton-cg", "lbfgs", "sag"]

# # tolerance = [0.01]
# # m_iter = [50]
# # c = [0.001]
# # sol = ["newton-cg"]

# # grid = dict(tol = tolerance, 
# #             max_iter = m_iter,
# #             C = c,
# #             solver=sol)

# # searcher = GridSearchCV(classifier, 
# #                         param_grid = grid,
# #                         cv = 2, refit = True, verbose=1, n_jobs=-1)

# searcher.fit(X_train_vect, y_train)

# y_pred = searcher.predict(X_test_vect)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.model_selection import GridSearchCV

# from pactools import simulate_pac
# from pactools.grid_search import ExtractDriver, AddDriverDelay
# from pactools.grid_search import DARSklearn, MultipleArray
# from pactools.grid_search import GridSearchCVProgressBar

# rf = RandomForestClassifier()

# est = [50, 100, 150, 200, 250, 300, 350, 400]
# depth = [None, 5, 7, 9, 11, 13, 15]

# grid = dict(n_estimators = est, 
#             max_depth = depth)

# # searcher = GridSearchCV(rf, param_grid = grid,
# #                         cv = 2, refit = True, verbose=5, n_jobs=-1)

# searcher.fit(X_train_vect, y_train)

# y_pred = searcher.predict(X_test_vect)

# print(searcher.best_params_)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report

# rf = RandomForestClassifier(n_estimators = 100).fit(X_train_vect, y_train)

# y_pred = rf.predict(X_test_vect)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

In [None]:
# from xgboost import XGBClassifier 
# from sklearn.metrics import accuracy_score, classification_report

# clf = XGBClassifier(objective='multi:softmax')

# clf.fit(X_train_vect, y_train, verbose=True)

# y_pred = clf.predict(X_test_vect)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4, random_state=42, max_iter=18, tol=None)

sgd.fit(X_train_vect, y_train)

y_pred = sgd.predict(X_test_vect)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))