### Tokenization and Removing Stopwords

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

Using TensorFlow backend.


In [3]:
rawData = pd.read_csv(r'C:/Users/vatsa/OneDrive/Desktop/Reddit-Flair-Detection/RedditData/Data/balanced.csv')

rawData.drop(['Url', 'Score', 'Publish Date'
             ,'Total No. of Comments', 'Body'], axis = 1, inplace=True) 

rawData = rawData[rawData['flair'].notna()] # Dropping all submissions that don't have a tagged flair

rawData.head(2)

Unnamed: 0,Title,flair,text
0,TIL: The British named their emergency plans f...,Non-Political,TIL: The British named their emergency plans f...
1,[NSFW] is there decent desi porn online?,AskIndia,[NSFW] is there decent desi porn online? Shoul...


### Text Cleaning

In [4]:
bad_chars = ['/r', 'r/', '[R]'] 
for i in bad_chars : 
    rawData['flair'] = rawData['flair'].str.replace(i, '')

In [5]:
# def remove_punctuation(text):
#     no_punct = "".join([c for c in text if c not in string.punctuation])
#     return no_punct

# rawData['Title'] = rawData['Title'].apply(lambda x:remove_punctuation(x))
# rawData['text'] = rawData['text'].apply(lambda x:remove_punctuation(x))
# rawData['flair'] = rawData['flair'].apply(lambda x:remove_punctuation(x))

In [6]:
# tokenizer = RegexpTokenizer(r'\w+')
# rawData['text'] = rawData['text'].apply(lambda x: tokenizer.tokenize(x.lower()))
# rawData['flair'] = rawData['flair'].apply(lambda x: tokenizer.tokenize(x.lower()))

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = str(text)
    text = text.lower() 
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = text.replace('x', '')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

rawData['Title'] = rawData['Title'].apply(clean_text)
rawData['text'] = rawData['text'].apply(clean_text)
rawData['flair'] = rawData['flair'].apply(clean_text)

In [7]:
rawData['flair'].value_counts()
# rawData = rawData.groupby('flair').head(2606).reset_index(drop=True)
# rawData['flair'].value_counts()

nonpolitical        68547
politics            59255
askindia            28205
policy economy      13015
business finance     6647
Name: flair, dtype: int64

### LSTM

In [8]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(rawData['Title'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = tokenizer.texts_to_sequences(rawData['Title'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(rawData['flair']).values
print('Shape of label tensor:', Y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

Found 69438 unique tokens.
Shape of data tensor: (175669, 250)
Shape of label tensor: (175669, 5)
(158102, 250) (158102, 5)
(17567, 250) (17567, 5)


In [9]:
from keras.models import Model
from keras.layers import Input
from keras.layers.merge import concatenate

In [11]:
# del model

In [13]:
# define input
visible = Input(shape=(X.shape[1],))

emb = Embedding(MAX_NB_WORDS, EMBEDDING_DIM)(visible)

# feature extraction
extract1 = LSTM(10)(emb)
# first interpretation model
interp21 = Dense(10, activation='relu')(extract1)
interp22 = Dense(20, activation='relu')(interp21)
drp = Dropout(0.25)(interp22)
interp23 = Dense(10, activation='relu')(drp)
# second interpretation model
interp11 = Dense(10, activation='relu')(extract1)
interp12 = Dense(20, activation='relu')(interp11)
drp = Dropout(0.25)(interp11)
interp13 = Dense(10, activation='relu')(drp)
# merge interpretation
merge = concatenate([interp23, interp13])
# output
output = Dense(5, activation='softmax')(merge)
model = Model(inputs=visible, outputs=output)
# summarize layers
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 250, 100)     5000000     input_2[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 10)           4440        embedding_2[0][0]                
__________________________________________________________________________________________________
dense_8 (Dense)                 (None, 10)           110         lstm_2[0][0]                     
____________________________________________________________________________________________

In [11]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(100,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(5, activation='softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 150)               150600    
_________________________________________________________________
dense_4 (Dense)              (None, 100)               15100     
_________________________________________________________________
dense_5 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 255       
Total params: 5,171,005
Trainable params: 5,171,005
Non-trainable params: 0
____________________________________________

In [13]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

In [14]:
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 142291 samples, validate on 15811 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Factorization

In [15]:
from io import StringIO

rawData['flair_id'] = rawData['flair'].factorize()[0]
category_id_df = rawData[['flair', 'flair_id']].drop_duplicates().sort_values('flair_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['flair', 'flair_id']].values)
rawData.head(2)

Unnamed: 0,Title,flair,text,flair_id
0,amazon peer peer selling np,nonpolitical,amazon peer peer selling np,0
1,people went partying tonight,askindia,people went partying tonight dear randians sin...,1


### Train-Test Split

In [16]:
from sklearn.model_selection import train_test_split

X = rawData.text
y = rawData.flair_id
#y = pd.get_dummies(rawData['flair']).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 100)

### Vectorization

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(stop_words='english').fit(X_train)

tfidf_train = vect.transform(X_train)
tfidf_test = vect.transform(X_test)

X_train_vect = pd.DataFrame(tfidf_train.todense(), columns=vect.get_feature_names())
X_test_vect = pd.DataFrame(tfidf_test.todense(), columns=vect.get_feature_names())

# Used for fitting on the whole model later
total_vect = TfidfVectorizer(stop_words='english').fit(rawData['text'])
tfidf = total_vect.transform(rawData['text'])
rawData_vect = pd.DataFrame(tfidf.todense(), columns=total_vect.get_feature_names())

MemoryError: Unable to allocate array with shape (108415, 124655) and data type float64

In [None]:
print("X_train_vect has {} rows and {} columns and y_train also has {} rows."
      .format(X_train_vect.shape[0], X_train_vect.shape[1], y_train.shape[0]))

print("X_test_vect has {} rows and {} columns and y_train also has {} rows."
      .format(X_test_vect.shape[0], X_test_vect.shape[1], y_test.shape[0]))

### Classification

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

clf = MultinomialNB().fit(X_train_vect, y_train)

y_pred = clf.predict(X_test_vect)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.5882578664620107
              precision    recall  f1-score   support

           0       0.53      0.91      0.67       517
           1       0.65      0.66      0.66       530
           2       0.59      0.32      0.42       509
           3       0.61      0.72      0.66       505
           4       0.59      0.34      0.43       545

    accuracy                           0.59      2606
   macro avg       0.59      0.59      0.57      2606
weighted avg       0.60      0.59      0.57      2606



In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score, classification_report

# searcher = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# # tolerance = [0.01, 0.001, 0.0001, 0.00001]
# # m_iter = [50, 100, 150, 200, 250]
# # c = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
# # sol = ["newton-cg", "lbfgs", "sag"]

# # tolerance = [0.01]
# # m_iter = [50]
# # c = [0.001]
# # sol = ["newton-cg"]

# # grid = dict(tol = tolerance, 
# #             max_iter = m_iter,
# #             C = c,
# #             solver=sol)

# # searcher = GridSearchCV(classifier, 
# #                         param_grid = grid,
# #                         cv = 2, refit = True, verbose=1, n_jobs=-1)

# searcher.fit(X_train_vect, y_train)

# y_pred = searcher.predict(X_test_vect)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.model_selection import GridSearchCV

# from pactools import simulate_pac
# from pactools.grid_search import ExtractDriver, AddDriverDelay
# from pactools.grid_search import DARSklearn, MultipleArray
# from pactools.grid_search import GridSearchCVProgressBar

# rf = RandomForestClassifier()

# est = [50, 100, 150, 200, 250, 300, 350, 400]
# depth = [None, 5, 7, 9, 11, 13, 15]

# grid = dict(n_estimators = est, 
#             max_depth = depth)

# # searcher = GridSearchCV(rf, param_grid = grid,
# #                         cv = 2, refit = True, verbose=5, n_jobs=-1)

# searcher.fit(X_train_vect, y_train)

# y_pred = searcher.predict(X_test_vect)

# print(searcher.best_params_)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report

# rf = RandomForestClassifier(n_estimators = 100).fit(X_train_vect, y_train)

# y_pred = rf.predict(X_test_vect)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

In [None]:
# from xgboost import XGBClassifier 
# from sklearn.metrics import accuracy_score, classification_report

# clf = XGBClassifier(objective='multi:softmax')

# clf.fit(X_train_vect, y_train, verbose=True)

# y_pred = clf.predict(X_test_vect)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

In [None]:
# from sklearn.linear_model import SGDClassifier
# from sklearn.model_selection import GridSearchCV

# sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4, random_state=42, max_iter=18, tol=None)

# sgd.fit(X_train_vect, y_train)

# y_pred = sgd.predict(X_test_vect)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

sgd = SGDClassifier(loss='hinge', penalty='l2', random_state=42, tol=None)

grid = dict(alpha = [0.1, 0.01, 0.001, 0.0001, 0.00001], 
            max_iter = [10, 20, 50, 100])

searcher = GridSearchCV(sgd, param_grid = grid,
                        cv = 2, refit = True, verbose=10, n_jobs=-1)

searcher.fit(X_train_vect, y_train)

y_pred = searcher.predict(X_test_vect)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

Fitting 2 folds for each of 20 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 10.0min
