In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Loading the data

In [2]:
# Load the Kaggle train and test files and stack them (train rows first) so
# that every pre-processing step below is applied to both in a single pass.
train_df = pd.read_csv('./AskReddit Dataset/train.csv')
kaggle_test_df = pd.read_csv('./AskReddit Dataset/test.csv')
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the test rows reuse the train row labels, which makes label-based lookups
# and index-aligned assignments ambiguous.
df = pd.concat([train_df, kaggle_test_df], ignore_index=True)

# Feature Extraction

In [3]:
df.head()

Unnamed: 0,qid,question_text,target
0,a3dee568776c08512c89,What is the role of Lua in Civ4?,0.0
1,bdb84f519e7b46e7b7bb,What are important chapters in Kannada for 10 ...,0.0
2,29c88db470e2eb5c97ad,Do musicians get royalties from YouTube?,0.0
3,3387d99bf2c3227ae8f1,What is the difference between Scaling Social ...,0.0
4,e79fa5038f765d0f2e7e,Why do elevators go super slow right before th...,0.0


In [4]:
# Number of whitespace-separated words per question. split() with no argument
# collapses runs of whitespace, so doubled, leading or trailing spaces no
# longer add empty "words" to the count the way split(' ') does.
df['word_count'] = df['question_text'].apply(lambda s : len(s.split()))

In [5]:
def _avg_word_length(s):
    """Return the mean number of characters per whitespace-separated word.

    Only the characters inside words are counted (the separating spaces are
    not), and a text without any word yields 0.0 instead of dividing by zero.
    """
    words = s.split()
    if not words:
        return 0.0
    return sum(len(w) for w in words) / len(words)

df['avg_word_length'] = df['question_text'].apply(_avg_word_length)

In [6]:
df.head()

In [7]:
import seaborn as sns

# Pairwise correlation between the label and the two hand-crafted length features
feature_corr = df.loc[:, ['target', 'word_count', 'avg_word_length']].corr()

fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(feature_corr, annot=True, cmap=plt.cm.viridis, ax=ax)
plt.show()

# Exploratory Data Analysis and pre-processing

In [8]:
# Lower-case the text of every question
df['question_text'] = df['question_text'].map(str.lower)

In [9]:
df.head()

In [10]:
# Contraction -> expanded-form lookup (entries taken from Wikipedia's list).
# All keys are lower-case. A few keys ("cap’n", "’cept", "fo’c’sle", "’gainst")
# are written with the typographic apostrophe (’) while the others use the
# ASCII apostrophe ('), so a plain substring lookup only matches text that
# uses the same apostrophe character as the key.
# The dict literal ends after "’gainst"; the remaining Wikipedia entries are
# kept below as comments only and are not part of the mapping.
contractions = {
    "ain't": 'am not',
    "aren't": 'are not',
    "'bout" : 'about',
    "can't" : 'cannot',
    "cap’n" : 'captain',
    "'cause": 'because',
    "’cept" : 'except',
    "could've":"could have",
    "couldn't":"could not",
    "couldn't've":"could not have",
    "dammit":"damn it",
    "daren't":"dare not",
    "daresn't":"dare not",
    "dasn't":"dare not",
    "didn't":"did not",
    "doesn't":"does not",
    "don't":"do not",
    "dunno":"do not know",
    "d'ye":"do you",
    "e'en":"even",
    "e'er":"ever",
    "'em":"them",
    "everybody's":"everybody is",
    "everyone's":"everyone is",
    "fo’c’sle":"forecastle",
    "’gainst":"against"}
# "g'day (informal)	good day
# gimme (informal)	give me
# giv'n (informal)	given
# gonna (informal)	going to
# gon't (informal)	go not (colloquial)
# gotta (informal)	got to
# hadn't	had not
# had've	had have
# hasn't	has not
# haven't	have not
# he'd	he had / he would
# he'll	he shall / he will
# helluva (informal)	hell of a
# he's	he has / he is
# here's	here is
# how'd (informal)	how did / how would
# howdy (informal)	how do you do / how do you fare
# how'll	how will
# how're	how are
# how's	how has / how is / how does
# I'd	I had / I would
# I'd've	I would have
# I'd'nt	I would not
# I'd'nt've	I would not have
# I'll	I shall / I will
# I'm	I am
# Imma (informal)	I am about to/I am going to
# I'm'o (informal)	I am going to
# innit (informal)	is it not
# Ion (informal)	I don't / I do not
# I've	I have
# isn't	is not
# it'd	it would
# it'll	it shall / it will
# it's	it has / it is
# Idunno (informal)	I don't know
# kinda (informal)	kind of
# let's	let us
# ma'am (formal)	madam
# mayn't	may not
# may've	may have
# methinks (informal)	I think
# mightn't	might not
# might've	might have
# mustn't	must not
# mustn't've	must not have
# must've	must have
# ‘neath (informal)	beneath
# needn't	need not
# nal (informal)	and all
# ne'er (informal)	never
# o'clock	of the clock
# o'er	over
# ol'	old
# oughtn't	ought not
# ‘round	around
# 's	is, has, does, or us
# shalln't	shall not (archaic)
# shan't	shall not
# she'd	she had / she would
# she'll	she shall / she will
# she's	she has / she is
# should've	should have
# shouldn't	should not
# shouldn't've (informal)	should not have
# somebody's	somebody has / somebody is
# someone's	someone has / someone is
# something's	something has / something is
# so're (informal)	so are (colloquial)
# so’s (informal)	so is / so has
# so’ve (informal)	so have
# that'll	that shall / that will
# that're (informal)	that are
# that's	that has / that is
# that'd	that would / that had
# there'd	there had / there would
# there'll	there shall / there will
# there're	there are
# there's	there has / there is
# these're	these are
# these've	these have
# they'd	they had / they would
# they'll	they shall / they will
# they're	they are / they were
# they've	they have
# this's	this has / this is
# those're (informal)	those are
# those've (informal)	those have
# 'thout (informal)	without
# ’til (informal)	until
# 'tis (informal)	it is
# to've (informal)	to have
# 'twas (informal)	it was
# 'tween (informal)	between
# 'twere (informal)	it were
# wanna	want to
# wasn't	was not
# we'd	we had / we would/ we did
# we'd've	we would have
# we'll	we shall / we will
# we're	we are
# we've	we have
# weren't	were not
# whatcha	what are you (whatcha doing?)
# what about you (as in asking how someone is today, used as a greeting)

# what'd	what did
# what'll	what shall / what will
# what're	what are/what were
# what's	what has / what is / what does
# what've	what have
# when's	when has / when is
# where'd	where did
# where'll	where shall / where will
# where're	where are
# where's	where has / where is / where does
# where've	where have
# which'd	which had / which would
# which'll	which shall / which will
# which're	which are
# which's	which has / which is
# which've	which have
# who'd	who would / who had / who did
# who'd've	who would have
# who'll	who shall / who will
# who're	who are
# who's	who has / who is / who does
# who've	who have
# why'd	why did
# why're	why are
# why's	why has / why is / why does
# willn't	will not (archaic)
# won't	will not
# wonnot	will not (archaic)
# would've	would have
# wouldn't	would not
# wouldn't've	would not have
# y'all	you all (colloquial/Southern American English)
# y'all'd've	you all would have (colloquial/Southern American English)
# y'all'd'n't've	you all would not have (colloquial/Southern American English)
# y'all're	you all are (colloquial/Southern American English)
# y'all'ren't	you all are not (colloquial/Southern American English)
# y'at (informal)	you at
# yes’m	yes ma’am / yes madam
# yessir	yes sir
# you'd	you had / you would
# you'll	you shall / you will
# you're	you are
# # you've	you have
# }

In [11]:
# Expanding the contractions to get the actual words
def expand_contractions(s, mapping=None):
    """Replace every contraction found in ``s`` with its expanded form.

    Parameters
    ----------
    s : str
        Text to expand. Keys are matched as plain, case-sensitive substrings.
    mapping : dict, optional
        Contraction -> expansion lookup. Defaults to the module-level
        ``contractions`` dict.

    Returns
    -------
    str
        ``s`` with typographic apostrophes (’ and ‘) converted to ' and every
        key of ``mapping`` replaced by its value.
    """
    if mapping is None:
        mapping = contractions
    # Treat ’ / ‘ and ' as the same character on both sides of the lookup, so
    # "don’t" in the text matches a "don't" key and vice versa.
    to_ascii_apostrophe = str.maketrans({'’': "'", '‘': "'"})
    s = s.translate(to_ascii_apostrophe)
    lookup = {key.translate(to_ascii_apostrophe): value for key, value in mapping.items()}
    # Longest keys first: otherwise "couldn't" is replaced inside
    # "couldn't've" and leaves the unexpanded fragment "could not've".
    for key in sorted(lookup, key=len, reverse=True):
        if key:  # an empty key would insert the replacement between all characters
            s = s.replace(key, lookup[key])
    return s

In [12]:
# Expand contractions first, while the apostrophes are still in the text (the
# helper was defined above but never applied), then replace every character
# that is not an ASCII letter - punctuation, digits, symbols - with a space.
df['question_text'] = df['question_text'].apply(lambda x : re.sub('[^a-zA-Z]',' ',expand_contractions(x)))

In [13]:
# Collapse runs of whitespace into single spaces and trim both ends
def _squeeze_spaces(text):
    return ' '.join(text.split())

df['question_text'] = df['question_text'].map(_squeeze_spaces)

In [14]:
df.head()

## Stop words removal

In [15]:
from nltk.corpus import stopwords

# NLTK's English stop-word list, held in a set
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)
print(stop_words)

In [16]:
# Drop every token that appears in the stop-word set
def _drop_stop_words(text):
    kept = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept)

df['question_text'] = df['question_text'].apply(_drop_stop_words)

In [17]:
df.head()

## Tokenization of text

In [18]:
from nltk.tokenize import word_tokenize

# One list of tokens per cleaned question
df['tokenized'] = df['question_text'].map(word_tokenize)

In [19]:
df.head()

## Lemmatization of words

In [20]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def _lemmatize_tokens(tokens):
    """Lemmatize each token and join the results with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Lemmatized text, rebuilt as one space-separated string per question
df['lemmatized'] = df['tokenized'].apply(_lemmatize_tokens)

In [21]:
df.head()

In [22]:
# Troll questions only (target == 1); "neg" here names the troll class
neg_df = df[df['target'].eq(1)]

In [23]:
# Non-troll questions only (target == 0); "pos" here names the non-troll class
pos_df = df[df['target'].eq(0)]

## WordCloud

In [24]:
from wordcloud import WordCloud

# Word cloud built from all troll questions joined into one string
troll_text = ' '.join(neg_df['lemmatized'])
neg_word_cloud = WordCloud().generate(troll_text)

# Word cloud built from all non-troll questions joined into one string
non_troll_text = ' '.join(pos_df['lemmatized'])
pos_word_cloud = WordCloud().generate(non_troll_text)

In [25]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.imshow(neg_word_cloud)
plt.axis('off')
plt.show()

In [26]:
# Show the non-troll-question word cloud on its own axes, without ticks or frame
_, cloud_ax = plt.subplots()
cloud_ax.imshow(pos_word_cloud)
cloud_ax.axis('off')
plt.show()

# Vectorizers

## 1. CountVectorizer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2000)

In [28]:
X_cv = cv.fit_transform(df['lemmatized'])

In [29]:
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement,
# get_feature_names_out(), when it exists and fall back to the old method on
# releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    cv_feature_names = list(cv.get_feature_names_out())
else:
    cv_feature_names = cv.get_feature_names()
# Show a sample instead of dumping the whole vocabulary into the output
cv_feature_names[:20]

## 2. Tf-Idf Vectorizer

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over a vocabulary capped at 2000 terms
tv = TfidfVectorizer(max_features=2000)
lemmatized_docs = df['lemmatized'].tolist()
X_tv = tv.fit_transform(lemmatized_docs)

In [31]:
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement,
# get_feature_names_out(), when it exists and fall back to the old method on
# releases that predate it.
if hasattr(tv, 'get_feature_names_out'):
    tv_feature_names = list(tv.get_feature_names_out())
else:
    tv_feature_names = tv.get_feature_names()
# Show a sample instead of dumping the whole vocabulary into the output
tv_feature_names[:20]

## 3. Word2Vec word embedding

In [32]:
# Word2Vec needs each document as a list of tokens. split() (rather than
# split(' ')) maps an empty question to [] instead of [''], and the isinstance
# guard leaves rows that are already lists untouched, so re-running this cell
# no longer fails.
df['lemmatized'] = df['lemmatized'].apply(lambda s : s.split() if isinstance(s, str) else s)

In [33]:
df.head()

In [34]:
from gensim.models import Word2Vec

In [35]:
w2v_model = Word2Vec(min_count=1,window = 2)

In [36]:
w2v_model.build_vocab(list(df['lemmatized']))

In [37]:
w2v_model.epochs

In [38]:
w2v_model.train(list(df['lemmatized']),total_examples = w2v_model.corpus_count,epochs = w2v_model.epochs)

In [39]:
w2v_model.wv.most_similar('trump')

In [40]:
# get the vectors for each question by adding the vectors of the words in it
def get_sent_vec(model, words):
    """Return the sum of the word vectors of ``words``.

    Parameters
    ----------
    model : object
        Trained embedding model exposing ``model.wv`` with ``vector_size``,
        ``get_vector(word)`` and membership testing (``word in model.wv``).
    words : iterable of str
        Tokens of one sentence.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(model.wv.vector_size,)``. Tokens missing from the
        vocabulary are skipped instead of raising ``KeyError``; an empty
        sentence yields the zero vector.
    """
    keyed_vectors = model.wv
    sent_vec = np.zeros((keyed_vectors.vector_size,))
    for word in words:
        if word in keyed_vectors:
            sent_vec += keyed_vectors.get_vector(word)
    return sent_vec

In [41]:
df['word2vec_vector'] = df['lemmatized'].apply(lambda x:get_sent_vec(w2v_model,x))

In [42]:
df.head()

In [43]:
print(list(df['word2vec_vector'])[0])

In [44]:
# One row per question, one column (vec0, vec1, ...) per embedding dimension
vector_columns = ['vec' + str(i) for i in range(w2v_model.wv.vector_size)]
sentence_vectors = list(df['word2vec_vector'])
X_w2v = pd.DataFrame(data=sentence_vectors, columns=vector_columns)

In [45]:
X_w2v.head()

## Separating the train and test data given by kaggle

In [46]:
# Rows after the first train_df.shape[0] belong to the Kaggle test set
first_test_row = train_df.shape[0]
test_df_cv = X_cv[first_test_row:]
test_df_tv = X_tv[first_test_row:]
test_df_w2v = X_w2v[first_test_row:]

In [47]:
# The first train_df.shape[0] rows belong to the Kaggle train set
train_row_count = train_df.shape[0]
train_df_cv = X_cv[:train_row_count]
train_df_tv = X_tv[:train_row_count]
train_df_w2v = X_w2v[:train_row_count]

In [48]:
# Target values of the train dataset: keep only the rows that have a label
# (rows whose target is NaN are dropped) and store the labels as integers.
y = df.loc[df['target'].notna(), 'target'].astype(int)
# The feature matrices are cut positionally at train_df.shape[0], so the
# labels must cover exactly that many rows to stay aligned with them.
assert len(y) == train_df.shape[0], 'labelled rows do not match the train slice'
print(y)

# Splitting the dataset

In [49]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the labelled rows, once per feature representation.
# stratify=y keeps the class ratio the same in both parts, which matters here
# because the classes are imbalanced (hence the oversampling further down).
# The three calls share the same labels, row count and random_state, so they
# select the same rows and the y_* results of the three calls agree.
split_kwargs = dict(train_size=0.80, random_state=45, stratify=y)
X_train_cv,X_test_cv,y_train_cv,y_test_cv = train_test_split(train_df_cv,y,**split_kwargs)
X_train_tv,X_test_tv,y_train_tv,y_test_tv = train_test_split(train_df_tv,y,**split_kwargs)
X_train_w2v,X_test_w2v,y_train_w2v,y_test_w2v = train_test_split(train_df_w2v,y,**split_kwargs)

# Oversampling the training dataset

In [50]:
plt.hist(y)

In [51]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)

In [52]:
X_ros_cv,y_ros_cv = ros.fit_resample(X_train_cv,y_train_cv)
X_ros_tv,y_ros_tv = ros.fit_resample(X_train_tv,y_train_tv)
X_ros_w2v,y_ros_w2v = ros.fit_resample(X_train_w2v,y_train_w2v)

In [53]:
print(y_ros_cv.value_counts(),y_train_cv.value_counts())

# Training models

## 1. Logistic Regression with GridSearchCV

In [None]:
type(X_ros_cv)

In [None]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(solver = 'liblinear')

In [None]:
# Grid for logistic regression: L2 penalty, liblinear solver, six values of C
log_params = [
    {
        'solver': ['liblinear'],
        'penalty': ['l2'],
        'C': [100, 10, 1.0, 0.1, 0.01, 0.001],
    }
]

In [None]:
gscv = GridSearchCV(estimator = log_model,param_grid = log_params,cv = 5,scoring = 'f1')

### 1.1 On vectors generated using CountVectorizer

In [None]:
gscv.fit(X_ros_cv,y_ros_cv)

In [None]:
# best_score_ is the mean cross-validated f1 of the best parameter setting,
# not a score on the training data, so the label says so.
# NOTE(review): the grid search was fitted on the oversampled rows, so each
# fold may contain duplicates of rows in the other folds - treat the
# cross-validated number as optimistic and rely on the hold-out score below.
print('Best params:', gscv.best_params_)
print('Best cross-validated f1 score:',gscv.best_score_)
print('f1 score on test data:',gscv.score(X_test_cv,y_test_cv))

### 1.2 On vectors generated using Tf-IdfVectorizer

In [None]:
gscv.fit(X_ros_tv,y_ros_tv)

In [None]:
# Best grid point, its cross-validated f1, and f1 on the held-out tf-idf split
for label, value in (
    ('Best params:', gscv.best_params_),
    ('Best f1 score:', gscv.best_score_),
    ('f1 score on test data:', gscv.score(X_test_tv, y_test_tv)),
):
    print(label, value)

### 1.3 On vectors generated using Word2Vec word embedding

In [None]:
gscv.fit(X_ros_w2v,y_ros_w2v)

In [None]:
# Best grid point, its cross-validated f1, and f1 on the held-out Word2Vec split
for label, value in (
    ('Best params:', gscv.best_params_),
    ('Best f1 score:', gscv.best_score_),
    ('f1 score on test data:', gscv.score(X_test_w2v, y_test_w2v)),
):
    print(label, value)

## 2. Decision Tree Classifier with GridSearchCV

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()

In [None]:
# Grid for the decision tree: tree depth cap x minimum impurity decrease per split
# NOTE(review): these min_impurity_decrease values look large for a binary
# problem and may prevent any split - confirm against the search results.
dt_params = [
    {
        'max_depth': [10, 20, 50, 100],
        'min_impurity_decrease': [0.1, 0.2, 0.3],
    }
]

In [None]:
gscv = GridSearchCV(estimator = dt_model,param_grid = dt_params,cv = 5,scoring = 'f1')

### 2.1  On vectors generated using CountVectorizer

In [None]:
gscv.fit(X_ros_cv,y_ros_cv)

In [None]:
# Best grid point, its cross-validated f1, and f1 on the held-out count-vector split
for label, value in (
    ('Best params:', gscv.best_params_),
    ('Best f1 score:', gscv.best_score_),
    ('f1 score on test data:', gscv.score(X_test_cv, y_test_cv)),
):
    print(label, value)

### 2.2 On vectors generated using Tf-IdfVectorizer

In [None]:
gscv.fit(X_ros_tv,y_ros_tv)

In [None]:
# Best grid point, its cross-validated f1, and f1 on the held-out tf-idf split
for label, value in (
    ('Best params:', gscv.best_params_),
    ('Best f1 score:', gscv.best_score_),
    ('f1 score on test data:', gscv.score(X_test_tv, y_test_tv)),
):
    print(label, value)

### 2.3 On vectors generated using Word2Vec word embedding

In [None]:
gscv.fit(X_ros_w2v,y_ros_w2v)

In [None]:
# Best grid point, its cross-validated f1, and f1 on the held-out Word2Vec split
for label, value in (
    ('Best params:', gscv.best_params_),
    ('Best f1 score:', gscv.best_score_),
    ('f1 score on test data:', gscv.score(X_test_w2v, y_test_w2v)),
):
    print(label, value)

## 3. Random Forest Classifier with GridSearchCV

In [54]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()

In [55]:
# Grid for the random forest: forest size, split criterion, depth cap and
# minimum impurity decrease per split
# NOTE(review): these min_impurity_decrease values look large for a binary
# problem and may prevent any split - confirm against the search results.
rf_params = [
    {
        'n_estimators': [10, 20],
        'criterion': ['gini', 'entropy'],
        'max_depth': [10, 20, 50],
        'min_impurity_decrease': [0.1, 0.2],
    }
]

In [56]:
gscv = GridSearchCV(estimator = rf_model,param_grid = rf_params,cv = 5,scoring = 'f1')

### 3.1  On vectors generated using CountVectorizer

In [57]:
gscv.fit(X_ros_cv,y_ros_cv)

In [58]:
# Best grid point, its cross-validated f1, and f1 on the held-out count-vector split
for label, value in (
    ('Best params:', gscv.best_params_),
    ('Best f1 score:', gscv.best_score_),
    ('f1 score on test data:', gscv.score(X_test_cv, y_test_cv)),
):
    print(label, value)

### 3.2 On vectors generated using Tf-IdfVectorizer

In [None]:
gscv.fit(X_ros_tv,y_ros_tv)

In [None]:
# Best grid point, its cross-validated f1, and f1 on the held-out tf-idf split
for label, value in (
    ('Best params:', gscv.best_params_),
    ('Best f1 score:', gscv.best_score_),
    ('f1 score on test data:', gscv.score(X_test_tv, y_test_tv)),
):
    print(label, value)

### 3.3 On vectors generated using Word2Vec word embedding

In [None]:
gscv.fit(X_ros_w2v,y_ros_w2v)

In [None]:
# Best grid point, its cross-validated f1, and f1 on the held-out Word2Vec split
for label, value in (
    ('Best params:', gscv.best_params_),
    ('Best f1 score:', gscv.best_score_),
    ('f1 score on test data:', gscv.score(X_test_w2v, y_test_w2v)),
):
    print(label, value)

## 4. Gaussian Naive Bayes

### On Vectors with Word2Vec word embedding

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes fitted on the oversampled Word2Vec features
# (fit returns the estimator itself, so construction and fitting are chained)
gnb_model = GaussianNB().fit(X_ros_w2v, y_ros_w2v)

In [None]:
y_pred = gnb_model.predict(X_test_w2v)

In [None]:
print(classification_report(y_test_w2v, y_pred))

## Gradient Boost Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

N_ESTIMATORS = 500
LEARNING_RATE = 0.5
MAX_DEPTH = 7
RANDOM_STATE = 1

In [None]:
# Gradient boosting configured from the constants defined in the previous cell,
# fitted on the oversampled Word2Vec features
clf = GradientBoostingClassifier(
    n_estimators=N_ESTIMATORS,
    learning_rate=LEARNING_RATE,
    max_depth=MAX_DEPTH,
    random_state=RANDOM_STATE,
).fit(X_ros_w2v, y_ros_w2v)

In [None]:
y_pred = clf.predict(X_test_w2v)
y_pred

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test_w2v, y_pred))

## XGBoost

In [None]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier(random_state=100)
xgb_model.fit(X_ros_w2v, y_ros_w2v)

In [None]:
y_pred = xgb_model.predict(X_test_w2v)
y_pred

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test_w2v, y_pred))

## Support Vector Classifier with recursive feature elimination and cross validation

In [68]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

In [None]:
# Recursive feature elimination with 2-fold stratified cross-validation around
# a linear-kernel SVC, dropping one feature per step and scoring by accuracy.
# NOTE(review): kernel SVC training time grows quickly with the number of
# rows; this fit may be impractical on the oversampled set - confirm it finishes.
min_features_to_select = 1  # Minimum number of features to consider
svc = SVC(kernel="linear")

rfecv = RFECV(
    estimator=svc,
    step=1,
    cv=StratifiedKFold(2),
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
).fit(X_ros_w2v, y_ros_w2v)

In [None]:
y_pred = rfecv.predict(X_test_w2v)
y_pred

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test_w2v, y_pred))

## Perceptron

In [63]:
from sklearn.linear_model import Perceptron

# Perceptron with a fixed seed, fitted on the oversampled Word2Vec features
clf = Perceptron(tol=1e-3, random_state=0).fit(X_ros_w2v, y_ros_w2v)

In [64]:
y_pred = clf.predict(X_test_w2v)
y_pred

In [65]:
from sklearn.metrics import classification_report
print(classification_report(y_test_w2v, y_pred))

## Ridge Classifier

In [59]:
from sklearn.linear_model import RidgeClassifier

In [60]:
ridge_clf = RidgeClassifier()
ridge_clf.fit(X_ros_w2v,y_ros_w2v)

In [61]:
y_pred = ridge_clf.predict(X_test_w2v)

In [62]:
print(y_pred)
from sklearn.metrics import classification_report
print(classification_report(y_test_w2v, y_pred))

## Adaboost

In [66]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 10 estimators, fitted on the oversampled Word2Vec features
clf = AdaBoostClassifier(
    n_estimators=10,
    learning_rate=0.1,
    random_state=0,
).fit(X_ros_w2v, y_ros_w2v)

# Predictions for the held-out Word2Vec split
y_pred = clf.predict(X_test_w2v)

In [67]:
print(y_pred)
print(classification_report(y_test_w2v, y_pred))