In [1]:
import pandas as pd
import string
import re
import numpy as np

In [97]:
!pip install imblearn
from imblearn.over_sampling import RandomOverSampler

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.1-py3-none-any.whl (189 kB)
[K     |████████████████████████████████| 189 kB 3.4 MB/s eta 0:00:01
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.8.1 imblearn-0.0


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
questions = pd.read_csv("./data/labelled_clean_questions.csv")

In [12]:
text = questions["clean_tokenized"]
target = questions["target"].values

X_train, X_test, y_train, y_test = train_test_split(text, target, stratify=target, test_size=0.25)

In [32]:
X_train = X_train.values.astype('U')
X_test = X_test.values.astype('U')

TF-IDF + Random Forest Model + Oversampling

In [108]:
over_sampler = RandomOverSampler(random_state=42)
X_res, y_res = over_sampler.fit_resample(X_train.reshape(-1,1), y_train.reshape(-1,1))

In [105]:
from collections import Counter
print(f"Training target statistics: {Counter(y_res)}")
print(f"Testing target statistics: {Counter(y_test)}")

Training target statistics: Counter({0: 918984, 1: 918984})
Testing target statistics: Counter({0: 306328, 1: 20203})


In [123]:
X_res = np.array(X_res).flatten()

In [125]:
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2))
tfidf_model = vectorizer.fit(X_res)
train_tfidf = tfidf_model.transform(X_res)
test_tfidf = tfidf_model.transform(X_test)

In [132]:
clf = RandomForestClassifier(max_depth= 20, min_samples_leaf= 3, min_samples_split= 3, n_estimators= 1200)
clf.fit(train_tfidf, y_res)

RandomForestClassifier(max_depth=20, min_samples_leaf=3, min_samples_split=3,
                       n_estimators=1200)

In [133]:
preds = clf.predict(test_tfidf)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.98      0.88      0.93    306328
           1       0.29      0.77      0.42     20203

    accuracy                           0.87    326531
   macro avg       0.64      0.82      0.67    326531
weighted avg       0.94      0.87      0.90    326531



In [146]:
train_balanced["target"].value_counts()

1    80810
0    80810
Name: target, dtype: int64

TF-IDF + UNDERSAMPLED + RANDOM FOREST

In [147]:
train_balanced = questions[questions['target'] == 1]
train_balanced = train_balanced.append(questions[questions['target'] == 0].sample(n = len(train_balanced))).reset_index(drop = True)
train_balanced['target'].value_counts()

balanced_text = train_balanced["clean_tokenized"].values.astype('U')

train_text, test_text, target_train, target_test = train_test_split(balanced_text, train_balanced["target"], stratify=train_balanced["target"], test_size=0.25)


In [148]:
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2))
tfidf_model = vectorizer.fit(train_text)
train_tfidf = tfidf_model.transform(train_text)
test_tfidf = tfidf_model.transform(test_text)

In [166]:
clf = RandomForestClassifier(max_depth= 20, min_samples_leaf= 3,
                         min_samples_split= 3, n_estimators= 1200)
clf.fit(train_tfidf, target_train)

RandomForestClassifier(max_depth=20, min_samples_leaf=3, min_samples_split=3,
                       n_estimators=1200)

In [169]:
preds = clf.predict(test_tfidf)
print(classification_report(target_test, preds))

In [154]:
print(classification_report(target_test, preds))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83     20203
           1       0.86      0.77      0.81     20202

    accuracy                           0.82     40405
   macro avg       0.82      0.82      0.82     40405
weighted avg       0.82      0.82      0.82     40405



In [170]:
clf = RandomForestClassifier(max_depth= 20, min_samples_leaf= 3,
                         min_samples_split= 3, n_estimators= 1000)
clf.fit(train_tfidf, target_train)

preds = clf.predict(test_tfidf)
print(classification_report(target_test, preds))

              precision    recall  f1-score   support

           0       0.79      0.86      0.83     20203
           1       0.85      0.77      0.81     20202

    accuracy                           0.82     40405
   macro avg       0.82      0.82      0.82     40405
weighted avg       0.82      0.82      0.82     40405



In [173]:
clf = RandomForestClassifier(max_depth= 20, min_samples_leaf= 3,
                         min_samples_split= 3, n_estimators= 1000, max_features="log2")
clf.fit(train_tfidf, target_train)

preds = clf.predict(test_tfidf)
print(classification_report(target_test, preds))

              precision    recall  f1-score   support

           0       0.73      0.87      0.79     20203
           1       0.84      0.68      0.75     20202

    accuracy                           0.77     40405
   macro avg       0.78      0.77      0.77     40405
weighted avg       0.78      0.77      0.77     40405



In [174]:
clf = RandomForestClassifier(max_depth= 20, min_samples_leaf= 3,
                         min_samples_split= 3, n_estimators= 1000, max_features="sqrt")
clf.fit(train_tfidf, target_train)

preds = clf.predict(test_tfidf)
print(classification_report(target_test, preds))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83     20203
           1       0.86      0.77      0.81     20202

    accuracy                           0.82     40405
   macro avg       0.82      0.82      0.82     40405
weighted avg       0.82      0.82      0.82     40405



In [175]:
clf = RandomForestClassifier(max_depth= 20, min_samples_leaf= 3,
                         min_samples_split= 3, n_estimators= 2000, max_features="sqrt")
clf.fit(train_tfidf, target_train)

preds = clf.predict(test_tfidf)
print(classification_report(target_test, preds))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83     20203
           1       0.85      0.77      0.81     20202

    accuracy                           0.82     40405
   macro avg       0.82      0.82      0.82     40405
weighted avg       0.82      0.82      0.82     40405



In [176]:
clf = RandomForestClassifier(max_depth= 20, min_samples_leaf= 3,
                         min_samples_split= 3, n_estimators= 2000, max_features="auto")
clf.fit(train_tfidf, target_train)

preds = clf.predict(test_tfidf)
print(classification_report(target_test, preds))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83     20203
           1       0.85      0.77      0.81     20202

    accuracy                           0.82     40405
   macro avg       0.82      0.82      0.82     40405
weighted avg       0.82      0.82      0.82     40405



In [178]:
clf = RandomForestClassifier(max_depth= 30, min_samples_leaf= 3,
                         min_samples_split= 3, n_estimators= 2000, max_features="auto")
clf.fit(train_tfidf, target_train)

preds = clf.predict(test_tfidf)
print(classification_report(target_test, preds))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83     20203
           1       0.85      0.78      0.81     20202

    accuracy                           0.82     40405
   macro avg       0.82      0.82      0.82     40405
weighted avg       0.82      0.82      0.82     40405



In [179]:
clf = RandomForestClassifier(max_depth= 30, min_samples_leaf= 3,
                         min_samples_split= 5, n_estimators= 2000, max_features="auto")
clf.fit(train_tfidf, target_train)

preds = clf.predict(test_tfidf)
print(classification_report(target_test, preds))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83     20203
           1       0.85      0.78      0.81     20202

    accuracy                           0.82     40405
   macro avg       0.83      0.82      0.82     40405
weighted avg       0.83      0.82      0.82     40405



In [184]:
clf = RandomForestClassifier(max_depth= 30, min_samples_leaf= 3,
                         min_samples_split= 5, n_estimators= 700, max_features="auto")
clf.fit(train_tfidf, target_train)

preds = clf.predict(test_tfidf)
print(classification_report(target_test, preds))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83     20203
           1       0.85      0.78      0.81     20202

    accuracy                           0.82     40405
   macro avg       0.82      0.82      0.82     40405
weighted avg       0.82      0.82      0.82     40405



In [185]:
clf = RandomForestClassifier(max_depth= 30, min_samples_leaf= 3,
                         min_samples_split= 5, n_estimators= 7000, max_features="auto")
clf.fit(train_tfidf, target_train)

preds = clf.predict(test_tfidf)
print(classification_report(target_test, preds))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83     20203
           1       0.85      0.78      0.81     20202

    accuracy                           0.82     40405
   macro avg       0.83      0.82      0.82     40405
weighted avg       0.83      0.82      0.82     40405



In [5]:
questions["question_text"]

Unnamed: 0,qid,question_text,target,tokenized,length,clean_tokenized
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,"['How', 'did', 'Quebec', 'nationalists', 'see'...",14,how did quebec nationalist see their provinc a...
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,"['Do', 'you', 'have', 'an', 'adopted', 'dog', ...",18,do you have an adopt dog how would you encoura...
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,"['Why', 'does', 'velocity', 'affect', 'time', ...",12,whi doe veloc affect time ? doe veloc affect s...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,"['How', 'did', 'Otto', 'von', 'Guericke', 'use...",10,how did otto von guerick use the magdeburg hem...
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,"['Can', 'I', 'convert', 'montra', 'helicon', '...",16,can i convert montra helicon d to a mountain b...
...,...,...,...,...,...,...
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0,"['What', 'other', 'technical', 'skills', 'do',...",18,what other technic skill do you need as a comp...
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0,"['Does', 'MS', 'in', 'ECE', 'have', 'good', 'j...",20,doe ms in ece have good job prospect in usa or...
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0,"['Is', 'foam', 'insulation', 'toxic', '?']",5,is foam insul toxic ?
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0,"['How', 'can', 'one', 'start', 'a', 'research'...",14,how can one start a research project base on b...


In [214]:
errors = np.array([test_text, target_test, preds])
errors_df = pd.DataFrame(errors.transpose(), columns = ['text', 'label', 'preds'])

In [233]:
errors_analysis_df = errors_df[errors_df["label"] != errors_df["preds"]]

In [240]:
pd.set_option('display.max_colwidth', None)
errors_analysis_df.sample(50)

Unnamed: 0,text,label,preds
21713,whi do some peopl place so much valu on athlet competit ? for instanc neither my mom nor my dad turn off sport tv until bedtim it as if life revolv around these competit,0,1
16936,can an atheist tell me how badli charl darwin is suffer in hell right now ?,1,0
2868,are mexican emot peopl ?,1,0
29959,what can societi do to circumv or at least mitig the impend financi crisi ? have we not learn enough from the 2008 crisi to prevent similar event in the present or nearterm futur ?,0,1
5272,the quora moder ha declar war against you and all your follow ! how do you proceed ?,1,0
5755,what are the best custom weed label ?,1,0
31370,whi tataski ha onli block doordarshan while cricket match broadcast ? now in india do we need to pay for watch doordarshan as well ?,0,1
29921,my daughter refus to let me kiss her on the cheek what should i do ?,1,0
14884,whi do doctor alway lie about the actual level of pain and side effect involv in mani medic test and procedur ?,0,1
25712,is everyon in the seattl area drive over the speed limit of 60 ?,1,0


In [239]:
pd.set_option('display.max_colwidth', None)
errors_df[errors_df["label"] == errors_df["preds"]].sample(40)

Unnamed: 0,text,label,preds
11611,whi do american liber are so attach to free educ when it a veri bad idea ?,1,1
28011,should indian invad pakistan bangladesh and nepal becaus we are so superpow ?,1,1
35618,whi is islam a violent religion ? whi all terrorist are all from islam onli ? whi isnt islam peac like buddhism hinduism jainism sikhism ?,1,1
8112,what would you do if you found out that one of your parent is cheat ?,0,0
31373,what are some of the interest fact about shahrukh khan next movi the ring ?,0,0
20569,how can an american citizen help presid trump make the mexican govern pay for the wall ?,1,1
9421,how can you compar the sun with other star ?,0,0
35118,what is the earliest point in a game of kerbal space program where you are abl to get a spacecraft to eeloo ?,0,0
27907,what kind of question can one encount in a written interview for a broadcast journalist job with an intern media organis ?,0,0
21160,what are the area where we use graphic design ?,0,0


Custom Word2Vec + Random Forest Model

Create Vectors from Quora Dataset

Random Forest

Hyperparameter tuning

In [171]:
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [200],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5)
CV_rfc.fit(train_tfidf, target_train)

print(CV_rfc.best_params_)

Error analysis

In [None]:
test_tfidf convert back into regular vectors and see what samples they were 
stemmed and tokenized 
and regular 

then compare the difference 