In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import pickle
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB

Using TensorFlow backend.


In [10]:
train = pd.read_csv('train.csv')
train.dropna(inplace = True)

vectorizer = TfidfVectorizer()

X = train['TEXT']
y = train['truth']

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.1, shuffle=False)

vectors_train = vectorizer.fit_transform(xtrain)
test_vectors = vectorizer.transform(xtest)

In [13]:
rus = RandomUnderSampler()
samplex, sampley = rus.fit_resample(vectors_train, ytrain)

## Random Forest

In [4]:
clf = RandomForestClassifier()
clf.fit(vectors_train, ytrain)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

clf = RandomForestClassifier()
clf.fit(samplex, sampley)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     12530
           1       0.08      0.58      0.14        93

    accuracy                           0.95     12623
   macro avg       0.54      0.76      0.55     12623
weighted avg       0.99      0.95      0.97     12623

              precision    recall  f1-score   support

           0       0.71      0.97      0.82      8695
           1       0.59      0.11      0.18      3928

    accuracy                           0.70     12623
   macro avg       0.65      0.54      0.50     12623
weighted avg       0.67      0.70      0.62     12623



### Implementing Hyper parameter Tuning for Undersampled Data model

In [18]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
clf = RandomForestClassifier()
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, scoring='accuracy')

grid_search.fit(samplex, sampley)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [19]:
best_grid = grid_search.best_estimator_
predictions = best_grid.predict(test_vectors)
print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       0.71      0.97      0.82      8766
           1       0.55      0.09      0.16      3857

    accuracy                           0.70     12623
   macro avg       0.63      0.53      0.49     12623
weighted avg       0.66      0.70      0.62     12623



In [20]:
pickle.dump(best_grid, open('RandomForest.sav', 'wb'))

## AdaBoost

In [5]:
clf = AdaBoostClassifier()
clf.fit(vectors_train, ytrain)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

clf = AdaBoostClassifier()

clf.fit(samplex, sampley)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     12561
           1       0.05      0.60      0.10        62

    accuracy                           0.95     12623
   macro avg       0.53      0.77      0.53     12623
weighted avg       0.99      0.95      0.97     12623

              precision    recall  f1-score   support

           0       0.83      0.96      0.89     10392
           1       0.35      0.11      0.17      2231

    accuracy                           0.81     12623
   macro avg       0.59      0.53      0.53     12623
weighted avg       0.75      0.81      0.76     12623



### Implementing Hyper Parameter Tuning for Under Sampled Data

In [21]:
param_grid = {'n_estimators':[100, 200, 300, 1000],
               'learning_rate':[.001,0.01,.1]}

clf = AdaBoostClassifier()
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, scoring='accuracy')

grid_search.fit(samplex, sampley)

GridSearchCV(cv=3, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='deprecated', n_jobs=-1,
             param_grid={'learning_rate': [0.001, 0.01, 0.1],
                         'n_estimators': [100, 200, 300, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [22]:
best_grid = grid_search.best_estimator_
predictions = best_grid.predict(test_vectors)
print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       0.76      0.97      0.85      9437
           1       0.50      0.10      0.17      3186

    accuracy                           0.75     12623
   macro avg       0.63      0.53      0.51     12623
weighted avg       0.69      0.75      0.68     12623



In [23]:
pickle.dump(best_grid, open('AdaBoost.sav', 'wb'))

## Decision Tree

In [6]:
clf = DecisionTreeClassifier()
clf.fit(vectors_train, ytrain)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

clf = DecisionTreeClassifier()

clf.fit(samplex, sampley)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96     12208
           1       0.18      0.30      0.22       415

    accuracy                           0.93     12623
   macro avg       0.58      0.63      0.59     12623
weighted avg       0.95      0.93      0.94     12623

              precision    recall  f1-score   support

           0       0.62      0.96      0.75      7663
           1       0.59      0.08      0.15      4960

    accuracy                           0.62     12623
   macro avg       0.60      0.52      0.45     12623
weighted avg       0.61      0.62      0.51     12623



### Hyper Parameter Tuning for Not Sampled Data

In [35]:
param_grid = {
    'max_depth': [80, 90, 100, 110],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
}
clf = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, scoring='accuracy')

grid_search.fit(vectors_train, ytrain)

GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'max_depth': [80, 90, 100, 110],
                      

In [36]:
best_grid = grid_search.best_estimator_
predictions = best_grid.predict(test_vectors)
print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97     12491
           1       0.09      0.50      0.16       132

    accuracy                           0.94     12623
   macro avg       0.54      0.72      0.56     12623
weighted avg       0.99      0.94      0.96     12623



## Extra Trees 

In [7]:
clf = ExtraTreesClassifier()
clf.fit(vectors_train, ytrain)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

clf = ExtraTreesClassifier()

clf.fit(samplex, sampley)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97     12475
           1       0.11      0.51      0.18       148

    accuracy                           0.94     12623
   macro avg       0.55      0.73      0.57     12623
weighted avg       0.98      0.94      0.96     12623

              precision    recall  f1-score   support

           0       0.70      0.97      0.81      8573
           1       0.64      0.11      0.19      4050

    accuracy                           0.69     12623
   macro avg       0.67      0.54      0.50     12623
weighted avg       0.68      0.69      0.61     12623



#### Saving Best Model

In [24]:
clf = ExtraTreesClassifier()
clf.fit(samplex, sampley)
predictions = clf.predict(test_vectors)
print(classification_report(predictions, ytest))
pickle.dump(best_grid, open('Extra-Trees.sav', 'wb'))

              precision    recall  f1-score   support

           0       0.68      0.97      0.80      8439
           1       0.63      0.10      0.17      4184

    accuracy                           0.68     12623
   macro avg       0.66      0.54      0.49     12623
weighted avg       0.67      0.68      0.59     12623



### Hyper Parameter Tuning for UnderSampled Data

In [37]:
param_grid = {
    'max_depth': [80, 90, 100, 110],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12]
}
clf = ExtraTreesClassifier()
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, scoring='accuracy')

grid_search.fit(samplex, sampley)

GridSearchCV(cv=3, error_score=nan,
             estimator=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                            class_weight=None, criterion='gini',
                                            max_depth=None, max_features='auto',
                                            max_leaf_nodes=None,
                                            max_samples=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=100, n_jobs=None,
                                            oob_score=False, random_state=None,
                                            verbose=0, warm_start=False),
             iid='deprecate

In [38]:
best_grid = grid_search.best_estimator_
predictions = best_grid.predict(test_vectors)
print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       0.77      0.97      0.86      9521
           1       0.53      0.12      0.20      3102

    accuracy                           0.76     12623
   macro avg       0.65      0.54      0.53     12623
weighted avg       0.71      0.76      0.69     12623



## Gradient Boosting

In [8]:
clf = GradientBoostingClassifier()
clf.fit(vectors_train, ytrain)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

clf = GradientBoostingClassifier()

clf.fit(samplex, sampley)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     12590
           1       0.04      0.85      0.08        33

    accuracy                           0.95     12623
   macro avg       0.52      0.90      0.52     12623
weighted avg       1.00      0.95      0.97     12623

              precision    recall  f1-score   support

           0       0.80      0.96      0.87      9957
           1       0.41      0.11      0.17      2666

    accuracy                           0.78     12623
   macro avg       0.61      0.53      0.52     12623
weighted avg       0.72      0.78      0.72     12623



### Hyper Parameter Tuning for Under Sampled Data

In [25]:
param_grid = {
    'n_estimators': [100, 200, 300, 1000],
    'learning_rate':[.001,0.01,.1],
    'loss': ['deviance', 'exponential'],
}
clf = GradientBoostingClassifier()
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, scoring='accuracy')

grid_search.fit(samplex, sampley)

GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...one,
             

In [26]:
best_grid = grid_search.best_estimator_
predictions = best_grid.predict(test_vectors)
print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       0.72      0.97      0.83      8925
           1       0.55      0.10      0.17      3698

    accuracy                           0.71     12623
   macro avg       0.64      0.53      0.50     12623
weighted avg       0.67      0.71      0.63     12623



In [27]:
pickle.dump(best_grid, open('Gradient-Boost.sav', 'wb'))

## Bernoulli

In [10]:
clf = BernoulliNB()
clf.fit(vectors_train, ytrain)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

clf = BernoulliNB()

clf.fit(samplex, sampley)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       0.98      0.95      0.97     12341
           1       0.10      0.25      0.14       282

    accuracy                           0.93     12623
   macro avg       0.54      0.60      0.55     12623
weighted avg       0.96      0.93      0.95     12623

              precision    recall  f1-score   support

           0       0.77      0.97      0.85      9459
           1       0.54      0.12      0.20      3164

    accuracy                           0.75     12623
   macro avg       0.65      0.54      0.53     12623
weighted avg       0.71      0.75      0.69     12623



### Hyper Parameter Tuning For Under Sampled Data

In [15]:
param_grid = {
    'alpha': [1, 0],
    'fit_prior' : [True, False]
}
clf = BernoulliNB()
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, scoring='accuracy')

grid_search.fit(samplex, sampley)

  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv=3, error_score=nan,
             estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                                   fit_prior=True),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [1, 0], 'fit_prior': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [16]:
best_grid = grid_search.best_estimator_
predictions = best_grid.predict(test_vectors)
print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       0.76      0.97      0.85      9406
           1       0.54      0.11      0.18      3217

    accuracy                           0.75     12623
   macro avg       0.65      0.54      0.52     12623
weighted avg       0.70      0.75      0.68     12623



In [17]:
pickle.dump(best_grid, open('BernoulliNB.sav', 'wb'))

## KNN 

In [14]:
clf = KNeighborsClassifier()
clf.fit(vectors_train, ytrain)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

clf = KNeighborsClassifier()

clf.fit(samplex, sampley)
predictions = clf.predict(test_vectors)

print(classification_report(predictions, ytest))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     12603
           1       0.02      0.55      0.03        20

    accuracy                           0.94     12623
   macro avg       0.51      0.75      0.50     12623
weighted avg       1.00      0.94      0.97     12623

              precision    recall  f1-score   support

           0       0.78      0.95      0.85      9818
           1       0.25      0.06      0.10      2805

    accuracy                           0.75     12623
   macro avg       0.51      0.50      0.48     12623
weighted avg       0.66      0.75      0.69     12623



### Best Models are 
### 1) AdaBoost Implementing Hyper Parameter Tuning on Sampled Data
### 2) Extra Trees on Sampled Data
### 3) Bernoulli NB Implementing Hyper Parameter Tuning on Un-Sampled Data
### 4) Gradient Boosting Implementing Hyper Parameter Tuning on Sampled Data
### 5) Random Forest Implementing Hyper Parameter Tuning on Sampled Data