## Exporting the best models from Experimentation Notebook as pickle files

### The best models are:
### 1) AdaBoost with Hyperparameter Tuning on Sampled Data
### 2) Extra Trees on Sampled Data
### 3) Bernoulli NB on Sampled Data
### 4) Gradient Boosting with Hyperparameter Tuning on Sampled Data
### 5) Random Forest with Hyperparameter Tuning on Sampled Data

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
import pickle

Using TensorFlow backend.


In [2]:
# Load the training data and discard every row that contains a missing value.
train = pd.read_csv('train.csv').dropna()

# Raw documents and their labels.
X, y = train['TEXT'], train['truth']

# Turn the documents into a TF-IDF matrix; the vectorizer is fitted on all of X.
vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(X)

# Randomly under-sample the vectorized data with the sampler's default settings.
# NOTE(review): no random_state is given, so the sample (and every model trained
# on it below) differs from run to run.
rus = RandomUnderSampler()
samplex, sampley = rus.fit_resample(vectors_train, y)

In [3]:
# Random forest: exhaustive grid search over the under-sampled data.
# The grid has 1 * 4 * 3 * 3 * 4 = 144 combinations, each scored with 3-fold CV.
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
clf = RandomForestClassifier()
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, scoring='accuracy')

grid_search.fit(samplex, sampley)

# Keep the estimator with the best cross-validated accuracy and persist it.
best_grid = grid_search.best_estimator_
# Use a context manager so the file handle is flushed and closed deterministically
# instead of being left open until garbage collection.
with open('RandomForest.sav', 'wb') as model_file:
    pickle.dump(best_grid, model_file)

In [4]:
# Extra Trees: trained with default hyperparameters on the under-sampled data.
clf = ExtraTreesClassifier()
clf.fit(samplex, sampley)
# Use a context manager so the file handle is flushed and closed deterministically
# instead of being left open until garbage collection.
with open('Extra-Trees.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [5]:
# Gradient boosting: exhaustive grid search over the under-sampled data.
# The grid has 4 * 3 * 2 = 24 combinations, each scored with 3-fold CV.
# NOTE(review): newer scikit-learn releases renamed the 'deviance' loss to
# 'log_loss' and later removed the old name -- confirm against the installed
# version before upgrading.
param_grid = {
    'n_estimators': [100, 200, 300, 1000],
    'learning_rate':[.001,0.01,.1],
    'loss': ['deviance', 'exponential'],
}
clf = GradientBoostingClassifier()
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, scoring='accuracy')

grid_search.fit(samplex, sampley)

# Keep the estimator with the best cross-validated accuracy and persist it.
best_grid = grid_search.best_estimator_
# Use a context manager so the file handle is flushed and closed deterministically
# instead of being left open until garbage collection.
with open('Gradient-Boost.sav', 'wb') as model_file:
    pickle.dump(best_grid, model_file)

In [6]:
# Bernoulli naive Bayes: trained with default hyperparameters on the
# under-sampled data.
clf = BernoulliNB()
clf.fit(samplex, sampley)
# Use a context manager so the file handle is flushed and closed deterministically
# instead of being left open until garbage collection.
with open('BernoulliNB.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)