In [None]:
import pandas as pd
from sqlalchemy import create_engine
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import time 
from nltk.stem.snowball import SnowballStemmer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer 
import string
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, auc
from sklearn.metrics import roc_curve, make_scorer, precision_score, recall_score, f1_score
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from matplotlib.legend_handler import HandlerLine2D 
import scipy
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from vecstack import stacking
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
# Words dropped before vectorising: common English function words plus the
# markup/contraction leftovers 'b', 'href', 's', 't' and 'don'.
# Listed alphabetically, one initial letter per line.
stopwords = {
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at',
    'b', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    'can',
    'did', 'do', 'does', 'doing', 'don', 'down', 'during',
    'each',
    'few', 'for', 'from', 'further',
    'had', 'has', 'have', 'having', 'here', 'how', 'href',
    'if', 'in', 'into', 'is', 'it', 'its', 'itself',
    'just',
    'me', 'more', 'most', 'my', 'myself',
    'no', 'nor', 'not', 'now',
    'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
    's', 'same', 'should', 'so', 'some', 'such',
    't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these',
    'they', 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up',
    'very',
    'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
    'you', 'your', 'yourselves',
}

In [None]:
import os  # imported here so this cell runs on its own

# Database connection. The URL is read from the DATABASE_URL environment
# variable when it is set, so credentials do not have to be edited into the
# notebook; the literal is only the fallback used when the variable is absent.
DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://postgres:stat170@postgres/stat170a')
# pool_pre_ping=True makes SQLAlchemy test each pooled connection before use.
engine = create_engine(DATABASE_URL, client_encoding='utf8', pool_pre_ping=True)

In [None]:
# Read the whole bechdal_test table into a DataFrame.
bech = pd.read_sql_query(sql='SELECT * from bechdal_test;', con=engine)

In [None]:
# Read the whole scripts_final table into a DataFrame.
scripts1 = pd.read_sql_query(sql='SELECT * from scripts_final;', con=engine)

In [None]:
# Normalise the script titles: collapse every run of whitespace to a single
# space and trim both ends, so they can be matched against the other table's
# titles in the merge that follows.
# The pattern is a raw string; "\s" in a plain literal is an invalid escape
# sequence and triggers a warning on current Python versions.
s1_t = scripts1['name'].replace(r"[\s]+", " ", regex=True).str.strip()
scripts1['name'] = s1_t

In [None]:
# Inner-join scripts to the Bechdel table where scripts1.name equals bech.title.
d = scripts1.merge(bech, left_on='name', right_on='title')

In [None]:
# Keep only rows whose 'convo' text splits into more than 100 space-separated
# pieces. .copy() makes the result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
d = d[d['convo'].apply(lambda x: len(x.split(' ')) > 100)].copy()

In [None]:
# Tokens are maximal runs of ASCII letters; digits and punctuation are dropped.
tokenizer = RegexpTokenizer(pattern=r'[A-Za-z]+')
# English Snowball stemmer.
stemmer = SnowballStemmer(language='english')

In [None]:
def clean(data):
    """Return ``data`` trimmed, lower-cased and with punctuation blanked out.

    Leading/trailing whitespace is stripped first, then every character in
    ``string.punctuation`` is replaced by a single space (so punctuation at
    the very end becomes a trailing space).
    """
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    normalized = data.strip().lower()
    return normalized.translate(punctuation_to_space)

In [None]:
def tokenize(data):
    """Split ``data`` into tokens and return the list of their stems.

    Uses the module-level ``tokenizer`` to split the text and the
    module-level ``stemmer`` on each token. The stemmer works on one word at
    a time, so it is applied per token rather than to the token list.
    """
    return [stemmer.stem(token) for token in tokenizer.tokenize(data)]

In [None]:
# Clean both text columns, add an integer copy of the label, and build the
# combined text used for modelling. assign() returns a new frame, so nothing
# is written into a slice of the merged data.
d = d.assign(
    convo=d['convo'].apply(clean),
    info=d['info'].apply(clean),
    bechdal_int=d['bechdal'].astype(int),
    # Joined with a space: concatenating with '' would glue the last word of
    # 'convo' onto the first word of 'info' and create a bogus token.
    text_info=lambda frame: frame['convo'] + ' ' + frame['info'],
)

# Persist the prepared frame as CSV (file name kept as "d").
d.to_csv("d")

In [None]:
# Metrics computed for every grid-search candidate. Each key becomes part of
# the cv_results_ column names (e.g. 'mean_test_accuracy').
# NOTE: make_scorer treats higher values as better unless told otherwise, so
# the 'mean_squared_error' entry is a plain (positive) error value; choosing
# the "best" candidate by it would select the one with the largest error.
scorer = {
    'accuracy': make_scorer(accuracy_score),
    'f1_score': make_scorer(f1_score),
    'precision': make_scorer(precision_score, pos_label=1, average='binary'),
    'recall': make_scorer(recall_score),
    'mean_squared_error': make_scorer(mean_squared_error),
}

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so that a
# Restart-and-Run-All reproduces the same split and the same scores.
X_train, X_test, y_train, y_test = train_test_split(
    d.loc[:, 'text_info'],
    d.loc[:, 'bechdal_int'],
    test_size=0.2,
    random_state=0,
)

# Bigram TF-IDF features with sub-linear term frequency. stop_words is passed
# as a list because recent scikit-learn releases validate the argument and
# accept only 'english', a list, or None.
v2 = TfidfVectorizer(
    analyzer='word',
    use_idf=True,
    ngram_range=(2, 2),
    token_pattern=r'\w{1,}',
    stop_words=sorted(stopwords),
    sublinear_tf=True,
)
# The vectoriser is fitted on the training texts only; the test texts are
# transformed with it.
train_X = v2.fit_transform(X_train)
test_X = v2.transform(X_test)

In [None]:
#Random Forest

In [None]:
# Grid-search the number of trees of a depth-4 random forest with 5-fold CV.
parameters = {'n_estimators': [10, 20, 50, 100, 150, 200]}
rf_clf = RandomForestClassifier(max_depth=4, random_state=0)
# Refit on accuracy. The 'mean_squared_error' scorer is a higher-is-better
# scorer, so refit='mean_squared_error' would keep the candidate with the
# LARGEST error. For 0/1 labels (assumed for bechdal_int) MSE equals
# 1 - accuracy, so the most accurate candidate is also the lowest-MSE one.
rf_gs_clf = GridSearchCV(
    rf_clf,
    parameters,
    cv=5,
    return_train_score=True,
    scoring=scorer,
    refit='accuracy',
)
rf_gs_clf.fit(train_X, y_train)
# Predictions of the refitted best estimator on the held-out features.
rf_gs_y_pred = rf_gs_clf.predict(test_X)
rf_grid_search_results = rf_gs_clf.cv_results_.keys()

In [None]:
rf_grid_search_results_with_scorer = rf_gs_clf.cv_results_

In [None]:
rf_gs_test = [x for x in rf_grid_search_results_with_scorer['mean_test_mean_squared_error']]
rf_gs_tr = [x for x in rf_grid_search_results_with_scorer['mean_train_mean_squared_error']]
line1, = plt.plot(rf_grid_search_results_with_scorer['param_n_estimators'].data, rf_gs_tr, 'purple', label="Train MSE")
line2, = plt.plot(rf_grid_search_results_with_scorer['param_n_estimators'].data, rf_gs_test, 'pink', label="Test MSE")
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('MSE')
plt.xlabel('N_estimators of Random Forest')
plt.title("Random Forest with GridSearch: MSE for different N estimators")
plt.show()

In [None]:
# Mean train/test accuracy over the CV folds for each n_estimators candidate.
rf_n_estimators = rf_grid_search_results_with_scorer['param_n_estimators'].data
rf_gs_test_acc = list(rf_grid_search_results_with_scorer['mean_test_accuracy'])
rf_gs_tr_acc = list(rf_grid_search_results_with_scorer['mean_train_accuracy'])
# These curves are accuracies, so the legend says so (it previously read "MSE").
line1, = plt.plot(rf_n_estimators, rf_gs_tr_acc, 'purple', label="Train Accuracy")
line2, = plt.plot(rf_n_estimators, rf_gs_test_acc, 'pink', label="Test Accuracy")
# font.size is 12 while the legend is created and 26 for the labels and title.
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('Accuracy')
# The x axis is the number of estimators, not the accuracy.
plt.xlabel('N_estimators of Random Forest')
plt.title("Random Forest with GridSearch: Accuracy for different N estimators")
plt.show()

# Decision Tree: depth 2 was best

In [None]:
# Grid-search the maximum depth (2 through 9) of a decision tree with 5-fold CV.
parameters = {'max_depth': range(2, 10)}
dt_clf = tree.DecisionTreeClassifier()
# Refit on accuracy. The 'mean_squared_error' scorer is a higher-is-better
# scorer, so refit='mean_squared_error' would keep the candidate with the
# LARGEST error. For 0/1 labels (assumed for bechdal_int) MSE equals
# 1 - accuracy, so the most accurate candidate is also the lowest-MSE one.
dt_gs_clf = GridSearchCV(
    dt_clf,
    parameters,
    cv=5,
    return_train_score=True,
    scoring=scorer,
    refit='accuracy',
)
dt_gs_clf.fit(train_X, y_train)
# Predictions of the refitted best estimator on the held-out features.
dt_gs_y_pred = dt_gs_clf.predict(test_X)
dt_grid_search_results = dt_gs_clf.cv_results_.keys()

In [None]:
# Full cross-validation table of the decision-tree search, used by the plots.
dt_grid_search_results_with_scorer = dt_gs_clf.cv_results_

In [None]:
# Mean train/test MSE over the CV folds for each max_depth candidate.
dt_depths = dt_grid_search_results_with_scorer['param_max_depth'].data
dt_gs_test = list(dt_grid_search_results_with_scorer['mean_test_mean_squared_error'])
dt_gs_tr = list(dt_grid_search_results_with_scorer['mean_train_mean_squared_error'])
line1, = plt.plot(dt_depths, dt_gs_tr, 'purple', label="Train MSE")
line2, = plt.plot(dt_depths, dt_gs_test, 'pink', label="Test MSE")
# font.size is 12 while the legend is created and 26 for the labels and title.
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('MSE')
plt.xlabel('Depth of Tree')
# Title spelling corrected ("Descision" -> "Decision").
plt.title("Decision Tree with GridSearch: MSE for different Depth Values")
plt.show()

In [None]:
# Figure 1: mean train/test accuracy over the CV folds for each max_depth.
dt_depths = dt_grid_search_results_with_scorer['param_max_depth'].data
dt_gs_test_acc = list(dt_grid_search_results_with_scorer['mean_test_accuracy'])
dt_gs_tr_acc = list(dt_grid_search_results_with_scorer['mean_train_accuracy'])
line1, = plt.plot(dt_depths, dt_gs_tr_acc, 'purple', label="Train Accuracy")
line2, = plt.plot(dt_depths, dt_gs_test_acc, 'pink', label="Test Accuracy")
# font.size is 12 while the legend is created and 26 for the labels and title.
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('Accuracy')
plt.xlabel('Depth of Tree')
plt.title("Decision Tree with GridSearch: Accuracy for different Depth Values")
plt.show()

# Figure 2: MSE versus depth. This part used to read
# `grid_search_results_with_scorer`, a name that is not defined anywhere in
# the visible notebook, so the cell stopped with a NameError. It now reads
# the decision-tree results.
dt_gs_test = list(dt_grid_search_results_with_scorer['mean_test_mean_squared_error'])
dt_gs_tr = list(dt_grid_search_results_with_scorer['mean_train_mean_squared_error'])
line1, = plt.plot(dt_depths, dt_gs_tr, 'purple', label="Train MSE")
line2, = plt.plot(dt_depths, dt_gs_test, 'pink', label="Test MSE")
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('MSE')
plt.xlabel('Depth of Tree')
plt.title("Decision Tree with GridSearch: MSE for different Depth Values")
plt.show()

In [None]:
# Grid-search the optimisation solver of a logistic regression with 5-fold CV.
parameters = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
lr_clf = LogisticRegression()
# Refit on accuracy. The 'mean_squared_error' scorer is a higher-is-better
# scorer, so refit='mean_squared_error' would keep the candidate with the
# LARGEST error. For 0/1 labels (assumed for bechdal_int) MSE equals
# 1 - accuracy, so the most accurate candidate is also the lowest-MSE one.
lr_gs_clf = GridSearchCV(
    lr_clf,
    parameters,
    cv=5,
    return_train_score=True,
    scoring=scorer,
    refit='accuracy',
)
lr_gs_clf.fit(train_X, y_train)
# Predictions of the refitted best estimator on the held-out features.
lr_gs_y_pred = lr_gs_clf.predict(test_X)
lr_grid_search_results = lr_gs_clf.cv_results_.keys()

In [None]:
lr_grid_search_results_with_scorer = lr_gs_clf.cv_results_

In [None]:
lr_gs_test = [x for x in lr_grid_search_results_with_scorer['mean_test_mean_squared_error']]
lr_gs_tr = [x for x in lr_grid_search_results_with_scorer['mean_train_mean_squared_error']]
line1, = plt.plot(lr_grid_search_results_with_scorer['param_solver'].data, lr_gs_tr, 'purple', label="Train MSE")
line2, = plt.plot(lr_grid_search_results_with_scorer['param_solver'].data, lr_gs_test, 'pink', label="Test MSE")
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('MSE')
plt.xlabel('Solver')
plt.title("Logistic Regression with GridSearch: MSE for different Solvers")
plt.show()

In [None]:
# Mean train/test accuracy over the CV folds for each solver.
lr_solvers = lr_grid_search_results_with_scorer['param_solver'].data
lr_gs_test_acc = list(lr_grid_search_results_with_scorer['mean_test_accuracy'])
lr_gs_tr_acc = list(lr_grid_search_results_with_scorer['mean_train_accuracy'])
# Plot the accuracy series computed above. The cell used to plot lr_gs_tr and
# lr_gs_test, i.e. the MSE values, under an "Accuracy" heading.
line1, = plt.plot(lr_solvers, lr_gs_tr_acc, 'purple', label="Train Accuracy")
line2, = plt.plot(lr_solvers, lr_gs_test_acc, 'pink', label="Test Accuracy")
# font.size is 12 while the legend is created and 26 for the labels and title.
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('Accuracy')
plt.xlabel('Solver')
plt.title("Logistic Regression with GridSearch: Accuracy for different Solvers")
plt.show()

# Naive Bayes

In [None]:
# Grid-search the smoothing parameter alpha of multinomial naive Bayes (5-fold CV).
parameters = {'alpha': [0.001, 0.01, 0.1, .4, .5, .75, 1, 2]}
nb_clf = MultinomialNB()
# Refit on accuracy. The 'mean_squared_error' scorer is a higher-is-better
# scorer, so refit='mean_squared_error' would keep the candidate with the
# LARGEST error. For 0/1 labels (assumed for bechdal_int) MSE equals
# 1 - accuracy, so the most accurate candidate is also the lowest-MSE one.
nb_gs_clf = GridSearchCV(
    nb_clf,
    parameters,
    cv=5,
    return_train_score=True,
    scoring=scorer,
    refit='accuracy',
)
nb_gs_clf.fit(train_X, y_train)
# Predictions of the refitted best estimator on the held-out features.
nb_gs_y_pred = nb_gs_clf.predict(test_X)
nb_grid_search_results = nb_gs_clf.cv_results_.keys()

In [None]:
nb_grid_search_results_with_scorer = nb_gs_clf.cv_results_

In [None]:
nb_gs_test = [x for x in nb_grid_search_results_with_scorer['mean_test_mean_squared_error']]
nb_gs_tr = [x for x in nb_grid_search_results_with_scorer['mean_train_mean_squared_error']]
line1, = plt.plot(nb_grid_search_results_with_scorer['param_alpha'].data, nb_gs_tr, 'purple', label="Train MSE")
line2, = plt.plot(nb_grid_search_results_with_scorer['param_alpha'].data, nb_gs_test, 'pink', label="Test MSE")
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('MSE')
plt.xlabel('Alpha')
plt.title("Naive Bayes with GridSearch: MSE for different Alphas")
plt.show()

In [None]:
# Mean train/test accuracy over the CV folds for each alpha.
nb_alphas = nb_grid_search_results_with_scorer['param_alpha'].data
nb_gs_test_acc = list(nb_grid_search_results_with_scorer['mean_test_accuracy'])
nb_gs_tr_acc = list(nb_grid_search_results_with_scorer['mean_train_accuracy'])
# Plot the accuracy series computed above. The cell used to plot nb_gs_tr and
# nb_gs_test, i.e. the MSE values. The "Accuray" spelling is also corrected.
line1, = plt.plot(nb_alphas, nb_gs_tr_acc, 'purple', label="Train Accuracy")
line2, = plt.plot(nb_alphas, nb_gs_test_acc, 'pink', label="Test Accuracy")
# font.size is 12 while the legend is created and 26 for the labels and title.
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('Accuracy')
plt.xlabel('Alpha')
plt.title("Naive Bayes with GridSearch: Accuracy for different Alphas")
plt.show()

# Ada boost 

In [None]:
# Grid-search the number of boosting rounds for AdaBoost over depth-2 trees
# (5-fold CV).
ad_dt_clf = tree.DecisionTreeClassifier(max_depth=2)
parameters = {'n_estimators': [10, 20, 30, 40, 50, 60, 70]}
ab_clf = AdaBoostClassifier(ad_dt_clf)
# Refit on accuracy. The 'mean_squared_error' scorer is a higher-is-better
# scorer, so refit='mean_squared_error' would keep the candidate with the
# LARGEST error. For 0/1 labels (assumed for bechdal_int) MSE equals
# 1 - accuracy, so the most accurate candidate is also the lowest-MSE one.
ab_gs_clf = GridSearchCV(
    ab_clf,
    parameters,
    cv=5,
    return_train_score=True,
    scoring=scorer,
    refit='accuracy',
)
ab_gs_clf.fit(train_X, y_train)
# Predictions of the refitted best estimator on the held-out features.
ab_gs_y_pred = ab_gs_clf.predict(test_X)
ab_grid_search_results = ab_gs_clf.cv_results_.keys()

In [None]:
ab_grid_search_results_with_scorer = ab_gs_clf.cv_results_

In [None]:
ab_gs_test = [x for x in ab_grid_search_results_with_scorer['mean_test_mean_squared_error']]
ab_gs_tr = [x for x in ab_grid_search_results_with_scorer['mean_train_mean_squared_error']]
ab_gs_lab = ab_grid_search_results_with_scorer['param_n_estimators'].data
line1, = plt.plot(ab_gs_lab , ab_gs_tr, 'purple', label="Train MSE")
line2, = plt.plot(ab_gs_lab, ab_gs_test, 'pink', label="Test MSE")
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('MSE')
plt.xlabel('n estimators')
plt.title("AdaBoost with GridSearch: MSE for different N estimators")
plt.show()

In [None]:
ab_gs_test_acc = [x for x in ab_grid_search_results_with_scorer['mean_test_accuracy']]
ab_gs_tr_acc = [x for x in ab_grid_search_results_with_scorer['mean_train_accuracy']]
ab_gs_lab = ab_grid_search_results_with_scorer['param_n_estimators'].data
line1, = plt.plot(ab_gs_lab , ab_gs_tr_acc, 'purple', label="Train Accuracy")
line2, = plt.plot(ab_gs_lab, ab_gs_test_acc, 'pink', label="Test Accuracy")
plt.rcParams.update({'font.size': 12})
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.rcParams.update({'font.size': 26})
plt.ylabel('Accuracy')
plt.xlabel('n estimators')
plt.title("AdaBoost with GridSearch: Accuracy for different N estimators")
plt.show()

# Stacks

In [None]:
# Base learners shared by the stacking experiments, with fixed hyper-parameters.
dTree = tree.DecisionTreeClassifier(max_depth=4)
nb = MultinomialNB(alpha=0.1)
rf = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=0)
# AdaBoost here boosts the depth-4 tree defined above.
ab = AdaBoostClassifier(dTree, n_estimators=50)
lr = LogisticRegression(solver='lbfgs')

In [None]:
dn = [dTree,nb]
train2, test2 = stacking(dn,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=2)

In [None]:
simp_model = dTree
simp_model = simp_model.fit(train2, y_train)
y_pred_simp2 = simp_model.predict(test2)
accuracy_score(y_test, y_pred_simp2)

In [None]:
dr =[dTree, rf]
r1_train, r1_test = stacking(dr,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=2)

In [None]:
simp_model = dTree
simp_model = simp_model.fit(r1_train, y_train)
y_pred_simp2 = simp_model.predict(r1_test)
accuracy_score(y_test, y_pred_simp2)

In [None]:
da =[dTree, ab]

In [None]:
r2_train, r2_test = stacking(da,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=1)

In [None]:
model2 = AdaBoostClassifier(dTree, n_estimators = 50)
    
model2 = model2.fit(r2_train, y_train)
y_pred2 = model2.predict(r2_test)
accuracy_score(y_test, y_pred2)

In [None]:
dl  = [dTree,lr]

In [None]:
r3_train, r3_test = stacking(dl,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=2)

In [None]:
model3 = lr
    
model3 = model3.fit(r3_train, y_train)
y_pred3 = model3.predict(r3_test)
accuracy_score(y_test, y_pred3)

In [None]:
nr = [nb, rf]
r2_train, r2_test = stacking(nr,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=1)

In [None]:
model2 = nb
    
model2 = model2.fit(r2_train, y_train)
y_pred2 = model2.predict(r2_test)
accuracy_score(y_test, y_pred2)

In [None]:
na = [nb, ab]
r2_train, r2_test = stacking(na,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=1)


In [None]:
model2 = nb
    
model2 = model2.fit(r2_train, y_train)
y_pred2 = model2.predict(r2_test)
accuracy_score(y_test, y_pred2)

In [None]:
na = [nb, ab]
r2_train, r2_test = stacking(na,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=1)

In [None]:
model2 = nb
    
model2 = model2.fit(r2_train, y_train)
y_pred2 = model2.predict(r2_test)
accuracy_score(y_test, y_pred2)

In [None]:
nl = [nb, lr]
r2_train, r2_test = stacking(nl,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=1)

In [None]:
model2 = lr
    
model2 = model2.fit(r2_train, y_train)
y_pred2 = model2.predict(r2_test)
accuracy_score(y_test, y_pred2)

In [None]:
ra = [rf, ab]
r2_train, r2_test = stacking(ra,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=1)

In [None]:
model2 = ab
    
model2 = model2.fit(r2_train, y_train)
y_pred2 = model2.predict(r2_test)
accuracy_score(y_test, y_pred2)

In [None]:
rl = [rf, lr]
r2_train, r2_test = stacking(rl,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=1)

In [None]:
model2 = lr
    
model2 = model2.fit(r2_train, y_train)
y_pred2 = model2.predict(r2_test)
accuracy_score(y_test, y_pred2)

In [None]:
al = [ab, lr]
r2_train, r2_test = stacking(al,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=1)

In [None]:
model2 = lr
    
model2 = model2.fit(r2_train, y_train)
y_pred2 = model2.predict(r2_test)
accuracy_score(y_test, y_pred2)

In [None]:
ndr =[nb,dTree, rf]
r1_train, r1_test = stacking(ndr,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=1)

In [None]:
model3 = nb  
model3 = model3.fit(r1_train, y_train)
y_pred3 = model3.predict(r1_test)
accuracy_score(y_test, y_pred3)

In [None]:
dln =[nb,dTree, lr]
r1_train, r1_test = stacking(dln,train_X, y_train, test_X, regression=False, \
                           mode='oof_pred_bag', save_dir=None, metric=accuracy_score, \
                           n_folds=5, stratified=True,shuffle=True,verbose=1)

In [None]:
model3 = lr
model3 = model3.fit(r1_train, y_train)
y_pred3 = model3.predict(r1_test)
accuracy_score(y_test, y_pred3)