In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [90]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

In [91]:
from sklearn.linear_model import LogisticRegression

In [157]:
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

In [93]:
from sklearn.pipeline import Pipeline

In [94]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [95]:
from sklearn.neighbors import KNeighborsClassifier

In [96]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [97]:
import xgboost as xg

In [98]:
import spacy

In [99]:
from nltk.corpus import stopwords

In [137]:
from nltk import word_tokenize 

In [138]:
from nltk.stem import WordNetLemmatizer

In [139]:
posts = pd.read_csv('posts.csv')

In [140]:
X = posts['selftext']
y = posts['target']

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify = y, 
                                                    random_state = 42)

In [142]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((18522,), (18522,), (6175,), (6175,))

In [143]:
y_train.value_counts(normalize=True)

0    0.693284
1    0.306716
Name: target, dtype: float64

In [144]:
import re

In [145]:
#X_train = [re.sub('http\S+','',post) for post in X_train]

In [146]:
nlp = spacy.load('en_core_web_md')

In [147]:
class spacy_tokenizer():
    def __init__(self):
        self
    def __call__(self,post):
        doc = nlp(post)
        return [token.lemma_ for token in doc if 
                         token.pos_.lower() not in ['aux','punct','cconj','det','space','conj','adp','pron','sym']]

In [148]:
tvec = TfidfVectorizer(stop_words = 'english', 
                       max_features = 1000, 
                       min_df = 2,
                       max_df = .9,
                       ngram_range = (1,1),
                       tokenizer = spacy_tokenizer())

In [149]:
X_train = tvec.fit_transform(X_train)




In [150]:
X_test = tvec.transform(X_test)

In [151]:
X_train = pd.DataFrame(X_train.toarray(), columns = tvec.get_feature_names_out())
X_test = pd.DataFrame(X_test.toarray(), columns = tvec.get_feature_names_out())

In [152]:
X_train.sum().sort_values(ascending = False).head(50)

vegan       1249.598444
eat          814.577687
just         706.280772
make         645.689529
animal       636.186979
food         586.315495
know         582.356413
want         525.673672
meat         489.845338
diet         487.714444
good         481.061734
think        474.931323
try          466.601340
feel         454.399555
people       449.176400
iâm        447.057466
plant        444.105976
use          442.698422
like         431.671991
really       429.907858
base         422.402375
say          402.781819
thank        387.616150
look         383.148770
time         382.283366
year         373.469761
day          372.818154
thing        353.280142
recipe       346.606304
[            334.038499
help         329.885639
product      327.226724
need         325.678891
love         313.379298
lot          310.016422
iâve       303.006584
start        301.346978
way          300.848584
milk         295.891218
meal         295.043665
buy          285.948762
itâs       265

In [22]:
model_dict = {'mnb': # 1st level 1st key
              {'model': # 2nd level 1st key
               ('mnb', MultinomialNB()),
               'params':{}
              },
  #            'knn': # 1st level 2nd key
   #           {'model': # 2nd level 1st key
    #           ('knn', KNeighborsClassifier()),
     #          'params':# 2nd level 2nd key
      #         {"knn__n_neighbors": [3,5,7,10],
       #         "knn__p":[1,2,3]}},
             'logr': # 1st level 2nd key
              {'model': # 2nd level 1st key
               ('logr', LogisticRegression()),
               'params':# 2nd level 2nd key
               {"logr__C": [1,10,100]}},
#             'abc': # 1st level 2nd key
 #             {'model': # 2nd level 1st key
  #             ('abc', AdaBoostClassifier()),
   #            'params':# 2nd level 2nd key
    #           {"abc__n_estimators": [30,50,100],
     #           "abc__learning_rate": [0.2,0.4,0.6]}}
             }

In [42]:
# set dictionary to house best estimators and params
best_models = {}
# for loop through the 3 1st level keys of model_model dict
for key, value in model_dict.items():
    # set pipeline with Count Vectorizer
    pipe = Pipeline([('tvec', TfidfVectorizer(stop_words = stopwords, 
                                              max_features = 1000, 
                                              min_df = 2,
                                              max_df = .9,
                                              ngram_range = (1,1),
                                              tokenizer = LemmaTokenizer()))])
    # append current model iteration tuple
    pipe.steps.append(value['model'])
    print(pipe)
    gs = GridSearchCV(pipe, # set as iteration of pipe
                  value['params'], # set as iteration of pipe_params
                      cv=5, # Cross Validation of 5
                      n_jobs=-1) # Unlock CPU to help process
    # fit the current iteration of GridSearchCV
    gs.fit(X_train, y_train)
    # save the best model, best params, and the pipe for scoring 
    best_models[key] = {'model': gs.best_estimator_,
                        'params': gs.best_params_,
                        'pipe': pipe}
    print(best_models)

NameError: name 'model_dict' is not defined

In [43]:
for key, value in best_models.items():
    value['model'].fit(X_train,y_train)
    print(value['model'].score(X_train, y_train), value['model'].score(X_test, y_test))

In [170]:
scores = []

# LogReg

In [176]:
gs_logr = GridSearchCV(LogisticRegression(),
                             param_grid = {'C':np.logspace(0.001,1000,num=10)},
                             cv=5,                
                             verbose=1,                  
                             n_jobs = -1)
gs_logr.fit(X_train, y_train)
logr = gs_logr.best_estimator_
logr.fit(X_train,y_train)


  return _nx.power(base, y)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


LogisticRegression(C=1.0023052380778996)

In [215]:
scores.append(['logr',
               [(k,v) for k,v in gs_logr.best_params_.items()],
               logr.score(X_train, y_train),
               logr.score(X_test, y_test),
               cross_val_score(logr,X_test,y_test,cv=5).mean()]) 

SyntaxError: invalid syntax (2251852093.py, line 2)

In [214]:
scores

[['logr',
  [('C', 1.0023052380778996)],
  0.8874851527912753,
  0.8723886639676114,
  0.8618623481781376]]

In [223]:
pd.DataFrame(logr.predict(X_test)).to_csv('./preds/logr.csv')

#  MNB

In [216]:
gs_mnb = GridSearchCV(MultinomialNB(),
                  param_grid = {'alpha':np.linspace(0,1,num=10)},
                  cv=5,
                 verbose=1,
                 n_jobs = -1)
gs_mnb.fit(X_train, y_train)
mnb = gs_mnb.best_estimator_
mnb.fit(X_train,y_train)
scores.append(['mnb',
               [(k,v) for k,v in gs_mnb.best_params_.items()],
               mnb.score(X_train, y_train),
               mnb.score(X_test, y_test),
               cross_val_score(mnb,X_test,y_test,cv=5).mean()]) 

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [224]:
pd.DataFrame(mnb.predict(X_test)).to_csv('./preds/mnb.csv')

In [225]:
scores

[['logr',
  [('C', 1.0023052380778996)],
  0.8874851527912753,
  0.8723886639676114,
  0.8618623481781376],
 ['mnb',
  [('alpha', 0.0)],
  0.8325234855847101,
  0.825748987854251,
  0.8242914979757086]]

# Gaussian NB

In [226]:
gs_gnb = GridSearchCV(GaussianNB(),
                  param_grid = {'var_smoothing':[1e-7,1e-8,1e-9,1e-10]},
                  cv=5,
                 verbose=1,
                 n_jobs = -1)
gs_gnb.fit(X_train, y_train)
gnb = gs.best_estimator_
gnb.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




MultinomialNB(alpha=0.0)

In [227]:
scores.append(['gnb',
               [(k,v) for k,v in gs_gnb.best_params_.items()],
               gnb.score(X_train, y_train),
               gnb.score(X_test, y_test),
               cross_val_score(mnb,X_test,y_test,cv=5).mean()]) 



In [228]:
pd.DataFrame(gnb.predict(X_test)).to_csv('./preds/gnb.csv')

In [229]:
scores

[['logr',
  [('C', 1.0023052380778996)],
  0.8874851527912753,
  0.8723886639676114,
  0.8618623481781376],
 ['mnb',
  [('alpha', 0.0)],
  0.8325234855847101,
  0.825748987854251,
  0.8242914979757086],
 ['gnb',
  [('var_smoothing', 1e-07)],
  0.8325234855847101,
  0.825748987854251,
  0.8242914979757086]]

# Bernoulli NB

# KNN

In [230]:
gs = GridSearchCV(KNeighborsClassifier(),
                  param_grid = {'n_neighbors':[3,5,7,10,15]},
                  cv=5,
                 verbose=1,
                 n_jobs = -1)
gs.fit(X_train, y_train)
knn = gs.best_estimator_
knn.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


KNeighborsClassifier(n_neighbors=3)

In [231]:
scores.append(['knn',
               [(k,v) for k,v in gs_knn.best_params_.items()],
               knn.score(X_train, y_train),
               knn.score(X_test, y_test),
               cross_val_score(knn,X_test,y_test,cv=5).mean()]) 

NameError: name 'gs_knn' is not defined

In [None]:
pd.DataFrame(knn.predict(X_test)).to_csv('./preds/gnb.csv')

In [None]:
scores

# Decision Tree

In [36]:
gs = GridSearchCV(DecisionTreeClassifier(),
                  param_grid = {'max_depth':[3,5,7]},
                  cv=5,
                 verbose=1,
                 n_jobs = -1)
gs.fit(X_train, y_train)
dt = gs.best_estimator_
dt.fit(X_train,y_train)
dt.score(X_train, y_train), dt.score(X_test, y_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


(0.831497678436454, 0.8297975708502024)

# Random Forest

In [37]:
gs = GridSearchCV(RandomForestClassifier(),
                  param_grid = {
                      'n_estimators':[30,50,100],
                      'max_depth':[3,5,7]},
                  cv=5,
                 verbose=1,
                 n_jobs = -1)
gs.fit(X_train, y_train)
rf = gs.best_estimator_
rf.fit(X_train,y_train)
rf.score(X_train, y_train), rf.score(X_test, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


(0.77286470143613, 0.7674493927125506)

# AdaBoost

In [None]:
gs = GridSearchCV(AdaBoostClassifier(),
                  param_grid = {
                      'n_estimators':[30,50,100,200]},
                  cv=5,
                 verbose=1,
                 n_jobs = -1)
gs.fit(X_train, y_train)
abc = gs.best_estimator_
abc.fit(X_train,y_train)
abc.score(X_train, y_train), abc.score(X_test, y_test)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


# XGBoost

In [None]:
import re

In [77]:
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

In [78]:
# cite source
[regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns]

['!',
 '#',
 '$',
 '%',
 '&',
 "'",
 "''",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '--',
 '.',
 '..',
 '...',
 '....',
 '/',
 '//www.youtube.com/watch',
 '1',
 '1.',
 '1/2',
 '10',
 '100',
 '15',
 '2',
 '2.',
 '20',
 '3',
 '3.',
 '30',
 '4',
 '5',
 '50',
 '6',
 '7',
 '8',
 ':',
 ';',
 '?',
 '@',
 '_',
 '\\',
 '\\-',
 '_',
 '``',
 'able',
 'absolutely',
 'abuse',
 'according',
 'acid',
 'action',
 'active',
 'activism',
 'activist',
 'actual',
 'actually',
 'add',
 'added',
 'adding',
 'advance',
 'advice',
 'age',
 'ago',
 'agree',
 'agriculture',
 'allergic',
 'allergy',
 'allowed',
 'almond',
 'alternative',
 'amazing',
 'american',
 'amp',
 'animal',
 'answer',
 'anxiety',
 'anybody',
 'anymore',
 'anyways',
 'app',
 'apparently',
 'apple',
 'appreciate',
 'appreciated',
 'approach',
 'area',
 'arenâ\x80\x99t',
 'argument',
 'article',
 'ask',
 'asked',
 'asking',
 'assume',
 'ate',
 'auto=webp',
 'available',
 'average',
 'avocado',
 'avoid',
 'aware',
 'away',
 'b12',
 'baby',
 'baco

In [44]:
gs = GridSearchCV(xg.XGBClassifier(),
                  param_grid = {
                      'n_estimators':[30,50,100,200]},
                  cv=5,
                 verbose=1,
                 n_jobs = -1)
gs.fit(X_train, y_train)
xgb = gs.best_estimator_.fit
xgb.fit(X_train,y_train)
xgb.score(X_train, y_train), xgb.score(X_test, y_test)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


KeyboardInterrupt: 

# Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
stack = StackingClassifier(estimators = [('logr',logr),
                                         ('mnb',mnb),
                                         ('knn',knn),
                                         ('dt',dt)
                                        ])

In [None]:
stack.fit(X_train,y_train)
stack.score(X_train,y_train),stack.score(X_test,y_test)