In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.linear_model import LogisticRegression

In [4]:
from sklearn.model_selection import GridSearchCV, train_test_split

In [5]:
from sklearn.pipeline import Pipeline

In [6]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [7]:
from sklearn.neighbors import KNeighborsClassifier

In [83]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [93]:
import spacy

In [9]:
posts = pd.read_csv('posts.csv')

In [142]:
posts['selftext'][1]

'Hi everyone, so Iâ\x80\x99m vegan and rescued some chickens that were abandoned. They started laying eggs so I started feeding the eggs back to them because I found that there is an implant to make them stop laying eggs is illegal in my country. At first I just fed the eggs to my chickens raw but later on I found it more convenient to cook them first, because that way I can cook the egg shells and grit right into the mixture and I have several disabled chickens who have trouble eating grit, in any other form other than with the scrambled egg mixture. I posted that I do this online and another vegan commented saying â\x80\x9ctrue actual/ethical vegansâ\x80\x9d do not cook eggs for their chickens first, they simply crack them open raw. She also said some other horrible things, to summarize Iâ\x80\x99m not a real vegan and a horrible chicken owner and she feels REALLY bad for my chickens. Etcetera.\r\n? I tried to ask but she wouldnâ\x80\x99t elaborate so I just ended up blocking her, so

In [118]:
X = posts['selftext']
y = posts['target']

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify = y, 
                                                    random_state = 42)

In [120]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((18522,), (18522,), (6175,), (6175,))

In [121]:
y_train.value_counts(normalize=True)

0    0.693284
1    0.306716
Name: target, dtype: float64

#  Vectorizers

In [129]:
nlp = spacy.load('en_core_web_md')

In [None]:
for post in nlp.pipe(X_train):
    tokens = 

In [135]:
def spacy_tokenizer(post):
    return [token.lemma_ for token in post in nlp.pipe(X_train) if token.pos_.lower() not in ['aux','punct','cconj','det','space','conj','adp','pron']]

In [136]:
tvec = TfidfVectorizer(
    min_df = 2,
    max_df = .95,
    max_features = 1000,
    stop_words = 'english',
    tokenizer = spacy_tokenizer)

In [137]:
X_train_tvec = tvec.fit_transform(X_train)

KeyboardInterrupt: 

In [143]:
X_train = [nlp(post) for post in X_train]

KeyboardInterrupt: 

In [None]:
pd.DataFrame(X_train_tvec.toarray(), columns = tvec.get_feature_names_out())

In [91]:
tokens = []
for token in X_train[0]:
    tk_dict = {
        'text': token.text,
        'POS': token.pos_,
        'POS_exp':spacy.explain(token.pos_),
        'dependency': token.dep_,
        'dep_exp': spacy.explain(token.dep_),
        'lemma': token.lemma_
    }
    tokens.append(tk_dict)

AttributeError: 'str' object has no attribute 'text'

In [15]:
# tvec_params = {
    'tvec__min_df': [2,3],
    'tvec__max_df': [.9,.95],
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words':['english',None],
    'tvec__max_features':[1000,2000]
#}

# LogReg

In [112]:
logr_pipe = Pipeline([
    ('tvec',TfidfVectorizer()),
    ('logr',LogisticRegression())
])

In [113]:
logr_params = {
    'logr__C':[0.1,0.5,0.9,1]
}

In [114]:
gs = GridSearchCV(logr_pipe,
                  param_grid = logr_params,
                  cv=5,
                 verbose=1,
                 n_jobs = -1)

In [115]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('logr', LogisticRegression())]),
             n_jobs=-1, param_grid={'logr__C': [0.1, 0.5, 0.9, 1]}, verbose=1)

In [116]:
gs.best_estimator_.fit(X_train,y_train)

Pipeline(steps=[('tvec', TfidfVectorizer()), ('logr', LogisticRegression(C=1))])

In [117]:
gs.best_estimator_.score(X_train, y_train), gs.best_estimator_.score(X_test, y_test)

(0.9094590217039197, 0.8744939271255061)

#  MNB

In [37]:
mnb_pipe = Pipeline(steps = [
    ('tvec',TfidfVectorizer()),
    ('mnb',MultinomialNB())
])

In [44]:
mnb_params = {
    'mnb__alpha':[0.1,0.5,0.9,1],
}

In [45]:
gs = GridSearchCV(mnb_pipe,
                  param_grid = mnb_params,
                  cv=5,
                 verbose=1,
                 n_jobs = -1)

In [46]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('mnb', MultinomialNB())]),
             n_jobs=-1, param_grid={'mnb__alpha': [0.1, 0.5, 0.9, 1]},
             verbose=1)

In [47]:
gs.best_estimator_.fit(X_train,y_train)

Pipeline(steps=[('tvec', TfidfVectorizer()), ('mnb', MultinomialNB(alpha=0.1))])

In [48]:
gs.best_estimator_.score(X_train, y_train), gs.best_estimator_.score(X_test, y_test)

(0.8934780261310874, 0.8262348178137652)

# Gaussian NB

In [69]:
gnb_pipe = Pipeline(steps = [
    ('tvec',TfidfVectorizer()),
    ('gnb',GaussianNB())
])

In [70]:
gnb_params = {
    'gnb__var_smoothing':[1e-8,1e-9,1e-10],
}

In [71]:
gs = GridSearchCV(gnb_pipe,
                  param_grid = gnb_params,
                  cv=5,
                 verbose=1,
                 n_jobs = -1)

In [72]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


15 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Ari\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ari\anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Ari\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 245, in fit
    return self._partial_fit(
  File "C:\Users\Ari\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:
gs.best_estimator_.fit(X_train,y_train)

In [None]:
gs.best_estimator_.score(X_train, y_train), gs.best_estimator_.score(X_test, y_test)

# Bernoulli NB

# KNN

In [49]:
knn_pipe = Pipeline(steps = [
    ('tvec',TfidfVectorizer()),
    ('knn',KNeighborsClassifier())
])

In [50]:
knn_params = {
    'knn__n_neighbors':[3,5,7,11]
}

In [51]:
gs = GridSearchCV(knn_pipe,
                  param_grid = knn_params,
                  cv=5,
                 verbose=1,
                 n_jobs = -1)

In [52]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('knn', KNeighborsClassifier())]),
             n_jobs=-1, param_grid={'knn__n_neighbors': [3, 5, 7, 11]},
             verbose=1)

In [53]:
gs.best_estimator_.fit(X_train,y_train)

Pipeline(steps=[('tvec', TfidfVectorizer()),
                ('knn', KNeighborsClassifier(n_neighbors=11))])

In [54]:
gs.best_estimator_.score(X_train, y_train), gs.best_estimator_.score(X_test, y_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


(0.8408919123204838, 0.8017813765182186)

# Decision Tree

In [55]:
dt_pipe = Pipeline(steps = [
    ('tvec',TfidfVectorizer()),
    ('dt',DecisionTreeClassifier())
])

In [56]:
dt_params = {
    'dt__max_depth':[3,5,7]
}

In [57]:
gs = GridSearchCV(dt_pipe,
                  param_grid = dt_params,
                  cv=5,
                 verbose=1,
                 n_jobs = -1)

In [58]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('dt', DecisionTreeClassifier())]),
             n_jobs=-1, param_grid={'dt__max_depth': [3, 5, 7]}, verbose=1)

In [59]:
gs.best_estimator_.fit(X_train,y_train)

Pipeline(steps=[('tvec', TfidfVectorizer()),
                ('dt', DecisionTreeClassifier(max_depth=7))])

In [60]:
gs.best_estimator_.score(X_train, y_train), gs.best_estimator_.score(X_test, y_test)

(0.8297160133894828, 0.8212145748987855)

# Random Forest

In [77]:
rf_pipe = Pipeline(steps = [
    ('tvec',TfidfVectorizer()),
    ('rf',RandomForestClassifier())
])

In [78]:
rf_params = {
    'rf__n_estimators':[30,50,100,200],
    'rf__max_depth':[3,5,7]
}

In [79]:
gs = GridSearchCV(rf_pipe,
                  param_grid = rf_params,
                  cv=5,
                 verbose=1,
                 n_jobs = -1)

In [80]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [3, 5, 7],
                         'rf__n_estimators': [30, 50, 100, 200]},
             verbose=1)

In [81]:
gs.best_estimator_.fit(X_train,y_train)

Pipeline(steps=[('tvec', TfidfVectorizer()),
                ('rf', RandomForestClassifier(max_depth=7, n_estimators=30))])

In [82]:
gs.best_estimator_.score(X_train, y_train), gs.best_estimator_.score(X_test, y_test)

(0.6964690638160026, 0.6947368421052632)

# AdaBoost

In [84]:
abc_pipe = Pipeline(steps = [
    ('tvec',TfidfVectorizer()),
    ('abc',AdaBoostClassifier())
])

In [85]:
abc_params = {
    'abc__n_estimators':[30,50,100,200],
}

In [86]:
gs = GridSearchCV(abc_pipe,
                  param_grid = abc_params,
                  cv=5,
                 verbose=1,
                 n_jobs = -1)

In [87]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('abc', AdaBoostClassifier())]),
             n_jobs=-1, param_grid={'abc__n_estimators': [30, 50, 100, 200]},
             verbose=1)

In [88]:
gs.best_estimator_.fit(X_train,y_train)

Pipeline(steps=[('tvec', TfidfVectorizer()),
                ('abc', AdaBoostClassifier(n_estimators=100))])

In [89]:
gs.best_estimator_.score(X_train, y_train), gs.best_estimator_.score(X_test, y_test)

(0.8701544109707375, 0.8604048582995951)

# XGBoost