In [2]:
import os
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

import seaborn as sns
from matplotlib import pyplot as plt

In [7]:
reddit_df = pd.read_json('Clean_Data')

In [8]:
Xtrain,Xtest = train_test_split(reddit_df,test_size=0.2,shuffle=True,stratify=reddit_df.y,random_state=1)

In [9]:
ytrain = Xtrain.pop('y')
ytest = Xtest.pop('y')

In [10]:
tf_idf = TfidfVectorizer()

In [11]:
tftrain = tf_idf.fit_transform(Xtrain.selftext)
tftest = tf_idf.transform(Xtest.selftext)

In [13]:
bc = BaggingClassifier()

In [14]:
knn = KNeighborsClassifier(n_neighbors=20)
dtc = DecisionTreeClassifier()

In [15]:
params = {'base_estimator':[knn,dtc],
          'n_estimators':[100,200,300,400,500],
          'max_samples':[100], 'max_features':[1,2,3,4,5,6,7,8,9,10],
          'bootstrap':[True], 'bootstrap_features':[True]}

gcv = GridSearchCV(bc,params,cv=5,verbose = 1,n_jobs=-1)

gcv.fit(tftrain,ytrain)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 14.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'base_estimator': [KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=20, p=2,
           weights='uniform'), DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features... 'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'bootstrap': [True], 'bootstrap_features': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [16]:
gcv.best_estimator_

BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=20, p=2,
           weights='uniform'),
         bootstrap=True, bootstrap_features=True, max_features=1,
         max_samples=100, n_estimators=100, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [17]:
bc = BaggingClassifier(gcv.best_estimator_)

In [19]:
probs = gcv.predict_proba(tftest)

In [22]:
acc1 = []
for c in np.linspace(0.1,0.9,num=100):
    yhat = ['NTA' if i[0] >= c else 'YTA' for i in probs]
    acc1.append(accuracy_score(ytest,yhat))

thresh = np.linspace(0.1,0.9,num=100)[acc1.index(max(acc1))]

yhat = ['NTA' if i[0] >= thresh else 'YTA' for i in probs]

print(classification_report(ytest,yhat))

print(accuracy_score(ytest,yhat))

print(ytest.value_counts(normalize=True).max())

              precision    recall  f1-score   support

         NTA       0.57      1.00      0.73       652
         YTA       0.00      0.00      0.00       487

   micro avg       0.57      0.57      0.57      1139
   macro avg       0.29      0.50      0.36      1139
weighted avg       0.33      0.57      0.42      1139

0.57243195785777
0.57243195785777


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[Next Section](./Summary.md)