## Import Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

# Directory holding the processed train/test CSVs. Set the DREADDIT_DATA_DIR
# environment variable to run this notebook on another machine; the default
# keeps the original author's location so existing runs are unaffected.
DATA_DIR = os.environ.get(
    "DREADDIT_DATA_DIR",
    "/Users/jackmorrissey/code/rradulov/dreaddit/raw_data",
)

filepath = os.path.join(DATA_DIR, "model_processed_data.csv")
test_filepath = os.path.join(DATA_DIR, "model_processed_data_test.csv")

df = pd.read_csv(filepath)
df_test = pd.read_csv(test_filepath)

# Split each frame into target ('label') and features. 'id' is dropped
# together with the label so it is not fed to the models as a feature.
y_train = df.label
X_train = df.drop(columns=['label','id'])
y_test = df_test.label
X_test = df_test.drop(columns=['label','id'])

In [3]:
# Dimensions of the training feature frame as (rows, columns).
X_train.shape

(2833, 58)

In [4]:
# Dimensions of the test feature frame as (rows, columns).
X_test.shape

(715, 58)

## Decision Tree Classifier

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: when max_features is below the number of columns, the
# candidate features at each split are drawn at random, so an unseeded search
# can report different best_params_ / best_score_ on every run.
decision_tree = DecisionTreeClassifier(random_state=42)

# Grid: even depths 2..20 crossed with every max_features value from 1 up to
# (number of columns - 1).
parameters = {'max_depth' : range(2,21,2),
              # NOTE(review): range() excludes its upper bound, so "use all
              # features" (max_features == X_train.shape[1]) is never tried.
              'max_features': range(1,X_train.shape[1])
             }


# 5-fold cross-validated search scored on accuracy; refit=True retrains the
# best configuration on the full training set after the search.
grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1,
                           verbose=1, scoring = 'accuracy',
                           refit=True, cv=5).fit(X_train, y_train)

Fitting 5 folds for each of 570 candidates, totalling 2850 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 2677 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 2850 out of 2850 | elapsed:   10.3s finished


In [6]:
# Show the names of all hyperparameters exposed by the decision tree estimator.
decision_tree.get_params().keys()

dict_keys(['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])

In [7]:
# Hyperparameter combination selected by the decision-tree grid search.
grid_search.best_params_

{'max_depth': 4, 'max_features': 33}

In [8]:
# Mean cross-validated accuracy of the best decision-tree configuration.
grid_search.best_score_

0.6974884862988515

In [9]:
# from sklearn.model_selection import cross_val_score
# accuracy = cross_val_score(grid_search, X_train, y_train, cv=10, n_jobs=-1, scoring='accuracy').mean()
# accuracy

## Random Forest Classifier

In [10]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV

# random_forest = RandomForestClassifier()

# params_RF ={"max_depth": range(10,45,5), 
#             "min_samples_leaf": range(1,15,5),
#             "n_estimators":range(50,300,25)}

# grid_search_RF = GridSearchCV(random_forest, params_RF, n_jobs=-1,
#                            verbose=1, scoring = 'accuracy',
#                            refit=True, cv=5).fit(X_train, y_train)

In [11]:
# grid_search_RF.best_params_

In [12]:
# grid_search_RF.best_score_

In [40]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with fixed hyperparameters. random_state pins the bootstrap
# sampling and per-split feature sampling so the reported test accuracy is
# the same on every run instead of drifting between executions.
best_RF_model = RandomForestClassifier(
    max_depth=25,
    min_samples_leaf=6,
    n_estimators=250,
    random_state=42,
)

# Fit on the training split and display accuracy on the held-out test split.
best_RF_model.fit(X_train, y_train).score(X_test, y_test)

0.7286713286713287

## SVC

In [14]:
# from sklearn.svm import SVC

# params_SVC ={"C": [0.001, 0.01, 0.1, 1], 
#             "kernel": ('linear', 'poly', 'rbf')
#             }

# model_SVC = SVC()

# grid_search_SVC = GridSearchCV(model_SVC, params_SVC, n_jobs=-1,
#                            verbose=1, scoring = 'accuracy',
#                            refit=True, cv=5).fit(X_train, y_train)

In [15]:
# model_SVC.get_params

In [16]:
# best_SVC_params = grid_search_SVC.best_params_
# best_SVC_params

In [17]:
# grid_search_SVC.best_score_

In [39]:
from sklearn.svm import SVC

# Degree-2 polynomial-kernel SVC. probability=True enables predict_proba,
# which the soft-voting ensemble later in this notebook relies on. Those
# probability estimates are fitted on shuffled data, so random_state is set
# to keep them identical across runs.
best_SVC_model = SVC(
    degree=2,
    gamma='scale',
    kernel='poly',
    coef0=5,
    probability=True,
    random_state=42,
)

# Fit on the training split and display accuracy on the held-out test split.
best_SVC_model.fit(X_train, y_train).score(X_test, y_test)

0.7454545454545455

## Naive Bayes

In [23]:
# from sklearn.naive_bayes import MultinomialNB

# params_NB = {"alpha": [0.001, 0.01, 0.1, 1]}

# model_NB = MultinomialNB()
# grid_search_NB = GridSearchCV(model_NB, params_NB, n_jobs=-1,
#                                verbose=1, scoring = 'accuracy', 
#                                refit=True, cv=5).fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.1s finished


In [25]:
# grid_search_NB.best_params_

{'alpha': 0.001}

In [29]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with smoothing alpha=0.001; only constructed here,
# not fitted in this cell.
best_NB_model = MultinomialNB(alpha=0.001)

## Gradient Boosting Classifier

In [48]:
from sklearn.ensemble import GradientBoostingClassifier

# Seed the booster so the per-tree randomness (seed handed to each tree and
# the feature permutation at each split) is fixed and the search result is
# reproducible across runs.
model_gb = GradientBoostingClassifier(random_state=42)

# Grid: three learning rates crossed with n_estimators = 5, 10, ..., 145.
params_gb = {
            "learning_rate": [0.001, 0.01, 0.1],
            "n_estimators": range(5,150,5)
            }

# 5-fold cross-validated search scored on accuracy; refit=True retrains the
# best configuration on the full training set. Relies on GridSearchCV having
# been imported in an earlier cell.
grid_search_gb = GridSearchCV(model_gb, params_gb, n_jobs=-1,
                               verbose=1, scoring = 'accuracy', 
                               refit=True, cv=5).fit(X_train, y_train)

Fitting 5 folds for each of 87 candidates, totalling 435 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 435 out of 435 | elapsed:  1.0min finished


In [49]:
# Hyperparameter combination selected by the gradient-boosting grid search.
grid_search_gb.best_params_

{'learning_rate': 0.1, 'n_estimators': 65}

In [50]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with fixed learning rate and tree count. random_state
# fixes the per-tree randomness so the displayed test accuracy is repeatable.
best_model_gb = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=65,
    random_state=42,
)

# Fit on the training split and display accuracy on the held-out test split.
best_model_gb.fit(X_train, y_train).score(X_test, y_test)

0.7468531468531469

## KNN

In [61]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline estimator whose hyperparameters are varied by the search below.
model_KNN = KNeighborsClassifier()

# Search space: neighbour-lookup algorithm, Minkowski power p (1 or 2),
# vote weighting scheme, and k = 5, 10, ..., 45.
params_KNN = dict(
    algorithm=('ball_tree','kd_tree','brute'),
    p=[1,2],
    weights=('uniform', 'distance'),
    n_neighbors=range(5,50,5),
)

# Exhaustive 5-fold search on accuracy, using all cores; the winning
# configuration is refit on the whole training set. GridSearchCV comes from
# an import in an earlier cell.
grid_search_KNN = GridSearchCV(
    estimator=model_KNN,
    param_grid=params_KNN,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 320 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:    6.6s finished


In [62]:
# Hyperparameter combination selected by the KNN grid search.
grid_search_KNN.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 40, 'p': 1, 'weights': 'distance'}

In [63]:
# Mean cross-validated accuracy of the best KNN configuration.
grid_search_KNN.best_score_

0.7289116981696486

In [64]:
from sklearn.neighbors import KNeighborsClassifier

# KNN configured with fixed hyperparameters: 40 neighbours, distance-weighted
# votes, p=1, ball-tree lookup. Only constructed here, not fitted.
best_model_KNN = KNeighborsClassifier(
    n_neighbors=40,
    weights='distance',
    p=1,
    algorithm='ball_tree',
)

In [65]:
# Train on the training split, then display accuracy on the test split.
best_model_KNN.fit(X_train, y_train)
best_model_KNN.score(X_test, y_test)

0.7356643356643356

## Logistic Regression

In [86]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted with the 'saga' solver. That solver shuffles the
# data while optimising, so random_state is fixed to make the fitted
# coefficients, and therefore the displayed accuracy, repeatable.
best_log_reg = LogisticRegression(C=1, solver='saga', random_state=42)

# Fit on the training split and display accuracy on the held-out test split.
best_log_reg.fit(X_train, y_train).score(X_test, y_test)

0.7440559440559441

## Voting Classifier

In [90]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs that make up the ensemble. The commented-out
# entries are models built earlier in the notebook that are currently
# excluded from the ensemble.
estimators = [
    #('Random_Forest', best_RF_model), 
    ('SVC', best_SVC_model),
    ('NB', best_NB_model),
    ('GBC', best_model_gb),
    #("LOGISTIC", best_log_reg)
    #('KNN', best_model_KNN)
]

# Soft voting combines the members' predicted class probabilities, so every
# member must support predict_proba.
voting_classifier = VotingClassifier(voting='soft', estimators=estimators)

# Train on the training split, then display accuracy on the test split.
voting_classifier.fit(X_train, y_train)
voting_classifier.score(X_test, y_test)

0.7636363636363637

## Stacking Classifier

In [88]:
from sklearn.ensemble import StackingClassifier

# Stack the base learners held in `estimators` (defined in the voting
# classifier cell) under a logistic-regression meta-learner.
# LogisticRegression must already be imported by an earlier cell.
stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(C=1, solver='saga'),
    n_jobs=-1,
)

# Train on the training split, then display accuracy on the test split.
stacking_classifier.fit(X_train, y_train)
stacking_classifier.score(X_test, y_test)

0.7524475524475525