In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, matthews_corrcoef
from math            import sqrt
from sklearn.metrics import confusion_matrix, r2_score
from sklearn.metrics import matthews_corrcoef, accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle

In [2]:
df = pd.read_csv("../data/reddit_data_preprocessed.csv")

df.head()

Unnamed: 0,id,author,flair,title,body,comments,permalink,processed_url,body-empty,comments-empty,body_comments_combined,lemmatized_body_comments,lemmatized_title,lemmatized_body_only,lemmatized_comments_only,lemmatized_all_params
0,fqgrjr,legithousefly,Sports,My school’s 1980ish sports day score board,,Well when i was studying there i heard it from...,/r/india/comments/fqgrjr/my_schools_1980ish_sp...,my schools 1980ish sports day score board,True,False,Well when i was studying there i heard it from...,Well when i wa studying there i heard it from ...,My school s 1980ish sport day score board,,Well when i wa studying there i heard it from ...,My school s 1980ish sport day score boardWell ...
1,fhvl03,hipporama,Sports,Delhi Deputy Chief Minister Manish Sisodia: We...,,The players would still be at risk though.,/r/india/comments/fhvl03/delhi_deputy_chief_mi...,delhi deputy chief minister manish sisodia we,True,False,The players would still be at risk though.,The player would still be at risk though,Delhi Deputy Chief Minister Manish Sisodia We ...,,The player would still be at risk though,Delhi Deputy Chief Minister Manish Sisodia We ...
2,fl5tj6,d2a2d2a,Sports,What is a sport every Indian born before 1990 ...,One of the only team sports which can be playe...,We played this as kids too! But I don’t think ...,/r/india/comments/fl5tj6/what_is_a_sport_every...,what is a sport every indian born before 1990,False,False,One of the only team sports which can be playe...,One of the only team sport which can be played...,What is a sport every Indian born before 1990 ...,One of the only team sport which can be played...,We played this a kid too But I don t think I e...,What is a sport every Indian born before 1990 ...
3,exk8n6,hipporama,Sports,Govt cuts National Sports Federations & SAI bu...,,4D chess right there...,/r/india/comments/exk8n6/govt_cuts_national_sp...,govt cuts national sports federations sai budget,True,False,4D chess right there...,4D chess right there,Govt cut National Sports Federations SAI budge...,,4D chess right there,Govt cut National Sports Federations SAI budge...
4,fhb1v6,wildergears,Sports,Is snowboarding as a sport emerging trend in I...,Just like camping/hiking has boomed this decad...,But recent events like khelo india 2020 - Wint...,/r/india/comments/fhb1v6/is_snowboarding_as_a_...,is snowboarding as a sport emerging trend in i...,False,False,Just like camping/hiking has boomed this decad...,Just like camping hiking ha boomed this decade...,Is snowboarding a a sport emerging trend in India,Just like camping hiking ha boomed this decade...,But recent event like khelo india 2020 Winter ...,Is snowboarding a a sport emerging trend in In...


In [3]:
df.dtypes

id                          object
author                      object
flair                       object
title                       object
body                        object
comments                    object
permalink                   object
processed_url               object
body-empty                    bool
comments-empty                bool
body_comments_combined      object
lemmatized_body_comments    object
lemmatized_title            object
lemmatized_body_only        object
lemmatized_comments_only    object
lemmatized_all_params       object
dtype: object

In [4]:
# Replace missing values with empty strings so the vectorizers only ever
# receive text. fillna is the direct idiom for this; the previous
# replace(np.nan, "", regex=True) passed a regex flag that has no meaning
# for a NaN pattern.
df = df.fillna("")

### RUNNING MODELS

We will start off by choosing the params in the following order:
1. Body
2. Comments
3. Title
4. URL (permalink)
5. Body + Comments
6. Body + Comments + Title + URL

In [5]:
# Candidate feature sets, keyed by the label printed when results are reported.
choice_X = {
    label: df[column]
    for label, column in [
        ("body", "lemmatized_body_only"),
        ("comments", "lemmatized_comments_only"),
        ("title", "lemmatized_title"),
        ("URL", "processed_url"),
        ("body + comments", "lemmatized_body_comments"),
        ("All Params", "lemmatized_all_params"),
    ]
}

In [6]:
X = choice_X["All Params"]
y = df["flair"]

In [7]:
# Hold out a test set for the chosen features.
# stratify=y keeps the flair distribution the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)

### LOGISTIC REGRESSION

#### using Count Vectorizer

In [8]:
# Bag-of-words counts feeding a logistic-regression classifier
cvec_lr_pipe = Pipeline(steps=[
    ("cvec", CountVectorizer()),
    ("log_reg", LogisticRegression()),
])

# Grid with a single candidate per hyperparameter
cvec_pipe_params = {
    "cvec__max_features": [125],
    "cvec__ngram_range": [(1,2)],
    "cvec__stop_words" : [None],
}

# 5-fold cross-validated search over the grid above
cvec_lr_gs = GridSearchCV(estimator=cvec_lr_pipe, param_grid=cvec_pipe_params, cv=5)

# Fit on the training split only
cvec_lr_gs.fit(X_train, y_train)

TypeError: __init__() got an unexpected keyword argument 'penalty'

In [213]:
# Predicted labels for the training and test splits
cvec_lr_train_preds, cvec_lr_preds = (
    cvec_lr_gs.predict(split) for split in (X_train, X_test)
)

# Per-class probabilities for the test split
cvec_lr_probs = cvec_lr_gs.predict_proba(X_test)

In [214]:
accuracy_score(y_train, cvec_lr_train_preds)

0.6841415465268676

In [215]:
accuracy_score(y_test, cvec_lr_preds)

0.4

#### using TFIDF VECTORIZATION

In [216]:
# TF-IDF weights feeding a logistic-regression classifier
tvec_lr_pipe = Pipeline(steps=[
    ("tvec", TfidfVectorizer()),
    ("log_reg", LogisticRegression()),
])

# Grid with a single candidate per hyperparameter
tvec_pipe_params = {
    "tvec__max_features": [650],
    "tvec__ngram_range" : [(1,1)],
    "tvec__stop_words"  : [None],
}

# 5-fold cross-validated search over the grid above
tvec_lr_gs = GridSearchCV(estimator=tvec_lr_pipe, param_grid=tvec_pipe_params, cv=5)

# Fit on the training split only
tvec_lr_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [217]:
# Predicted labels for the training split
tvec_lr_train_preds = tvec_lr_gs.predict(X_train)

# Predicted labels for the test split
tvec_lr_preds       = tvec_lr_gs.predict(X_test)

# Per-class probabilities for the test split. This previously called
# predict(), which stored hard labels under the "probas" name.
tvec_lr_probas      = tvec_lr_gs.predict_proba(X_test)

In [218]:
accuracy_score(y_train, tvec_lr_train_preds)

0.9017038007863696

In [219]:
accuracy_score(y_test, tvec_lr_preds)

0.6823529411764706

### CREATING A FUNCTION FOR EACH OF THE MODELS

#### LOGISTIC REGRESSION

USING COUNT VECTORIZER

In [220]:
# FUNCTION FOR CVEC LR
def cvec_Logistic(X,y):
    """Train a CountVectorizer -> LogisticRegression pipeline and report accuracy.

    The data is split with ``train_test_split`` (``random_state=42``,
    stratified on ``y``). The pipeline is fitted through a 5-fold
    ``GridSearchCV`` on the training part, then the train accuracy, the test
    accuracy and the test confusion matrix are printed.

    Parameters
    ----------
    X : text documents, passed to ``train_test_split`` unchanged.
    y : class labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as computed by ``accuracy_score``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42,
                                                        stratify=y)

    # Pipeline: token counts -> logistic regression
    cvec_lr_pipe = Pipeline([("cvec", CountVectorizer()),
                             ("log_reg", LogisticRegression())])

    # Hyperparameters: a single candidate per setting
    cvec_pipe_params = {
        "cvec__max_features": [125],
        "cvec__ngram_range": [(1,2)],
        "cvec__stop_words" : [None]
    }

    # Grid search with 5-fold cross-validation
    cvec_lr_gs = GridSearchCV(cvec_lr_pipe,
                              param_grid=cvec_pipe_params,
                              cv=5)
    cvec_lr_gs.fit(X_train, y_train)

    # Score each split once and reuse the values for printing and returning.
    # The probabilities the old code computed were never used, so that call
    # is gone.
    cvec_lr_preds = cvec_lr_gs.predict(X_test)
    train_accuracy = accuracy_score(y_train, cvec_lr_gs.predict(X_train))
    test_accuracy = accuracy_score(y_test, cvec_lr_preds)

    print("Train Accuracy")
    print(train_accuracy)
    print("Test Accuracy")
    print(test_accuracy)

    print(confusion_matrix(y_test, cvec_lr_preds))

    return train_accuracy, test_accuracy

USING TFIDF VECTORIZER

In [221]:
# FUNCTION FOR TVEC LR
def tvec_Logistic(X,y):
    """Train a TfidfVectorizer -> LogisticRegression pipeline and report accuracy.

    The data is split with ``train_test_split`` (``random_state=42``,
    stratified on ``y``). The pipeline is fitted through a 5-fold
    ``GridSearchCV`` on the training part, then the train accuracy, the test
    accuracy and the test confusion matrix are printed.

    Parameters
    ----------
    X : text documents, passed to ``train_test_split`` unchanged.
    y : class labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as computed by ``accuracy_score``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42,
                                                        stratify=y)

    # Pipeline: TF-IDF weights -> logistic regression
    tvec_lr_pipe = Pipeline([("tvec", TfidfVectorizer()),
                             ("log_reg", LogisticRegression())])

    # Hyperparameters: a single candidate per setting
    tvec_pipe_params = {
        "tvec__max_features": [650],
        "tvec__ngram_range" : [(1,1)],
        "tvec__stop_words"  : [None]
    }

    # Grid search with 5-fold cross-validation
    tvec_lr_gs = GridSearchCV(tvec_lr_pipe,
                              param_grid=tvec_pipe_params,
                              cv=5)
    tvec_lr_gs.fit(X_train, y_train)

    # The old "tvec_lr_probas" was a second predict() call, not probabilities,
    # and was never used, so it is gone. Each split is scored once.
    tvec_lr_preds = tvec_lr_gs.predict(X_test)
    train_accuracy = accuracy_score(y_train, tvec_lr_gs.predict(X_train))
    test_accuracy = accuracy_score(y_test, tvec_lr_preds)

    print("Train Accuracy")
    print(train_accuracy)
    print("Test Accuracy")
    print(test_accuracy)

    print(confusion_matrix(y_test, tvec_lr_preds))

    return train_accuracy, test_accuracy

### SUPPORT VECTOR CLASSIFIER

#### using Count Vectorizer

In [222]:
def cvec_SVC(X,y):
    """Train a CountVectorizer -> SVC pipeline and report accuracy.

    Splits the data (``random_state=42``, stratified on ``y``), fits the
    pipeline through a 5-fold ``GridSearchCV`` on the training part, prints
    the train/test accuracy and the test confusion matrix, and returns
    ``(train_accuracy, test_accuracy)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, stratify=y
    )

    # Token counts feeding a support vector classifier
    pipeline = Pipeline([("cvec", CountVectorizer()), ("svc", SVC())])

    # Single candidate per hyperparameter
    grid = {
        "cvec__max_features": [319],
        "cvec__ngram_range": [(1,2)],
        "cvec__stop_words": [None],
        "svc__C": [1.0],
        "svc__kernel": ["rbf"],
        "svc__gamma": ["auto"],
    }

    search = GridSearchCV(pipeline, param_grid=grid, cv=5)
    search.fit(X_train, y_train)

    train_predictions = search.predict(X_train)
    test_predictions = search.predict(X_test)

    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)

    print("Train Accuracy")
    print(train_accuracy)
    print("Test Accuracy")
    print(test_accuracy)

    print(confusion_matrix(y_test, test_predictions))

    return train_accuracy, test_accuracy

#### using TFIDF Vectorizer

In [223]:
def tvec_SVC(X,y):
    """Train a TfidfVectorizer -> SVC pipeline and report accuracy.

    Splits the data (``random_state=42``, stratified on ``y``), fits the
    pipeline through a 5-fold ``GridSearchCV`` on the training part, prints
    the train/test accuracy and the test confusion matrix, and returns
    ``(train_accuracy, test_accuracy)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, stratify=y
    )

    # TF-IDF weights feeding a support vector classifier
    pipeline = Pipeline([("tvec", TfidfVectorizer()), ("svc", SVC())])

    # Single candidate per hyperparameter.
    # NOTE(review): max_features=1 limits the TF-IDF vocabulary to a single
    # term, whereas the count-vectorizer SVC uses 319. This looks like a
    # typo or leftover value; confirm before trusting this model's scores.
    grid = {
        "tvec__max_features": [1],
        "tvec__ngram_range": [(1,1)],
        "tvec__stop_words": [None],
        "svc__C": [1.0],
        "svc__kernel": ["rbf"],
        "svc__gamma": ["auto"],
    }

    search = GridSearchCV(pipeline, param_grid=grid, cv=5)
    search.fit(X_train, y_train)

    train_predictions = search.predict(X_train)
    test_predictions = search.predict(X_test)

    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)

    print("Train Accuracy")
    print(train_accuracy)
    print("Test Accuracy")
    print(test_accuracy)

    print(confusion_matrix(y_test, test_predictions))

    return train_accuracy, test_accuracy

### RANDOM FOREST CLASSIFIER 

#### using Count Vectoriser 

In [249]:
def cvec_RF(X,y):
    """Train a CountVectorizer -> RandomForestClassifier pipeline and report accuracy.

    The data is split with ``train_test_split`` (``random_state=42``,
    stratified on ``y``). The pipeline is fitted through a 5-fold
    ``GridSearchCV`` on the training part, then the train accuracy, the test
    accuracy and the test confusion matrix are printed.

    Parameters
    ----------
    X : text documents, passed to ``train_test_split`` unchanged.
    y : class labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as computed by ``accuracy_score``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42,
                                                        stratify=y)

    # Pipeline: token counts -> random forest (seeded for repeatable fits)
    cvec_rf_pipe = Pipeline([("cvec", CountVectorizer()),
                             ("rf", RandomForestClassifier(random_state = 42))])

    # Hyperparameters: a single candidate per setting
    cvec_pipe_params = {"cvec__max_features"   : [1000],
                        "cvec__ngram_range"    : [(1,1)],
                        "cvec__stop_words"     : [None],
                        "rf__n_estimators"     : [72],
                        "rf__min_samples_split": [6],
                        "rf__min_samples_leaf" : [2],
                        "rf__max_depth"        : [20]}

    # Grid search with 5-fold cross-validation, run on 6 parallel jobs
    cvec_rf_gs = GridSearchCV(cvec_rf_pipe,
                              param_grid = cvec_pipe_params,
                              cv         = 5,
                              n_jobs     = 6)

    # Fit once. The earlier version rebuilt and refitted an identical
    # pipeline a second time, doubling the training cost for the same model.
    cvec_rf_gs.fit(X_train, y_train)

    # The unused predict_proba call is gone; each split is scored once.
    cvec_rf_preds = cvec_rf_gs.predict(X_test)
    train_accuracy = accuracy_score(y_train, cvec_rf_gs.predict(X_train))
    test_accuracy = accuracy_score(y_test, cvec_rf_preds)

    print("Train Accuracy")
    print(train_accuracy)
    print("Test Accuracy")
    print(test_accuracy)

    print(confusion_matrix(y_test, cvec_rf_preds))

    return train_accuracy, test_accuracy

#### using TFIDF Vectorizer

In [225]:
def tvec_RF(X,y):
    """Train a TfidfVectorizer -> RandomForestClassifier pipeline and report accuracy.

    The data is split with ``train_test_split`` (``random_state=42``,
    stratified on ``y``). The pipeline is fitted through a 5-fold
    ``GridSearchCV`` on the training part, then the train accuracy, the test
    accuracy and the test confusion matrix are printed.

    Parameters
    ----------
    X : text documents, passed to ``train_test_split`` unchanged.
    y : class labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as computed by ``accuracy_score``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42,
                                                        stratify=y)

    # Pipeline: TF-IDF weights -> random forest (seeded for repeatable fits)
    tvec_rf_pipe = Pipeline([("tvec", TfidfVectorizer()),
                             ("rf", RandomForestClassifier(random_state = 42))])

    # Hyperparameters: a single candidate per setting
    tvec_pipe_params = {"tvec__max_features"   : [250],
                        "tvec__ngram_range"    : [(1,2)],
                        "tvec__stop_words"     : [None],
                        "rf__n_estimators"     : [30],
                        "rf__min_samples_split": [6],
                        "rf__min_samples_leaf" : [2],
                        "rf__max_depth"        : [12]}

    # Grid search with 5-fold cross-validation, run on 6 parallel jobs
    tvec_rf_gs = GridSearchCV(tvec_rf_pipe,
                              param_grid = tvec_pipe_params,
                              cv         = 5,
                              n_jobs     = 6)

    # Fit the model to the training data
    tvec_rf_gs.fit(X_train, y_train)

    # The unused predict_proba call is gone; each split is scored once.
    tvec_rf_preds = tvec_rf_gs.predict(X_test)
    train_accuracy = accuracy_score(y_train, tvec_rf_gs.predict(X_train))
    test_accuracy = accuracy_score(y_test, tvec_rf_preds)

    print("Train Accuracy")
    print(train_accuracy)
    print("Test Accuracy")
    print(test_accuracy)

    print(confusion_matrix(y_test, tvec_rf_preds))

    return train_accuracy, test_accuracy

### RUNNING ACCURACY TESTS

In [226]:
y = df["flair"]
import warnings
warnings.simplefilter('ignore')

# Run every model on every feature set; each runner prints its own scores.
for i, x_input in choice_X.items():
    print("When Considering", i)

    print("\n\nUSING COUNT VECTORIZER")
    for heading, run_model in (("Logistic Regression", cvec_Logistic),
                               ("Support Vector Classifier", cvec_SVC),
                               ("Random Forest Classifier", cvec_RF)):
        print(heading)
        run_model(x_input, y)

    print("\n\nUSING TFIDF VECTORIZER")
    for heading, run_model in (("Logistic Regression", tvec_Logistic),
                               ("Support Vector Classifier", tvec_SVC),
                               ("Random Forest Classifier", tvec_RF)):
        print(heading)
        run_model(x_input, y)

When Considering body


USING COUNT VECTORIZER
Logistic Regression
Train Accuracy
0.49934469200524245
Test Accuracy
0.2235294117647059
[[ 3  4  1  2  1  0  1  4  6  3  0]
 [ 1 10  5  1  0  3  0  3  2  0  0]
 [ 0  2 17  0  0  0  1  5  0  0  0]
 [ 1  2 14  2  0  0  2  3  0  1  0]
 [ 1  1 10  1 10  0  0  1  1  0  0]
 [ 1  6  9  0  0  5  0  1  3  0  0]
 [ 0  1 17  2  0  1  0  2  1  1  0]
 [ 3  1 10  1  1  1  1  3  1  2  1]
 [ 3  2 12  1  0  2  0  1  4  0  0]
 [ 1  1 10  1  1  1  1  1  5  3  0]
 [ 0  0  3  0  1  0  0  0  1  0  0]]
Support Vector Classifier
Train Accuracy
0.3918741808650065
Test Accuracy
0.23529411764705882
[[11  3  9  1  0  1  0  0  0  0  0]
 [ 4  9  6  1  0  4  0  1  0  0  0]
 [ 2  0 23  0  0  0  0  0  0  0  0]
 [ 5  2 18  0  0  0  0  0  0  0  0]
 [ 3  1 11  0 10  0  0  0  0  0  0]
 [ 5  4  9  0  0  7  0  0  0  0  0]
 [ 4  1 18  0  0  2  0  0  0  0  0]
 [ 6  1 16  0  0  2  0  0  0  0  0]
 [ 4  6 13  0  0  2  0  0  0  0  0]
 [ 6  2 15  0  0  2  0  0  0  0  0]
 [ 1  0  3  0 

##### **We observe that the [R]eddiquette flair is not predicted by any of the models above. This is probably because very few samples carry that flair (in both the training and the test data), so it is safe to drop the [R]eddiquette flair from the rest of the analysis.**

In the confusion matrices above, [R]eddiquette is the last row and the last column.

In [227]:
# The flair labels present in the dataset; [R]eddiquette is kept last.
flairs = [
    "Sports",
    "Politics",
    "AskIndia",
    "Business/Finance",
    "Food",
    "Science/Technology",
    "Non-Political",
    "Photography",
    "Policy/Economy",
    "Scheduled",
    "[R]eddiquette",
]

flairs

['Sports',
 'Politics',
 'AskIndia',
 'Business/Finance',
 'Food',
 'Science/Technology',
 'Non-Political',
 'Photography',
 'Policy/Economy',
 'Scheduled',
 '[R]eddiquette']

In [228]:
flairs[-1]

'[R]eddiquette'

In [229]:
# df1 keeps every row whose flair is not [R]eddiquette
is_reddiquette = df["flair"].eq('[R]eddiquette')
df1 = df[~is_reddiquette]

# Sanity check: the remaining flair labels
df1["flair"].unique()

array(['Sports', 'Politics', 'AskIndia', 'Business/Finance', 'Food',
       'Science/Technology', 'Non-Political', 'Photography',
       'Policy/Economy', 'Scheduled'], dtype=object)

#### DOING THE SAME PROCESS OVER FOR `df1`

In [230]:
# Targets and candidate feature sets, rebuilt from the filtered frame df1
y = df1["flair"]
choice_X = {
    label: df1[column]
    for label, column in {
        "body": "lemmatized_body_only",
        "comments": "lemmatized_comments_only",
        "title": "lemmatized_title",
        "URL": "processed_url",
        "body + comments": "lemmatized_body_comments",
        "All Params": "lemmatized_all_params",
    }.items()
}

In [231]:
accuracy_sheet = []

accuracy_sheet

[]

In [232]:
import warnings; warnings.simplefilter('ignore')

# Start from an empty sheet every time this cell runs, so re-running it
# cannot append a second copy of every row.
accuracy_sheet = []

# One entry per vectorizer: (label stored in the sheet, banner printed,
# [(model label stored in the sheet, heading printed, runner function), ...]).
model_runs = [
    ("Count Vectorizer", "\n\nUSING COUNT VECTORIZER", [
        ("LR", "Logistic Regression", cvec_Logistic),
        ("SVC", "Support Vector Classifier", cvec_SVC),
        ("RF", "Random Forest Classifier", cvec_RF),
    ]),
    ("TFIDF Vectorizer", "\n\nUSING TFIDF VECTORIZER", [
        ("LR", "Logistic Regression", tvec_Logistic),
        ("SVC", "Support Vector Classifier", tvec_SVC),
        ("RF", "Random Forest Classifier", tvec_RF),
    ]),
]

for i, x_input in choice_X.items():
    print("When Considering", i)

    for vectorizer_label, banner, runners in model_runs:
        print(banner)

        for model_label, heading, run_model in runners:
            print(heading)
            # Each runner returns (train accuracy, test accuracy)
            train_acc, test_acc = run_model(x_input, y)
            # Row layout: feature set, vectorizer, model, train acc, test acc
            accuracy_sheet.append(
                [i, vectorizer_label, model_label, train_acc, test_acc]
            )

When Considering body


USING COUNT VECTORIZER
Logistic Regression
Train Accuracy
0.492
Test Accuracy
0.232
[[ 1  3  3  4  1  1  1  4  4  3]
 [ 0  9  5  0  1  2  0  4  3  1]
 [ 0  0 19  0  1  0  0  4  0  1]
 [ 1  2 15  1  0  0  2  3  0  1]
 [ 0  0  9  0 14  0  0  1  1  0]
 [ 0  5 13  0  0  3  0  2  2  0]
 [ 0  1 16  1  1  0  2  3  0  1]
 [ 2  1  9  1  2  0  2  4  1  3]
 [ 2  3 12  1  0  2  0  2  3  0]
 [ 0  1 10  1  2  0  2  2  5  2]]
Support Vector Classifier
Train Accuracy
0.39066666666666666
Test Accuracy
0.236
[[12  0  6  1  0  6  0  0  0  0]
 [ 5  2  5  0  0 11  0  1  1  0]
 [ 1  0 24  0  0  0  0  0  0  0]
 [ 4  2 19  0  0  0  0  0  0  0]
 [ 1  1 10  0 13  0  0  0  0  0]
 [ 3  1 13  0  0  8  0  0  0  0]
 [ 6  1 17  0  0  1  0  0  0  0]
 [ 6  1 15  0  0  3  0  0  0  0]
 [ 4  1 13  0  0  7  0  0  0  0]
 [ 6  0 15  0  0  4  0  0  0  0]]
Random Forest Classifier
Train Accuracy
0.6253333333333333
Test Accuracy
0.436
[[18  1  1  0  0  2  0  2  1  0]
 [ 2 18  5  0  0  0  0  0  0  0]
 [ 2

In [233]:
accuracy_sheet

[['body', 'Count Vectorizer', 'LR', 0.492, 0.232],
 ['body', 'Count Vectorizer', 'SVC', 0.39066666666666666, 0.236],
 ['body', 'Count Vectorizer', 'RF', 0.6253333333333333, 0.436],
 ['body', 'TFIDF Vectorizer', 'LR', 0.52, 0.324],
 ['body', 'TFIDF Vectorizer', 'SVC', 0.15466666666666667, 0.168],
 ['body', 'TFIDF Vectorizer', 'RF', 0.608, 0.328],
 ['comments', 'Count Vectorizer', 'LR', 0.52, 0.148],
 ['comments', 'Count Vectorizer', 'SVC', 0.204, 0.124],
 ['comments', 'Count Vectorizer', 'RF', 0.62, 0.18],
 ['comments', 'TFIDF Vectorizer', 'LR', 0.6826666666666666, 0.164],
 ['comments', 'TFIDF Vectorizer', 'SVC', 0.11866666666666667, 0.108],
 ['comments', 'TFIDF Vectorizer', 'RF', 0.524, 0.16],
 ['title', 'Count Vectorizer', 'LR', 0.8346666666666667, 0.696],
 ['title', 'Count Vectorizer', 'SVC', 0.46266666666666667, 0.408],
 ['title', 'Count Vectorizer', 'RF', 0.8106666666666666, 0.704],
 ['title', 'TFIDF Vectorizer', 'LR', 0.9146666666666666, 0.724],
 ['title', 'TFIDF Vectorizer', 'SVC

In [240]:
# Turn the collected rows into a table with named columns
accuracy_dataframe = pd.DataFrame.from_records(accuracy_sheet)
accuracy_dataframe.columns = [
    'Considering',
    'Vectorizer',
    'Model',
    'Train Accuracy',
    'Test Accuracy',
]

# Express both accuracy columns as percentages rounded to two decimals
for accuracy_column in ("Train Accuracy", "Test Accuracy"):
    accuracy_dataframe[accuracy_column] = round(accuracy_dataframe[accuracy_column] * 100, 2)

accuracy_dataframe

Unnamed: 0,Considering,Vectorizer,Model,Train Accuracy,Test Accuracy
0,body,Count Vectorizer,LR,49.2,23.2
1,body,Count Vectorizer,SVC,39.07,23.6
2,body,Count Vectorizer,RF,62.53,43.6
3,body,TFIDF Vectorizer,LR,52.0,32.4
4,body,TFIDF Vectorizer,SVC,15.47,16.8
5,body,TFIDF Vectorizer,RF,60.8,32.8
6,comments,Count Vectorizer,LR,52.0,14.8
7,comments,Count Vectorizer,SVC,20.4,12.4
8,comments,Count Vectorizer,RF,62.0,18.0
9,comments,TFIDF Vectorizer,LR,68.27,16.4


In [241]:
# EXPORT THE ACCURACY STATISTICS
accuracy_dataframe.to_csv("../data/accuracy_stats.csv", index=False)

---

### We see that the following models perform the best:

1. title + CV + LR 
2. title + CV + RF
3. title + TFIDF + LR
4. all params + CV + RF **(highest)**
5. all params + TFIDF + LR
6. all params + TFIDF + RF

**Hence, we export the model with the highest accuracy, i.e. Model 4.**

In [247]:
def cvec_RF_model_export(X, y, finalname="../models/CVRF_allParams.sav"):
    """Fit the CountVectorizer -> RandomForestClassifier grid search and pickle it.

    The data is split with ``train_test_split`` (``random_state=42``,
    stratified on ``y``) and the grid search is fitted on the training part
    only. The fitted ``GridSearchCV`` object is then written to ``finalname``
    with ``pickle``.

    Parameters
    ----------
    X : text documents, passed to ``train_test_split`` unchanged.
    y : class labels aligned with ``X``.
    finalname : path of the pickle file to write. Defaults to the path that
        was previously hard-coded.

    Returns
    -------
    None
    """
    # Only the training part is needed here; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(X, y,
                                              random_state=42,
                                              stratify=y)

    # Pipeline: token counts -> random forest (seeded for repeatable fits)
    cvec_rf_pipe = Pipeline([("cvec", CountVectorizer()),
                             ("rf", RandomForestClassifier(random_state = 42))])

    # Hyperparameters: a single candidate per setting
    cvec_pipe_params = {"cvec__max_features"   : [1000],
                        "cvec__ngram_range"    : [(1,1)],
                        "cvec__stop_words"     : [None],
                        "rf__n_estimators"     : [72],
                        "rf__min_samples_split": [6],
                        "rf__min_samples_leaf" : [2],
                        "rf__max_depth"        : [20]}

    # Grid search with 5-fold cross-validation, run on 6 parallel jobs
    cvec_rf_gs = GridSearchCV(cvec_rf_pipe,
                              param_grid = cvec_pipe_params,
                              cv         = 5,
                              n_jobs     = 6)

    # Fit once. The earlier version fitted an identical search twice and
    # computed predictions that were never used.
    cvec_rf_gs.fit(X_train, y_train)

    # The context manager closes the file even if pickling fails; the old
    # bare open() call never closed the handle.
    with open(finalname, "wb") as model_file:
        pickle.dump(cvec_rf_gs, model_file)

In [250]:
cvec_RF(choice_X["All Params"], df1["flair"])

Train Accuracy
0.9573333333333334
Test Accuracy
0.828
[[18  2  0  0  0  1  0  2  1  1]
 [ 1 23  0  0  0  0  0  0  0  1]
 [ 1  0 21  1  0  1  0  0  0  1]
 [ 0  2  0 23  0  0  0  0  0  0]
 [ 0  0  1  0 21  0  0  0  1  2]
 [ 0  3  0  0  0 20  0  0  0  2]
 [ 0  0  1  0  0  4 18  0  0  2]
 [ 2  0  2  0  0  0  2 17  0  2]
 [ 0  2  0  0  0  1  0  0 22  0]
 [ 0  0  0  0  0  1  0  0  0 24]]


(0.9573333333333334, 0.828)

In [251]:
# EXPORTING MODEL
cvec_RF_model_export(choice_X["All Params"], df1["flair"])