In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from scipy import stats

import statsmodels.formula.api as smf
import lxml

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
# pd.options.display.float_format = '{:.4f}'.format

In [2]:
def get_data():
    """method to get the data from CSVs"""

    training_data = pd.read_csv("../raw_data/dreaddit-train.csv")
    test_data = pd.read_csv("../raw_data/dreaddit-test.csv")

    #print('success')

    return training_data, test_data

In [3]:
def clean_data(df):
    cols_to_remove = ['syntax_ari',
                   'syntax_fk_grade',
                   'lex_dal_max_pleasantness',
                   'lex_dal_max_activation',
                   'lex_dal_max_imagery',
                   'lex_dal_min_pleasantness',
                   'lex_dal_min_activation',
                   'lex_dal_min_imagery',
                   'lex_dal_avg_activation',
                   'lex_dal_avg_imagery',
                   'lex_dal_avg_pleasantness',
                   'sentiment',
                   'post_id',
                   'social_timestamp',
                   'subreddit',
                   'sentence_range']
    df.drop(columns=cols_to_remove, inplace=True)
    
    rows_to_remove = df['text'].loc[df['text'].apply(lambda x: len(x)<35)].index.to_list()
    
    df.drop(labels=rows_to_remove, inplace=True)
    
    df['pct_caps'] = df['text'].apply(lambda x: sum([char.isupper() for char in x])/len(x))
    df['text'] = df['text'].apply(lambda x: x.lower())
    
    df.set_index('id', inplace=True)
    
    df.drop(columns=['text'], inplace=True) #keeping this separate if anything changes and we need the text
    
    #adding the capping on social variables (social_karma, 'social_num_comments', 
    df['social_karma'] = df['social_karma'].apply(lambda x: 200 if x>200 else x)
    df['social_num_comments'] = df['social_num_comments'].apply(lambda x: 100 if x>100 else x)
    
    return df

In [39]:
training_data.pop('label')

0       1
1       0
2       1
3       1
4       1
       ..
2833    0
2834    1
2835    0
2836    0
2837    1
Name: label, Length: 2838, dtype: int64

In [41]:
training_data.shape

(2838, 115)

In [43]:
training_data.shape

(2838, 116)

In [42]:
training_data, test_data =  get_data()

In [5]:
clean_train = clean_data(training_data)
clean_test = clean_data(test_data)

In [6]:
clean_train.shape, clean_test.shape

((2833, 99), (715, 99))

### Preproc pipeline and feature selection

## Feature Selection

In [7]:
from sklearn.feature_selection import mutual_info_classif
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [8]:
def calc_MI(df_clean):
    
    features = df_clean.drop(columns=['label','confidence'])
    MI = mutual_info_classif(features, df_clean['label'])
    headers = features.columns
    MI_vars = pd.Series(index=headers, data=MI).sort_values(ascending=False)
    MI_vars_selected = MI_vars[MI_vars>0]
    
    df_post_MI  = df_clean[MI_vars_selected.index.to_list()]
    
    return df_post_MI

In [9]:
def calc_vif(df_post_MI):
    vif = pd.DataFrame()
    vif['variables'] = df_post_MI.columns
    vif['VIF'] = [variance_inflation_factor(df_post_MI, i) for i in range(df_post_MI.shape[1])]
    
    VIF_df = vif[vif['VIF']<30].reset_index(drop=True)
    
    return VIF_df['variables'].to_list()

In [10]:
selected_features = calc_vif(calc_MI(clean_train))

In [12]:
preproc_pipe = Pipeline([
    ('feature_selector', ColumnTransformer([
        ('feature_selector', 'passthrough', selected_features)
    ], remainder='drop')),
    
    ('scaling', MinMaxScaler())
])

In [13]:
X_train = preproc_pipe.fit_transform(clean_train)

In [14]:
X_test = preproc_pipe.transform(clean_test)

In [15]:
# df_X_train_scaled = pd.DataFrame(columns = selected_features,
#                                  data = X_train)
# df_X_train_scaled.head(3)

In [16]:
# df_X_test_scaled = pd.DataFrame(columns = selected_features,
#                                 data = X_test)
# df_X_test_scaled.head(3)

In [17]:
#df_X_train_scaled.shape, df_X_test_scaled.shape, len(selected_features)

In [18]:
clean_train.head(2)

Unnamed: 0_level_0,label,confidence,social_karma,lex_liwc_WC,lex_liwc_Analytic,lex_liwc_Clout,lex_liwc_Authentic,lex_liwc_Tone,lex_liwc_WPS,lex_liwc_Sixltr,lex_liwc_Dic,lex_liwc_function,lex_liwc_pronoun,lex_liwc_ppron,lex_liwc_i,lex_liwc_we,lex_liwc_you,lex_liwc_shehe,lex_liwc_they,lex_liwc_ipron,lex_liwc_article,lex_liwc_prep,lex_liwc_auxverb,lex_liwc_adverb,lex_liwc_conj,lex_liwc_negate,lex_liwc_verb,lex_liwc_adj,lex_liwc_compare,lex_liwc_interrog,lex_liwc_number,lex_liwc_quant,lex_liwc_affect,lex_liwc_posemo,lex_liwc_negemo,lex_liwc_anx,lex_liwc_anger,lex_liwc_sad,lex_liwc_social,lex_liwc_family,lex_liwc_friend,lex_liwc_female,lex_liwc_male,lex_liwc_cogproc,lex_liwc_insight,lex_liwc_cause,lex_liwc_discrep,lex_liwc_tentat,lex_liwc_certain,lex_liwc_differ,lex_liwc_percept,lex_liwc_see,lex_liwc_hear,lex_liwc_feel,lex_liwc_bio,lex_liwc_body,lex_liwc_health,lex_liwc_sexual,lex_liwc_ingest,lex_liwc_drives,lex_liwc_affiliation,lex_liwc_achieve,lex_liwc_power,lex_liwc_reward,lex_liwc_risk,lex_liwc_focuspast,lex_liwc_focuspresent,lex_liwc_focusfuture,lex_liwc_relativ,lex_liwc_motion,lex_liwc_space,lex_liwc_time,lex_liwc_work,lex_liwc_leisure,lex_liwc_home,lex_liwc_money,lex_liwc_relig,lex_liwc_death,lex_liwc_informal,lex_liwc_swear,lex_liwc_netspeak,lex_liwc_assent,lex_liwc_nonflu,lex_liwc_filler,lex_liwc_AllPunc,lex_liwc_Period,lex_liwc_Comma,lex_liwc_Colon,lex_liwc_SemiC,lex_liwc_QMark,lex_liwc_Exclam,lex_liwc_Dash,lex_liwc_Quote,lex_liwc_Apostro,lex_liwc_Parenth,lex_liwc_OtherP,social_upvote_ratio,social_num_comments,pct_caps
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1
33181,1,0.8,5,116,72.64,15.04,89.26,1.0,29.0,12.93,87.07,56.03,16.38,12.07,9.48,0.0,0.86,1.72,0.0,4.31,3.45,19.83,7.76,5.17,4.31,1.72,16.38,6.03,3.45,0.86,1.72,1.72,8.62,1.72,6.9,0.86,2.59,3.45,3.45,0.0,0.0,0.0,1.72,11.21,3.45,0.86,2.59,5.17,0.0,2.59,6.03,1.72,1.72,1.72,2.59,0.86,1.72,0.0,0.0,8.62,0.0,1.72,4.31,0.86,2.59,4.31,11.21,0.86,17.24,0.86,10.34,6.03,0.86,0.0,0.0,0.0,2.59,0.0,0.86,0.86,0.0,0.0,0.0,0.0,21.55,9.48,3.45,0.86,0.86,0.0,0.0,0.0,5.17,1.72,0.0,0.0,0.86,1,0.084063
2606,0,1.0,4,109,79.08,76.85,56.75,98.18,27.25,21.1,87.16,48.62,11.93,7.34,1.83,2.75,2.75,0.0,0.0,4.59,8.26,13.76,6.42,3.67,8.26,0.92,15.6,2.75,0.92,0.92,2.75,0.92,5.5,5.5,0.0,0.0,0.0,0.0,11.01,0.0,0.0,0.0,0.0,11.93,1.83,0.0,3.67,5.5,1.83,6.42,0.92,0.92,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.6,5.5,3.67,7.34,2.75,0.0,0.92,13.76,0.92,15.6,2.75,10.09,1.83,11.01,0.0,0.0,0.92,0.0,0.0,1.83,0.0,0.92,0.0,0.0,0.0,14.68,4.59,2.75,0.0,0.0,0.0,0.0,0.0,0.0,2.75,0.92,3.67,0.65,2,0.037415


### Testing various ML models

In [19]:
#Train and test data set 

y_train = clean_train['label']

y_test = clean_test['label']

### Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
log_model = LogisticRegression(max_iter=1_000)

In [21]:
y_pred = log_model.fit(X_train,y_train).predict(X_test)

In [22]:
log_model.score(X_test,y_test)

0.7398601398601399

In [27]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
grid_log_reg = {'penalty': ['none'], #['l1', 'l2', 'elasticnet', 'none'],
                #'C':[0, 0.0001, 0.01, 1, 10, 100, 1000],
                'solver':['newton-cg', 'newton-cg', 'lbfgs'] }  #, 'sag', 'saga']}

log_search = RandomizedSearchCV(log_model, grid_log_reg, n_jobs=-1, cv=10, verbose=0, scoring='accuracy' )
log_search.fit(X_train,y_train, sample_weight = clean_train['confidence']) 



RandomizedSearchCV(cv=10, estimator=LogisticRegression(max_iter=1000),
                   n_jobs=-1,
                   param_distributions={'penalty': ['none'],
                                        'solver': ['newton-cg', 'newton-cg',
                                                   'lbfgs']},
                   scoring='accuracy')

In [28]:
log_search.best_estimator_

LogisticRegression(max_iter=1000, penalty='none', solver='newton-cg')

In [25]:
log_search.best_score_

0.7451351216841686

In [26]:
log_search.score(X_test, y_test)

0.737062937062937

In [None]:
#Best model from the Grid Search
#LogisticRegression(C=1e-05, max_iter=1000, penalty='none', solver='newton-cg')
#0.7580419580419581

In [None]:
# from sklearn.feature_selection import SequentialFeatureSelector
# selector = SequentialFeatureSelector(LogisticRegression(max_iter=1_000), cv=10, n_jobs=-1, direction='backward',
#                                     n_features_to_select=0.6)
# selector.fit(X_train, y_train)
# selector.transform(X_train)
# len(selector.get_feature_names_out())
# X_train_selected = pd.DataFrame(columns = selector.get_feature_names_out(),
#                                 index = X_train.index,
#                                 data = selector.transform(X_train))
# X_test_selected = X_test[selector.get_feature_names_out()]

### SVC

In [None]:
from sklearn.svm import SVC

SVC_model = SVC()

gird_SVC = {'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
            'C':stats.uniform(0.01,10),
            'gamma':stats.loguniform(0.01,10)}

In [None]:
SVC_search = RandomizedSearchCV(SVC_model, gird_SVC, n_jobs=-1, cv=10, verbose=0, scoring='accuracy')
SVC_search.fit(X_train,y_train, sample_weight = clean_train['confidence']) 

In [None]:
SVC_search.best_estimator_

In [None]:
SVC_search.best_score_

In [None]:
SVC_search.score(X_test, y_test)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_model = KNeighborsClassifier()
grid_knn = {'n_neighbors':[2,3,4,5,6,7,8,9,10,12,15,20,25]}

In [None]:
knn_search =  RandomizedSearchCV(knn_model, grid_knn, cv=10, scoring='accuracy', verbose=1 )

In [None]:
knn_search.fit(X_train,y_train) 

In [None]:
knn_search.best_estimator_

In [None]:
knn_search.best_score_

In [None]:
knn_search.score(X_test, y_test)

### XGBoost

In [None]:
from xgboost import XGBClassifier
model_xgb = XGBClassifier(use_label_encoder=False) #eval_metric='error')

In [None]:
grid_xgboost = {'n_estimators':range(10,300,10) ,
                'max_depth':[2,4,6,8,10,12,14,16,18,20],
                'learning_rate': stats.loguniform(0.01,0.5),
                'booster':['gbtree', 'gblinear', 'dart']
               }

search_xgboost = RandomizedSearchCV(model_xgb, grid_xgboost, cv=5,
                                    scoring='accuracy', verbose=1,n_jobs=-1)



In [None]:
search_xgboost.fit(X_train,y_train)

In [None]:
search_xgboost.best_score_

In [None]:
search_xgboost.best_estimator_

In [None]:
search_xgboost.score(X_test, y_test)

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier  #,GradientBoostingClassifier
ada_model = AdaBoostClassifier() #searching of the best estimator

In [None]:
gird_ada = {'base_estimator':[LogisticRegression(C=1000, max_iter=1000, penalty='none', solver='newton-cg')],  # GradientBoostingClassifier()],
            'learning_rate': stats.loguniform(0.01,0.5)}

In [None]:
search_ada = RandomizedSearchCV(ada_model, gird_ada, cv=10, scoring='accuracy', verbose=0 )

In [None]:
search_ada.fit(X_train, y_train)

In [None]:
search_ada.best_score_

In [None]:
search_ada.best_estimator_

In [None]:
search_ada.score(X_test, y_test)

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier()


In [None]:
gird_gb = {'n_estimators':range(10,300,10),
           'learning_rate': stats.loguniform(0.01,0.5)}

In [None]:
search_gb = RandomizedSearchCV(gb_model, gird_gb, cv=10, scoring='accuracy', verbose=1, n_jobs=-1, refit=True )

In [None]:
search_gb.fit(X_train, y_train)

In [None]:
search_gb.best_score_

In [None]:
search_gb.best_estimator_

In [None]:
search_gb.score(X_test, y_test)

### Stacking Classifiers

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

ensamble = StackingClassifier( 
    estimators=[('log_reg',log_search.best_estimator_), 
                ('svc', SVC_search.best_estimator_), 
                ('gb', search_gb.best_estimator_)], n_jobs=-1)
    

In [None]:
from sklearn.model_selection import cross_val_score
ensamble_score = cross_val_score(ensamble, X_train, y_train, cv=10, scoring='accuracy', n_jobs=-1)

In [None]:
ensamble_score.mean()

In [None]:
ensamble.fit(X_train, y_train)

In [None]:
ensamble.score(X_test, y_test)

In [None]:
from sklearn.ensemble import BaggingClassifier

weak_learner = LogisticRegression() #log_search.best_estimator_
bagged_model = BaggingClassifier(weak_learner, n_estimators=30, verbose=0)

bagged_model.fit(X_train, y_train)
bagged_model.score(X_test, y_test)