In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import stop_words
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier,\
                                ExtraTreesClassifier, AdaBoostClassifier,\
                                GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_auc_score                            

In [2]:
# Reading in the dataset of selftext & title
df = pd.read_csv('./Datasets/df.csv')

In [3]:
# Replace NaN with [removed]
df.fillna('[removed]', inplace = True)

# Merge title and selftext
df['title_selftext'] = df[['title', 'selftext']].apply(lambda x: ''.join(x), axis = 1)

In [None]:
# Use vadersentiment to determine the negative, neutral, positive, and compound for the title and selftext
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
storage_sf = []
for text in df['title_selftext']:
    score = sia.polarity_scores(text)
    score['title_selftext'] = text
    storage_sf.append(score)
df_final = pd.DataFrame(storage_sf)
df_final['subreddit'] = df['subreddit']

In [None]:
# Grouping by subreddits
df_final.groupby('subreddit').mean()

In [8]:
# Set the X & y and seperatate through train_test_split
X = df_final[['compound', 'title_selftext']]
y = df_final['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [9]:
# Using TfidfVectorize to fit and train the title and self text. Also using the stop_words = 'english'
# This was the best result from the EDA and Modeling
tf = TfidfVectorizer(stop_words = 'english')
X_train_tf = tf.fit_transform(X_train['title_selftext']).todense()
X_test_tf = tf.transform(X_test['title_selftext']).todense()

In [10]:
# Use Logistic Regression to create a model under both penalties and multiple C values
lr = LogisticRegression(solver = 'liblinear')

params = ({
    'C': [1, 2.5, 5, 7.5, 10],
    'penalty': ['l1', 'l2']
})

gs = GridSearchCV(lr, param_grid = params, cv = 5)
gs.fit(X_train_tf, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 2.5, 5, 7.5, 10], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [11]:
# Training Score
gs.score(X_train_tf, y_train)

0.9466003950806092

In [12]:
# Testing Score
gs.score(X_test_tf, y_test)

0.8792048929663608

In [13]:
# Best parameters 
gs.best_params_

{'C': 2.5, 'penalty': 'l2'}

In [14]:
# Finding the probability that the results will be 1
y_score_proba = [i[1] for i in gs.predict_proba(X_test_tf)]

In [15]:
# Making sure the values were correct
y_score_proba

[0.05771217992127215,
 0.9336195785459218,
 0.878626183530915,
 0.9722331671338049,
 0.9875401399480769,
 0.010548676024082432,
 0.05172570703757097,
 0.41590999705687276,
 0.9732931376203348,
 0.0006427716273510172,
 0.9965331144110487,
 0.9748645562151972,
 0.9967602990507223,
 0.9143311789619304,
 0.9761847917112629,
 0.31707998004802024,
 0.0987614838734503,
 0.7223112620223895,
 0.02956729127148435,
 0.38096627140681916,
 0.9439665536239364,
 0.7153636430939979,
 0.8460958489568108,
 0.8188856125291847,
 0.3602897160768578,
 0.8747830206743711,
 0.006210162145941955,
 0.5203090229229452,
 0.9382292468909836,
 0.06964354831397038,
 0.55666088892899,
 0.03275148703605472,
 0.5443177065951319,
 0.9249461463800188,
 0.031482878786882613,
 0.8756452593194161,
 0.43240198867295765,
 0.8758784376702919,
 0.7943440046568223,
 0.907182508749629,
 0.9422679981031746,
 0.9848080167223419,
 0.7193464321485823,
 0.3025912823618392,
 0.9770242243867036,
 0.8826726658989145,
 0.5859279073637252,

In [16]:
# Predicting y values
preds = gs.predict(X_test_tf)

# Creating a confusion matrix dataframe 
pd.DataFrame(confusion_matrix(y_test, preds))

In [18]:
# Finding the roc_auc_score
roc_auc_score(y_test, y_score_proba)

0.9452213220874943

In [None]:
# The model is overfit so work needs to be done. 
# After checking some of the selftexts and title that were misslabeled, the ones that were misslabeled could have been posted to both subreddits.


# Removing [removed]

In [3]:
df.fillna('[removed]', inplace = True)

df.drop(df.loc[df['selftext'] == '[removed]'].index, inplace = True)

In [4]:
df['subreddit'].value_counts(normalize=True)

1    0.66285
0    0.33715
Name: subreddit, dtype: float64

In [5]:
df['title_selftext'] = df[['title', 'selftext']].apply(lambda x: ''.join(x), axis = 1)

In [6]:
df.dropna()
df.reset_index(inplace = True)

In [7]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
storage_sf = []
for text in df['title_selftext']:
    score = sia.polarity_scores(text)
    score['title_selftext'] = text
    storage_sf.append(score)
df_final = pd.DataFrame(storage_sf)
df_final['subreddit'] = df['subreddit']

In [8]:
# Compound seperation is okay
# Negative became a much bigger player
df_final.groupby('subreddit').mean()

Unnamed: 0_level_0,neg,neu,pos,compound
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.079152,0.819031,0.101819,0.122919
1,0.044324,0.842458,0.113214,0.415796


In [9]:
X = df_final[['compound', 'neg', 'title_selftext']]
y = df_final['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [10]:
tf = TfidfVectorizer(stop_words = 'english')
X_train_tf = tf.fit_transform(X_train['title_selftext']).todense()
X_test_tf = tf.transform(X_test['title_selftext']).todense()

In [11]:
lr = LogisticRegression(solver = 'liblinear')

params = ({
    'C': [1, 2.5, 5, 7.5, 10],
    'penalty': ['l1', 'l2']
})

gs = GridSearchCV(lr, param_grid = params, cv = 5)
gs.fit(X_train_tf, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 2.5, 5, 7.5, 10], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [12]:
gs.score(X_train_tf, y_train)

0.9689793521871024

In [13]:
gs.score(X_test_tf, y_test)

0.885529791605518

In [14]:
gs.best_params_

{'C': 5, 'penalty': 'l2'}

In [15]:
# Predicting y values
preds = gs.predict(X_test_tf)

# Creating a confusion matrix dataframe 
pd.DataFrame(confusion_matrix(y_test, preds))

Unnamed: 0,0,1
0,883,266
1,124,2134


In [16]:
y_score_proba = [i[1] for i in gs.predict_proba(X_test_tf)]
roc_auc_score(y_test, y_score_proba)

0.9443552794782076