In [64]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, stop_words
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import stop_words
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier,\
ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [65]:
# Reading in the dataset
df = pd.read_csv('./Datasets/df.csv')

In [66]:
# Checking first 5 rows
df.head()

Unnamed: 0,subreddit,title,selftext
0,0,A recession IS coming.,[removed]
1,0,Whatever happened to all those people who said...,[removed]
2,0,We are In for one hell of a fucking ride gentl...,
3,0,What’s with the market today? Option prices ha...,[removed]
4,0,"Bear me out, this is just the beginning",It’s amazing to see how so many people think t...


In [67]:
df.isnull().sum()

subreddit       0
title           0
selftext     4465
dtype: int64

In [69]:
df.loc[(df['subreddit'] == 0) & (df['selftext'] == '[removed]')]

Unnamed: 0,subreddit,title,selftext
0,0,A recession IS coming.,[removed]
1,0,Whatever happened to all those people who said...,[removed]
3,0,What’s with the market today? Option prices ha...,[removed]
11,0,Serious question,[removed]
13,0,Should I buy a bunch of calls on apple before ...,[removed]
...,...,...,...
10413,0,My Options spread strategy - Easiest money maker,[removed]
10414,0,RAD stock price has doubled in a week due to i...,[removed]
10426,0,Option Noobie...,[removed]
10430,0,Tesla could close at $420.69 for a historic Ch...,[removed]


In [4]:
# Filling NaN values with removed
df.fillna('[removed]', inplace = True)

In [5]:
# Train test splits for both title and selftext columns.
# Set y to equal the subreddit
X_title = df['title']
X_selftext = df['selftext']
y = df['subreddit']
X_train_t, X_test_t, y_train, y_test = train_test_split(X_title, y, stratify = y, random_state = 42)
X_train_s, X_test_s, y_train, y_test = train_test_split(X_selftext, y, stratify = y, random_state = 42)

In [6]:
# Baseline
df['subreddit'].value_counts(normalize = True)

1    0.501219
0    0.498781
Name: subreddit, dtype: float64

# CountVectorizer Pipeline

In [7]:
# CountVectorize Pipeline with multiple parameters
pipe_C = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver = 'liblinear'))
])

pipe_params_C = {
    'cvec__max_features': [100, 500],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [None, 'english']
}

gs_C = GridSearchCV(pipe_C, 
                  pipe_params_C, 
                  cv = 5) 

In [8]:
# CountVectorizer - title
gs_C.fit(X_train_t, y_train);

In [9]:
# Count Vectorize - title - train score
gs_C.score(X_train_t, y_train)

0.8086407952590327

In [10]:
# Count Vectorize - title - test score
gs_C.score(X_test_t, y_test)

0.7891819571865444

In [11]:
# CountVectorizer - selftext
gs_C.fit(X_train_s, y_train);

In [12]:
# Count Vectorize - selftext - train score
gs_C.score(X_train_t, y_train)

0.6625246925380743

In [13]:
# Count Vectorize - selftext - tests score
gs_C.score(X_test_t, y_test)

0.654243119266055

# TfidfVectorizer Pipeline

In [14]:
# TfidfVecotize Pipeline with multiple parameters 
pipe_T = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression(solver = 'liblinear'))
])

pipe_params_T = {
    'tvec__max_features': [100, 500],
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [None, 'english']
}

gs_T = GridSearchCV(pipe_T, 
                  pipe_params_T, 
                  cv = 5) 

In [15]:
# TfidfVectorizer - title
gs_T.fit(X_train_t, y_train);

In [16]:
# TfidfVectorizer - title - train score
gs_T.score(X_train_t, y_train)

0.8093417447269483

In [17]:
# TfidfVectorizer - title - test score
gs_T.score(X_test_t, y_test)

0.7954892966360856

In [18]:
# TfidfVectorizer - selftext
gs_T.fit(X_train_s, y_train);

In [19]:
# Best paramters for selftext TfidfVectorize
gs_T.best_params_

{'tvec__max_features': 500,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

In [20]:
# TfidfVectorizer - selftext - train score
gs_T.score(X_train_s, y_train)

0.8382081182692921

In [21]:
# TfidfVectorizer - selftext - test score
gs_T.score(X_test_s, y_test)

0.8157492354740061

### TfidfVectorize with the selftext column was the best model

# TfidfVectorizer(selftext) - Gaussian

In [22]:
tf = TfidfVectorizer()

In [23]:
# Fitting to Gaussian model after TfidfVectorizer fit and transform
tf.fit(X_train_s, y_train)
X_train_tf = tf.fit_transform(X_train_s).todense()
X_test_tf = tf.transform(X_test_s).todense()

gnb = GaussianNB()
gnb.fit(X_train_tf, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [24]:
# Train score
gnb.score(X_train_tf, y_train)

0.8416491429299687

In [25]:
# Test Score
gnb.score(X_test_tf, y_test)

0.7052752293577982

### Over fit

# Sentimental Analysis to help further seperate the two subreddits

In [26]:
!pip install vaderSentiment



In [27]:
# Import SentimentIntensityAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [28]:
sia = SentimentIntensityAnalyzer()

In [29]:
sia.polarity_scores('[removed]')

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [30]:
# Sentiment Analysis for selftext because that was the best model
storage_sf = []
for text in df['selftext']:
    score = sia.polarity_scores(text)
    score['selftext'] = text
    storage_sf.append(score)
df_st_score = pd.DataFrame(storage_sf)
df_st_score['subreddit'] = df['subreddit']


In [31]:
# Compound stands out as a option to distingush the two subreddits
df_st_score.groupby('subreddit').mean()

Unnamed: 0_level_0,neg,neu,pos,compound
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.034009,0.920858,0.045134,0.053797
1,0.034462,0.871329,0.094018,0.340282


In [32]:
# Exporting to csv
df_st_score.to_csv('./Datasets/selftext_scores.csv', index = False)

# TfidfVectorizer - LogReg with compound sentiment

In [33]:
# Finding the top words used to understand how removing these word impact the model
custom_stop = list(ENGLISH_STOP_WORDS)

cv = CountVectorizer(stop_words=custom_stop, min_df=6)

df_features = pd.DataFrame(cv.fit_transform(df['selftext']).todense(),
                           columns = cv.get_feature_names())
df_features.head()

Unnamed: 0,00,000,001,01,015,02,03,04,05,06,...,zacks,zero,zerohedge,zesty,zoltan,zombie,zone,zones,zoom,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
words = df_features.sum().sort_values(ascending = False).head(10)

In [35]:
wsb = df_features.loc[df['subreddit'] == 0].copy()
inv = df_features.loc[df['subreddit'] == 1].copy()

In [36]:
wsb.sum().sort_values(ascending = False).head(10)

removed     5861
amp         2246
https       1860
com         1794
market      1374
like        1338
just        1312
stock       1308
money       1206
earnings    1190
dtype: int64

In [37]:
inv.sum().sort_values(ascending = False).head(10)

https     4221
com       3775
stock     2982
market    2905
money     2890
like      2841
just      2709
www       2568
amp       2537
year      2379
dtype: int64

In [38]:
# Train test split on the selftext and compound
X = df_st_score[['compound', 'selftext']]
y = df_st_score['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [39]:
tf = TfidfVectorizer(stop_words = 'english')
X_train = tf.fit_transform(X_train['selftext']).todense()
X_test = tf.transform(X_test['selftext']).todense()

In [40]:
# Applying the Logistic Regression
lr = LogisticRegression()

params = ({
    'penalty': ['l1', 'l2']
})

gs = GridSearchCV(lr, param_grid = params, cv = 5)
gs.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None, param_grid={'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [41]:
gs.score(X_train, y_train)

0.8770152297202575

In [42]:
gs.score(X_test, y_test)

0.8333333333333334

In [43]:
gs.best_params_

{'penalty': 'l2'}

In [44]:
# Checking the Senitment scores for the text
storage_ti = []
for text in df['title']:
    score = sia.polarity_scores(text)
    score['selftext'] = text
    storage_ti.append(score)
df_ti_score = pd.DataFrame(storage_ti)
df_ti_score['subreddit'] = df['subreddit']

In [45]:
# Nothing really stands out
df_ti_score.groupby('subreddit').mean()

Unnamed: 0_level_0,neg,neu,pos,compound
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.074816,0.830335,0.094177,0.020939
1,0.045295,0.854059,0.100644,0.085511


In [46]:
df_ti_score.to_csv('./Datasets/title_scores.csv', index = False)

### Stop Words impact on the model

`english, [removed]`

Accuracy
- train: 0.87465
- test: 0.83295

`english, [removed]`

Accuracy
- train: 0.87497
- test: 0.8339

`english, removed, https, com, amp, stock, market, like, money, just,` www

Accuracy
- train: 0.86924
- test: 0.82511

`english, removed, https, com, amp, stock, market, like, money, just,` earnings

Accuracy
- train: 0.87096
- test: 0.82607

`english, https, com, amp, stock, market, like, money, just,` www, year

Accuracy
- train: 0.87472
- test: 0.8335              
           
                         
                          
          

In [47]:
# Creating a title and selftext column
df['ti_st'] = df[['title', 'selftext']].apply(lambda x: ''.join(x), axis = 1)

In [48]:
# Setting as the X value and applying train test split
X = df['ti_st']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [49]:
# Using the pipleline that gave me the best model
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression(solver = 'liblinear'))
])

pipe_params = {
    'tvec__max_features': [100, 250, 500, 750],
    'tvec__ngram_range': [(1,1), (1,2), (2,2), (1,3), (2,3)],
    'tvec__stop_words': [None, 'english']
}
gs = GridSearchCV(pipe, 
                  pipe_params, 
                  cv = 5) 

gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [50]:
gs.score(X_train, y_train)

0.8851717326196393

In [51]:
gs.score(X_test, y_test)

0.8572247706422018

In [52]:
gs.best_params_

{'tvec__max_features': 750,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': None}

In [53]:
# Checking the senitment analysis for the title and selftext column
storage_sf = []
for text in df['ti_st']:
    score = sia.polarity_scores(text)
    score['ti_st'] = text
    storage_sf.append(score)
df_final = pd.DataFrame(storage_sf)
df_final['subreddit'] = df['subreddit']

In [54]:
# Compound has similar seperation as the selftext analysis
df_final.groupby('subreddit').mean()

Unnamed: 0_level_0,neg,neu,pos,compound
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.068063,0.842542,0.089395,0.068541
1,0.042477,0.8444,0.113118,0.378547


In [55]:
# Exporting as a csv
df_final.to_csv('./Datasets/title_and_selftext.csv')

In [56]:
X = df_final[['compound', 'ti_st']]
y = df_final['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [57]:
tf = TfidfVectorizer(stop_words = 'english')

In [58]:
X_train_tf = tf.fit_transform(X_train['ti_st']).todense()
X_test_tf = tf.transform(X_test['ti_st']).todense()

In [59]:
lr = LogisticRegression(solver = 'liblinear')

params = ({
    'penalty': ['l1', 'l2']
})

gs = GridSearchCV(lr, param_grid = params, cv = 5)
gs.fit(X_train_tf, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None, param_grid={'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [60]:
gs.score(X_train_tf, y_train)

0.9240425667495061

In [61]:
gs.score(X_test_tf, y_test)

0.8740443425076453

In [62]:
gs.best_params_

{'penalty': 'l2'}

### This was the best model and I took it further in the production model