In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

from nltk.corpus import stopwords 
import regex as re
from bs4 import BeautifulSoup  

%matplotlib inline

In [154]:
df_reddit = pd.read_csv('Datasets/reddit_cleaned_title_and_selftext.csv')

In [155]:
df_reddit.head()

Unnamed: 0,author,id,num_comments,score,created_utc,selftext,title,subreddit,char_count_title,word_count_title,char_count_selftext,word_count_selftext,title + selftext,clean_title,clean_selftext,clean_title_+_selftext,neg,neu,pos,compound
0,nothanksbud5,g5rffm,1,2,1587516214,Wow i didn’t realize how much music is about b...,Why is almost all music seem to be about love?,0,46,10,219,36,Why is almost all music seem to be about love?...,almost music seem love,wow realize much music love romance seems like...,almost music seem love wow realize much music ...,0.08,0.301,0.619,0.9777
1,dontknowwhattdo,g5r7z2,3,2,1587515419,I thought that during this time it would be ni...,pieces of advice that have stuck with you?,0,42,8,285,55,pieces of advice that have stuck with you? I t...,pieces advice stuck,thought time would nice hear words encourageme...,pieces advice stuck thought time would nice he...,0.091,0.383,0.526,0.9657
2,sharkfinnsouphk,g5r5q2,2,0,1587515173,I just can't shake this worry about kids (and ...,Worried about people stuck at home,0,34,6,269,50,Worried about people stuck at home I just can'...,worried people stuck home,shake worry kids adults stuck home lock sexual...,worried people stuck home shake worry kids adu...,0.467,0.456,0.077,-0.8924
3,dehlen1me,g5r3t3,0,1,1587514972,https://youtu.be/9_AWrNmcMZA\nThis is one of t...,How a 5 Dollar bill can help you to feel bette...,0,62,13,179,24,How a 5 Dollar bill can help you to feel bette...,dollar bill help feel better,https youtu awrnmcmza one amazing uplifting vi...,dollar bill help feel better https youtu awrnm...,0.0,0.63,0.37,0.8555
4,fighterpilot909,g5qtjo,2,2,1587513886,Imagine how insane that book would be. To make...,I want an autobiography from John McAfee so badly,0,49,9,144,28,I want an autobiography from John McAfee so ba...,want autobiography john mcafee badly,imagine insane book would make even better cou...,want autobiography john mcafee badly imagine i...,0.215,0.63,0.156,-0.3818


In [156]:
df_reddit.shape

(21825, 20)

In [157]:
df_reddit.isnull().sum()

author                     0
id                         0
num_comments               0
score                      0
created_utc                0
selftext                   0
title                      0
subreddit                  0
char_count_title           0
word_count_title           0
char_count_selftext        0
word_count_selftext        0
title + selftext           0
clean_title               73
clean_selftext             7
clean_title_+_selftext     1
neg                        0
neu                        0
pos                        0
compound                   0
dtype: int64

In [158]:
df_reddit['subreddit'].value_counts(normalize=True)

0    0.578923
1    0.421077
Name: subreddit, dtype: float64

In [159]:
df_reddit.dropna(inplace=True)

In [160]:
df_reddit['subreddit'].value_counts(normalize=True)

0    0.579003
1    0.420997
Name: subreddit, dtype: float64

## Testing on Titles

In [22]:
X = df_reddit['clean_title']
y = df_reddit['subreddit']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=42)

In [24]:
y_test.value_counts(normalize=True)

0    0.578996
1    0.421004
Name: subreddit, dtype: float64

In [25]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

In [32]:
pipe_params = {
    'cvec__max_features': [2000, 3000, 4000, 5000],
    'cvec__min_df': [1, 2],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2), (2,2)]
}

In [33]:
gs = GridSearchCV(pipe,
                  pipe_params,
                  cv=5)

In [34]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [35]:
gs.best_score_

0.723219923849684

In [36]:
gs.score(X_test, y_test)

0.7342284347986022

## Testing on Clean Selftext

In [37]:
X = df_reddit['clean_selftext']
y = df_reddit['subreddit']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=42)

In [43]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver='liblinear'))
])

In [44]:
pipe_params = {
    'cvec__max_features': [2000, 3000, 4000, 5000],
    'cvec__min_df': [1, 2],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2), (2,2)]
}

In [45]:
gs = GridSearchCV(pipe,
                  pipe_params,
                  cv=5)

In [46]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [47]:
gs.best_score_

0.7771783320369616

In [48]:
gs.score(X_test, y_test)

0.7855434982527129

In [49]:
gs.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 2000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2)}

## Testing on Clean Title + Selftext

In [50]:
X = df_reddit['clean_title_+_selftext']
y = df_reddit['subreddit']

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=42)

In [52]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver='liblinear'))
])

In [53]:
# Going to use the best params from selftext
pipe_params = {
    'cvec__max_features': [2000],
    'cvec__min_df': [2],
    'cvec__max_df': [.9],
    'cvec__ngram_range': [(1,2)]
}

In [54]:
gs = GridSearchCV(pipe,
                  pipe_params,
                  cv=5)

In [55]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [56]:
gs.best_score_

0.7850882294158469

In [60]:
gs.score(X_train, y_train)

0.864124103255871

In [61]:
gs.score(X_test, y_test)

0.7980503954386611

In [62]:
lr_model = gs.estimator

In [63]:
lr_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                           

In [64]:
lr_model.score(X_train, y_train)

0.9850389355570544

In [65]:
lr_model.score(X_test, y_test)

0.7984182453558948

## Trying out Naive Bayes

In [66]:
X = df_reddit['clean_title_+_selftext']
y = df_reddit['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Instantiate CountVectorizer.
cv = CountVectorizer()

# Transform training and testing data based on the fit CountVectorizer.
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [70]:
# Instantiate Multinomial Naive Bayes model.
mnb = MultinomialNB()

# Fit model.
mnb.fit(X_train_cv, y_train)

# Evaluate predictions.
mnb.score(X_train_cv, y_train)


0.8684162119075357

In [71]:
mnb.score(X_test_cv, y_test)

0.8199374655140702

### Trying out the Clean Title + Selftext differently

In [143]:
X = df_reddit['clean_title_+_selftext']
y = df_reddit['subreddit']

In [144]:
# Create train_test_split.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.25,
                                                    stratify=y,
                                                    random_state = 42)

In [145]:
#Instantiating CountVectorizer 
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 2000,
                             ngram_range= (1,2),
                             min_df = 2) 

In [146]:
X_train_vectorizer = vectorizer.fit_transform(X_train)

X_test_vectorizer = vectorizer.transform(X_test)


In [147]:
# Instantiate logistic regression model.
lr = LogisticRegression(solver = 'liblinear')
# Fit model to training data.
lr.fit(X_train_vectorizer, y_train)

# Evaluate model on training data.
lr.score(X_train_vectorizer, y_train)

0.864124103255871

In [148]:
lr.score(X_test_vectorizer, y_test)

0.7980503954386611

In [120]:
X = df_reddit['title + selftext']
y = df_reddit['subreddit']

In [125]:
#Instantiating CountVectorizer 
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = 'english',
                             max_features = 2000,
                             ngram_range= (1,2)) 

In [126]:
X_train_vectorizer = vectorizer.fit_transform(X_train)

X_test_vectorizer = vectorizer.transform(X_test)


In [127]:
# Instantiate logistic regression model.
lr = LogisticRegression(solver = 'liblinear')
# Fit model to training data.
lr.fit(X_train_vectorizer, y_train)

# Evaluate model on training data.
lr.score(X_train_vectorizer, y_train)

0.9321846833036973

In [128]:
lr.score(X_test_vectorizer, y_test)

0.7914290969284532

## Trying out a model with word count and sentiment analysis included

In [161]:
df_reddit.columns

Index(['author', 'id', 'num_comments', 'score', 'created_utc', 'selftext',
       'title', 'subreddit', 'char_count_title', 'word_count_title',
       'char_count_selftext', 'word_count_selftext', 'title + selftext',
       'clean_title', 'clean_selftext', 'clean_title_+_selftext', 'neg', 'neu',
       'pos', 'compound'],
      dtype='object')

In [163]:
X = df_reddit[['word_count_title', 'word_count_selftext', 'clean_title_+_selftext', 'neg', 'neu',
       'pos', 'compound']]
y = df_reddit['subreddit']

In [165]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.25, 
                                                    random_state = 42, 
                                                    stratify = y)

In [166]:
X_train.shape, X_test.shape

((16309, 7), (5437, 7))

In [167]:
X_train.head()

Unnamed: 0,word_count_title,word_count_selftext,clean_title_+_selftext,neg,neu,pos,compound
19518,17,233,person excited spend extra time spouse possibl...,0.028,0.592,0.38,0.9941
21027,9,103,yeah due corona work hi guys live sound engine...,0.198,0.658,0.144,-0.4567
8544,13,542,people underestimate effect culture idea men w...,0.242,0.623,0.135,-0.9854
426,19,195,secretly hoping corona virus takes massive tol...,0.227,0.636,0.136,-0.9153
17529,16,90,despite media says things revert back pre covi...,0.219,0.593,0.188,-0.3191


In [168]:
# instantiate countvectorize
cvec = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 2000,
                             ngram_range= (1,2),
                             min_df = 2) 

# transform X_train and X_test
X_train_cv = cvec.fit_transform(X_train['clean_title_+_selftext'])
X_test_cv = cvec.transform(X_test['clean_title_+_selftext'])

In [169]:
# you need to use .todense() to transform your sparse matrix to a matrix sklearn can use in a model
X_train_df = pd.DataFrame(X_train_cv.todense(), columns= cvec.get_feature_names(), index=X_train.index)

# create X_testing df for model
X_test_df = pd.DataFrame(X_test_cv.todense(), columns= cvec.get_feature_names(), index=X_test.index)

In [170]:
X_train_df.head()

Unnamed: 0,ability,able,able get,absolute,absolutely,abuse,abused,abusive,accept,accepted,...,years old,yes,yesterday,yet,yo,young,younger,youtube,youtube com,zero
19518,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21027,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [171]:
X_train_df = pd.merge(left = X_train_df,
                     right = X_train[['neg', 
                                      'neu', 
                                      'pos', 
                                      'compound',
                                      'word_count_title', 
                                      'word_count_selftext']],
                     left_index = True,
                     right_index = True)

X_train_df.head()

Unnamed: 0,ability,able,able get,absolute,absolutely,abuse,abused,abusive,accept,accepted,...,younger,youtube,youtube com,zero,neg,neu,pos,compound,word_count_title,word_count_selftext
19518,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0.028,0.592,0.38,0.9941,17,233
21027,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.198,0.658,0.144,-0.4567,9,103
8544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.242,0.623,0.135,-0.9854,13,542
426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.227,0.636,0.136,-0.9153,19,195
17529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.219,0.593,0.188,-0.3191,16,90


In [172]:
X_test_df = pd.merge(left = X_test_df,
                    right = X_test[['neg', 
                                      'neu', 
                                      'pos', 
                                      'compound',
                                      'word_count_title', 
                                      'word_count_selftext']],
                    left_index = True,
                    right_index = True)

In [173]:
X_train_df.shape, y_train.shape

((16309, 2006), (16309,))

In [174]:
X_test_df.shape, y_test.shape

((5437, 2006), (5437,))

In [183]:
# instantiate LogisticRegression
lr = LogisticRegression(solver='liblinear', penalty='l1')

# fit to the training data
lr.fit(X_train_df, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [184]:
# check score for train and test
print('train:', lr.score(X_train_df, y_train))
print('test:', lr.score(X_test_df, y_test))

train: 0.8672512109877981
test: 0.8057752437005702
