In [1]:
#imports

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score

In [2]:
#read in cleaned data
# The path is relative, so the notebook has to be run from a directory where
# ../Data/cleaned.csv resolves. The saved df.head()/df.shape outputs show two
# columns ('title', 'subreddit') and 90,000 rows.

df = pd.read_csv('../Data/cleaned.csv')

In [3]:
df.head()

Unnamed: 0,title,subreddit
0,women of reddit when you are in a relationship...,1
1,do you have a hot take what is it,1
2,why do you cry so much over little things,1
3,as an adult how do you make more female friends,1
4,which 3rd party reddit app do you use and what...,1


In [4]:
df.shape

(90000, 2)

In [5]:
#baseline score
# Share of each class in the target. The saved output shows a 0.5 / 0.5 split,
# so always predicting a single class would give 0.5 accuracy; models below
# are compared against that number.

df['subreddit'].value_counts(normalize=True)

0    0.5
1    0.5
Name: subreddit, dtype: float64

In [6]:
# Stop-word candidates offered to the vectorizers in the grid searches.

# Hand-picked tokens: gendered terms, 'reddit', and a few filler words.
stop_words = [
    'men', 'women', 'reddit', 'guy', 'guys', 'woman', 'man',
    'girl', 'girls', 'ladies', 'like', 'just', 've',
]

# NLTK's English stop-word list.
stop_english = stopwords.words('english')

# Combined list: custom tokens first, then the NLTK list.
stop = [*stop_words, *stop_english]

In [None]:
#try count vectorizer on tokenized titles. use gridsearch to find best params for count vectorizer

In [8]:
# Features are the raw title text; the target is the 'subreddit' label column.
X, y = df['title'], df['subreddit']

In [9]:
#train test split for model evaluation
# stratify=y keeps the class proportions equal in both partitions.
# random_state pins the shuffle so the split (and every score computed from it)
# is reproducible after a kernel restart. Using one seed for every experiment
# also means the models are evaluated on the same held-out rows.

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [10]:
# Two-step pipeline handed to the grid search: token counts, then Bernoulli naive Bayes.
# (Pattern taken from the 5.03 NLP notes.)

pipe = Pipeline(steps=[('cvec', CountVectorizer()), ('nb', BernoulliNB())])

In [11]:
# Search space for the 'cvec' step: vocabulary cap, stop-word handling
# (none, sklearn's built-in English list, or the custom `stop` list), and n-gram span.
pipe_params = dict(
    cvec__max_features=[1000, 3000, 5000, 7000],
    cvec__stop_words=[None, 'english', stop],
    cvec__ngram_range=[(1, 1), (1, 2), (2, 2)],
)

In [12]:
# Exhaustive 5-fold cross-validated search over pipe_params.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)

In [13]:
# Run the search on the training partition only. The grid has 4 x 3 x 3 = 36
# candidates and cv=5, so this is 180 pipeline fits before GridSearchCV's
# default refit of the best candidate (the model gs.score uses afterwards).
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', BernoulliNB())]),
             param_grid={'cvec__max_features': [1000, 3000, 5000, 7000],
                         'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cvec__stop_words': [None, 'english',
                                              ['men', 'women', 'reddit', 'guy',
                                               'guys', 'woman', 'man', 'girl',
                                               'girls', 'ladies', 'like',
                                               'just', 've', 'i', 'me', 'my',
                                               'myself', 'we', 'our', 'ours',
                                               'ourselves', 'you', "you're",
                                               "you've", "you'll", "you'd",
                                               'your', 'yours', 'yourself',
                       

In [14]:
# Accuracy of the refit best estimator on the training and held-out partitions.
print(
    f'training score:  {gs.score(X_train, y_train)}',
    f'testing score: {gs.score(X_test, y_test)}',
    sep='\n',
)

training score:  0.7428888888888889
testing score: 0.7175111111111111


In [15]:
gs.best_params_

{'cvec__max_features': 7000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [None]:
# This model is pretty decent. 
# The training and testing scores are similar, showing that the model performs similarly on seen and unseen data. 
# I was surprised that the best parameter for stop words was none. The ngram range makes sense for how people speak online.
# Next I will explore other NLP techniques or tweak the features to see if I can improve on this any further.

In [64]:
#add column in df to try lemmatizer

In [9]:
# Instantiate the WordNet lemmatizer. It is called below as lemmatize(word),
# i.e. without a part-of-speech argument.
# NOTE(review): the saved df.head() output shows "as an adult" becoming
# "a an adult", so some function words are altered by this step.
lemmatizer = WordNetLemmatizer()

In [10]:
# Build 'title_lem': split each title on whitespace, lemmatize every token,
# and join the results back together with single spaces.
df['title_lem'] = [
    ' '.join(lemmatizer.lemmatize(token) for token in title.split())
    for title in df['title']
]

In [None]:
#add column in df to try stem

In [11]:
# Porter stemmer used to build the 'title_stem' column.
p_stemmer = PorterStemmer()

In [12]:
# Build 'title_stem': split each title on whitespace, Porter-stem every token,
# and join the results back together with single spaces.
df['title_stem'] = [
    ' '.join(p_stemmer.stem(token) for token in title.split())
    for title in df['title']
]

In [13]:
df.head()

Unnamed: 0,title,subreddit,title_lem,title_stem
0,women of reddit when you are in a relationship...,1,woman of reddit when you are in a relationship...,women of reddit when you are in a relationship...
1,do you have a hot take what is it,1,do you have a hot take what is it,do you have a hot take what is it
2,why do you cry so much over little things,1,why do you cry so much over little thing,whi do you cri so much over littl thing
3,as an adult how do you make more female friends,1,a an adult how do you make more female friend,as an adult how do you make more femal friend
4,which 3rd party reddit app do you use and what...,1,which 3rd party reddit app do you use and what...,which 3rd parti reddit app do you use and what...


In [None]:
#try cvec with lemmatize

In [75]:
#set variables for modeling with lemmatizer

X = df['title_lem']

y = df['subreddit']

# Stratified split with a fixed seed. Without random_state every run draws a
# different partition, so scores cannot be reproduced or compared fairly with
# other experiments; one shared seed keeps the held-out rows identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [76]:
# Count-vectorizer + Bernoulli naive Bayes pipeline.
pipe = Pipeline(steps=[('cvec', CountVectorizer()), ('nb', BernoulliNB())])

# Search space for the 'cvec' step: vocabulary cap, stop-word handling, n-gram span.
pipe_params = dict(
    cvec__max_features=[3000, 5000, 7000, 10000],
    cvec__stop_words=[None, 'english', stop],
    cvec__ngram_range=[(1, 1), (1, 2), (2, 2)],
)

# Exhaustive 5-fold cross-validated search over pipe_params.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)

In [77]:
# Run the search on the training partition only: 4 x 3 x 3 = 36 candidates
# with cv=5 is 180 pipeline fits, followed by GridSearchCV's default refit of
# the best candidate.
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', BernoulliNB())]),
             param_grid={'cvec__max_features': [3000, 5000, 7000, 10000],
                         'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cvec__stop_words': [None, 'english',
                                              ['men', 'women', 'reddit', 'guy',
                                               'guys', 'woman', 'man', 'girl',
                                               'girls', 'ladies', 'like',
                                               'just', 've', 'i', 'me', 'my',
                                               'myself', 'we', 'our', 'ours',
                                               'ourselves', 'you', "you're",
                                               "you've", "you'll", "you'd",
                                               'your', 'yours', 'yourself',
                      

In [78]:
# Accuracy of the refit best estimator on the training and held-out partitions.
print(
    f'training score:  {gs.score(X_train, y_train)}',
    f'testing score: {gs.score(X_test, y_test)}',
    sep='\n',
)

training score:  0.7474814814814815
testing score: 0.7199111111111111


In [79]:
gs.best_params_

{'cvec__max_features': 10000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [None]:
# This model scores almost exactly the same as the one before, showing that lemmatizing does not make a big difference.

In [None]:
#try cvec with stemming. use gridsearch to find best parameters

In [80]:
#set variables for modeling with stemming

X = df['title_stem']

y = df['subreddit']

# Stratified split with a fixed seed. Without random_state every run draws a
# different partition, so scores cannot be reproduced or compared fairly with
# other experiments; one shared seed keeps the held-out rows identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [81]:
# Count-vectorizer + Bernoulli naive Bayes pipeline.
pipe = Pipeline(steps=[('cvec', CountVectorizer()), ('nb', BernoulliNB())])

# Search space for the 'cvec' step: vocabulary cap, stop-word handling, n-gram span.
pipe_params = dict(
    cvec__max_features=[5000, 7000, 10000],
    cvec__stop_words=[None, 'english', stop],
    cvec__ngram_range=[(1, 1), (1, 2), (2, 2)],
)

# Exhaustive 5-fold cross-validated search over pipe_params.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)

In [82]:
# Run the search on the training partition only: 3 x 3 x 3 = 27 candidates
# with cv=5 is 135 pipeline fits, followed by GridSearchCV's default refit of
# the best candidate.
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', BernoulliNB())]),
             param_grid={'cvec__max_features': [5000, 7000, 10000],
                         'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cvec__stop_words': [None, 'english',
                                              ['men', 'women', 'reddit', 'guy',
                                               'guys', 'woman', 'man', 'girl',
                                               'girls', 'ladies', 'like',
                                               'just', 've', 'i', 'me', 'my',
                                               'myself', 'we', 'our', 'ours',
                                               'ourselves', 'you', "you're",
                                               "you've", "you'll", "you'd",
                                               'your', 'yours', 'yourself',
                            

In [83]:
# Accuracy of the refit best estimator on the training and held-out partitions.
print(
    f'training score:  {gs.score(X_train, y_train)}',
    f'testing score: {gs.score(X_test, y_test)}',
    sep='\n',
)

training score:  0.7506814814814815
testing score: 0.7220888888888889


In [84]:
gs.best_params_

{'cvec__max_features': 10000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [None]:
# This model scores similarly to the two above, just ever so slightly higher. 
# Stemming does not seem to make a significant difference for the data in this model.

In [None]:
# All count vectorizer models with tokenized, lemmatized, and stemmed titles perform pretty similarly. 
# They all score similarly and are slightly overfit (around .74-.75 for training scores and around .72 for testing scores).
# The scores are better than the baseline.

In [7]:
#try tfidf with stemming. use gridsearch to find best parameters

In [21]:
#set variables for modeling with stemming

X = df['title_stem']

y = df['subreddit']

# Stratified split with a fixed seed. Without random_state every run draws a
# different partition, so scores cannot be reproduced or compared fairly with
# other experiments; one shared seed keeps the held-out rows identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [22]:
# TF-IDF vectorizer + multinomial naive Bayes pipeline.
pipe = Pipeline(steps=[('tvec', TfidfVectorizer()), ('nb', MultinomialNB())])

# Search space for the 'tvec' step: vocabulary cap, stop-word handling, n-gram span.
pipe_params = dict(
    tvec__max_features=[5000, 7000, 10000],
    tvec__stop_words=[None, 'english', stop],
    tvec__ngram_range=[(1, 1), (1, 2)],
)

# Exhaustive 5-fold cross-validated search over pipe_params.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)

In [23]:
# Run the search on the training partition only: 3 x 3 x 2 = 18 candidates
# with cv=5 is 90 pipeline fits, followed by GridSearchCV's default refit of
# the best candidate.
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'tvec__max_features': [5000, 7000, 10000],
                         'tvec__ngram_range': [(1, 1), (1, 2)],
                         'tvec__stop_words': [None, 'english',
                                              ['men', 'women', 'reddit', 'guy',
                                               'guys', 'woman', 'man', 'girl',
                                               'girls', 'ladies', 'like',
                                               'just', 've', 'i', 'me', 'my',
                                               'myself', 'we', 'our', 'ours',
                                               'ourselves', 'you', "you're",
                                               "you've", "you'll", "you'd",
                                               'your', 'yours', 'yourself',
                                  

In [24]:
# Accuracy of the refit best estimator on the training and held-out partitions.
print(
    f'training score:  {gs.score(X_train, y_train)}',
    f'testing score: {gs.score(X_test, y_test)}',
    sep='\n',
)

training score:  0.7577481481481482
testing score: 0.7177777777777777


In [25]:
gs.best_params_

{'tvec__max_features': 10000,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': None}

In [None]:
# TF-IDF with multinomial nb scores similarly to models above too. 
# This is still scoring better than the baseline.