In [158]:
import numpy as np
import pandas as pd
import requests
import re
import math
import spacy
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.feature_extraction.text import (CountVectorizer, 
                                             TfidfVectorizer)
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import (cross_val_score, GridSearchCV, 
                                     train_test_split)

In [159]:
# read in data
ow_cod_df = pd.read_csv('data/ow_cod_df_clean.csv')

In [160]:
len(ow_cod_df)

50491

In [70]:
nlp = spacy.load('en_core_web_md')

In [65]:
ow_cod_df.loc[:, 'spacy_lemmatized'] = ''

In [66]:
# create new column, spacy_lemmatized, which is lemmatized reviews
# leaving out aux, punct, and pron since we want to keep these original values
for i in ow_cod_df.index:
    doc = nlp(ow_cod_df['title_selftext'].loc[i])
    lemmatized = ' '.join([token.lemma_ for token in doc if token.pos_ not in ['AUX', 'PUNCT', 'PRON']])
    ow_cod_df['spacy_lemmatized'].loc[i] = lemmatized

In [None]:
# save spacy lemmatized to csv
ow_cod_df.to_csv('data/ow_cod_df_clean_lemmatized.csv', index=False)

In [3]:
ow_cod_df = pd.read_csv('data/ow_cod_df_clean_lemmatized.csv')

I will test both logistic regression models and random forest for classification as they are both widely used for their performance and are industry standards

In [4]:
# for spacy lemmatization, drop nulls
ow_cod_df.dropna(subset='spacy_lemmatized', inplace=True)
len(ow_cod_df) # 50460, dropped 31 rows

50460

In [5]:
# define X and y
X = ow_cod_df['spacy_lemmatized']
y = ow_cod_df['subreddit_ow']

In [6]:
# split X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=901, stratify=y)

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((37845,), (12615,), (37845,), (12615,))

#### Logisitic Regression

In [8]:
# Create dictionary for storing results
results_df = pd.DataFrame(columns=['model', 'params', 'train_score', 'test_score'])

In [9]:
# Pipeline for logistic regression using TFIDF
# using saga solver to accomodate l2, and no penalty options (after several iterations l2 was clearly preferred over l1 for all logistic regression
# so we only consider l2)
logreg_pipe = Pipeline([
    ('tfidf_vect', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=10_000, random_state=901))
])

In [10]:
# testing with english stopwords and no stop words
# testing out l2, and no penalty (after several runs l2 was preferred over l1 for all logreg models)
# ideal C was above 1.0 so testing several values
params = {
    'tfidf_vect__stop_words': [None, 'english'],
    'lr__penalty': ['l2', 'none'],
    'lr__C': [2.5, 2.0, 1.5]   
}
# using gridsearch to look at all possibilities
gs_logreg_tfidf = GridSearchCV(logreg_pipe, param_grid=params, n_jobs=-1)
# fit to training data
gs_logreg_tfidf.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidf_vect', TfidfVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=10000,
                                                           random_state=901))]),
             n_jobs=-1,
             param_grid={'lr__C': [2.5, 2.0, 1.5],
                         'lr__penalty': ['l2', 'none'],
                         'tfidf_vect__stop_words': [None, 'english']})

In [11]:
gs_logreg_tfidf.best_params_
# {'lr__C': 2.5, 'lr__penalty': 'l2', 'tfidf_vect__stop_words': 'english'}

{'lr__C': 2.5, 'lr__penalty': 'l2', 'tfidf_vect__stop_words': 'english'}

In [12]:
gs_logreg_tfidf.best_estimator_.score(X_train, y_train), gs_logreg_tfidf.best_estimator_.score(X_test, y_test)
# (0.9078874355925486, 0.8580261593341261)

(0.9078874355925486, 0.8580261593341261)

In [13]:
# save results to dataframe
# 'model', 'params', 'train_score', 'test_score'])
gs_logreg_tfidf_dict = {'model': gs_logreg_tfidf, 
                      'params': gs_logreg_tfidf.best_params_, 
                      'train_score': gs_logreg_tfidf.best_estimator_.score(X_train, y_train), 
                      'test_score': gs_logreg_tfidf.best_estimator_.score(X_test, y_test)}
results_df = results_df.append(gs_logreg_tfidf_dict, ignore_index=True)

  results_df = results_df.append(gs_logreg_tfidf_dict, ignore_index=True)


In [14]:
# trying lemmatized, countvectorizer, logreg with same params as before
logreg_pipe_cv = Pipeline([
    ('cvx', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=10_000, random_state=901))
])

In [15]:
# testing with english stopwords and no stop words
# testing out l2, and no penalty
# ideal C was above 1.0 so testing several values
params = {
    'cvx__stop_words': [None, 'english'],
    'lr__penalty': ['l2', 'none'],
    'lr__C': [2.5, 2.0, 1.5]
    
}
# using gridsearch to look at all possibilities
gs_logreg_cv = GridSearchCV(logreg_pipe_cv, param_grid=params, n_jobs=-1)
# fit to training data
gs_logreg_cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cvx', CountVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=10000,
                                                           random_state=901))]),
             n_jobs=-1,
             param_grid={'cvx__stop_words': [None, 'english'],
                         'lr__C': [2.5, 2.0, 1.5],
                         'lr__penalty': ['l2', 'none']})

In [16]:
gs_logreg_cv.best_params_
# {'cvx__stop_words': None, 'lr__C': 1.5, 'lr__penalty': 'l2'}

{'cvx__stop_words': None, 'lr__C': 1.5, 'lr__penalty': 'l2'}

In [17]:
gs_logreg_cv.best_estimator_.score(X_train, y_train), gs_logreg_cv.best_estimator_.score(X_test, y_test)
# (0.9212577619236358, 0.8608799048751486)

(0.9212577619236358, 0.8608799048751486)

In [18]:
# save results to dataframe
# 'model', 'params', 'train_score', 'test_score'])
gs_logreg_cv_dict = {'model': gs_logreg_cv, 
                      'params': gs_logreg_cv.best_params_, 
                      'train_score': gs_logreg_cv.best_estimator_.score(X_train, y_train), 
                      'test_score': gs_logreg_cv.best_estimator_.score(X_test, y_test)}
results_df = results_df.append(gs_logreg_cv_dict, ignore_index=True)

  results_df = results_df.append(gs_logreg_cv_dict, ignore_index=True)


In [19]:
results_df.head(2)

Unnamed: 0,model,params,train_score,test_score
0,GridSearchCV(estimator=Pipeline(steps=[('tfidf...,"{'lr__C': 2.5, 'lr__penalty': 'l2', 'tfidf_vec...",0.907887,0.858026
1,"GridSearchCV(estimator=Pipeline(steps=[('cvx',...","{'cvx__stop_words': None, 'lr__C': 1.5, 'lr__p...",0.921258,0.86088


We take a look at the best model from above but on non-lemmatized X:

In [20]:
# define X and y
X = ow_cod_df['title_selftext']
y = ow_cod_df['subreddit_ow']

In [21]:
# split X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=901, stratify=y)

In [22]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((37845,), (12615,), (37845,), (12615,))

In [23]:
logreg_pipe_cv_nonlemmatized = Pipeline([
    ('cvx', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=10_000, random_state=901, C=1.5, penalty='l2'))
])

In [24]:
# fit best model (logreg, cv) without lemmatization
logreg_pipe_cv_nonlemmatized.fit(X_train, y_train)

Pipeline(steps=[('cvx', CountVectorizer()),
                ('lr',
                 LogisticRegression(C=1.5, max_iter=10000, random_state=901))])

In [25]:
logreg_pipe_cv_nonlemmatized.score(X_train, y_train), logreg_pipe_cv_nonlemmatized.score(X_test, y_test)
# (0.9336239926014005, 0.8608799048751486)

(0.9336239926014005, 0.8608799048751486)

Since the non-lemmatized version scored slightly worse on logistic regression for the testing set and it is similar anyway, we will use lemmatization for the remainder of the modeling

#### Random Forest

In [30]:
# define X and y
X = ow_cod_df['spacy_lemmatized']
y = ow_cod_df['subreddit_ow']

In [31]:
# split X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=901, stratify=y)

In [32]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((37845,), (12615,), (37845,), (12615,))

In [65]:
# fit random forest with tfidf using grid search
rf_tfidf_pipe = Pipeline([
    ('tfidf_vect', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [66]:
# several options of max_depth and other parameters were explored for the rf but the default performed the best
params = {
    'tfidf_vect__stop_words': [None, 'english']}
gs_rf_tfidf = GridSearchCV(rf_tfidf_pipe, param_grid=params, n_jobs=-1)
gs_rf_tfidf.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidf_vect', TfidfVectorizer()),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'tfidf_vect__stop_words': [None, 'english']})

In [67]:
gs_rf_tfidf.best_params_
# {'tfidf_vect__stop_words': 'english'}

{'tfidf_vect__stop_words': 'english'}

In [68]:
gs_rf_tfidf.best_estimator_.score(X_train, y_train), gs_rf_tfidf.best_estimator_.score(X_test, y_test)
# (0.9891399128022196, 0.8462148236226714)

(0.9891399128022196, 0.8462148236226714)

In [69]:
# save results to dataframe
# 'model', 'params', 'train_score', 'test_score'
gs_rf_tfidf_dict = {'model': gs_rf_tfidf, 
                      'params': gs_rf_tfidf.best_params_, 
                      'train_score': gs_rf_tfidf.best_estimator_.score(X_train, y_train), 
                      'test_score': gs_rf_tfidf.best_estimator_.score(X_test, y_test)}
results_df = results_df.append(gs_rf_tfidf_dict, ignore_index=True)

  results_df = results_df.append(gs_rf_tfidf_dict, ignore_index=True)


In [70]:
# fit random forest with cv using grid search
rf_tfidf_pipe = Pipeline([
    ('cvx', CountVectorizer()),
    ('rf', RandomForestClassifier())
])

In [71]:
# several options of max_depth and other parameters were explored for the rf but the default performed the best (on the test set)
params = {
    'cvx__stop_words': [None, 'english']
}
gs_rf_cv = GridSearchCV(rf_tfidf_pipe, param_grid=params, n_jobs=-1)
gs_rf_cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cvx', CountVectorizer()),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=-1, param_grid={'cvx__stop_words': [None, 'english']})

In [72]:
gs_rf_cv.best_params_
# {'cvx__stop_words': 'english'}

{'cvx__stop_words': 'english'}

In [73]:
gs_rf_cv.best_estimator_.score(X_train, y_train), gs_rf_cv.best_estimator_.score(X_test, y_test)
# (0.989272030651341, 0.8390804597701149)

(0.989272030651341, 0.8390804597701149)

In [74]:
# save results to dataframe
# 'model', 'params', 'train_score', 'test_score'
gs_rf_cv_dict = {'model': gs_rf_cv, 
                      'params': gs_rf_cv.best_params_, 
                      'train_score': gs_rf_cv.best_estimator_.score(X_train, y_train), 
                      'test_score': gs_rf_cv.best_estimator_.score(X_test, y_test)}
results_df = results_df.append(gs_rf_cv_dict, ignore_index=True)

  results_df = results_df.append(gs_rf_cv_dict, ignore_index=True)


In [75]:
results_df

Unnamed: 0,model,params,train_score,test_score
0,GridSearchCV(estimator=Pipeline(steps=[('tfidf...,"{'lr__C': 2.5, 'lr__penalty': 'l2', 'tfidf_vec...",0.907887,0.858026
1,"GridSearchCV(estimator=Pipeline(steps=[('cvx',...","{'cvx__stop_words': None, 'lr__C': 1.5, 'lr__p...",0.921258,0.86088
2,GridSearchCV(estimator=Pipeline(steps=[('tfidf...,"{'rf__max_depth': 25, 'tfidf_vect__stop_words'...",0.729581,0.721681
3,"GridSearchCV(estimator=Pipeline(steps=[('cvx',...","{'cvx__stop_words': 'english', 'rf__max_depth'...",0.732091,0.725961
4,GridSearchCV(estimator=Pipeline(steps=[('tfidf...,{'tfidf_vect__stop_words': 'english'},0.98914,0.846215
5,"GridSearchCV(estimator=Pipeline(steps=[('cvx',...",{'cvx__stop_words': 'english'},0.989272,0.83908


After looking at results, we see that the logisic regression with CountVectorizer performed the best. We continue to tune this model:

In [76]:
# trying lemmatized, countvectorizer, logreg with same params as before
logreg_pipe_cv = Pipeline([
    ('cvx', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=10_000, random_state=901))
])

In [79]:
# testing with english stopwords and no stop words
# testing out l2, and no penalty
# ideal C was above 1.0 so testing several values
params = {
    'cvx__stop_words': [None],
    'lr__penalty': ['l2'],
    'lr__C': [1.2, 1.3, 1.4, 1.5, 1.6, 1.7]
    
}
# using gridsearch to look at all possibilities
gs_logreg_cv = GridSearchCV(logreg_pipe_cv, param_grid=params, n_jobs=-1)
# fit to training data
gs_logreg_cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cvx', CountVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=10000,
                                                           random_state=901))]),
             n_jobs=-1,
             param_grid={'cvx__stop_words': [None],
                         'lr__C': [1.2, 1.3, 1.4, 1.5, 1.6, 1.7],
                         'lr__penalty': ['l2']})

In [80]:
gs_logreg_cv.best_params_
# {'cvx__stop_words': None, 'lr__C': 1.3, 'lr__penalty': 'l2'}

{'cvx__stop_words': None, 'lr__C': 1.3, 'lr__penalty': 'l2'}

In [81]:
gs_logreg_cv.best_estimator_.score(X_train, y_train), gs_logreg_cv.best_estimator_.score(X_test, y_test)
# (0.9177170035671819, 0.8613555291319858)

(0.9177170035671819, 0.8613555291319858)

In [82]:
# save results to dataframe
# 'model', 'params', 'train_score', 'test_score'
gs_logreg_cv_dict_2 = {'model': gs_logreg_cv, 
                      'params': gs_logreg_cv.best_params_, 
                      'train_score': gs_logreg_cv.best_estimator_.score(X_train, y_train), 
                      'test_score': gs_logreg_cv.best_estimator_.score(X_test, y_test)}
results_df = results_df.append(gs_logreg_cv_dict_2, ignore_index=True)

  results_df = results_df.append(gs_logreg_cv_dict_2, ignore_index=True)


In [83]:
results_df

Unnamed: 0,model,params,train_score,test_score
0,GridSearchCV(estimator=Pipeline(steps=[('tfidf...,"{'lr__C': 2.5, 'lr__penalty': 'l2', 'tfidf_vec...",0.907887,0.858026
1,"GridSearchCV(estimator=Pipeline(steps=[('cvx',...","{'cvx__stop_words': None, 'lr__C': 1.5, 'lr__p...",0.921258,0.86088
2,GridSearchCV(estimator=Pipeline(steps=[('tfidf...,"{'rf__max_depth': 25, 'tfidf_vect__stop_words'...",0.729581,0.721681
3,"GridSearchCV(estimator=Pipeline(steps=[('cvx',...","{'cvx__stop_words': 'english', 'rf__max_depth'...",0.732091,0.725961
4,GridSearchCV(estimator=Pipeline(steps=[('tfidf...,{'tfidf_vect__stop_words': 'english'},0.98914,0.846215
5,"GridSearchCV(estimator=Pipeline(steps=[('cvx',...",{'cvx__stop_words': 'english'},0.989272,0.83908
6,"GridSearchCV(estimator=Pipeline(steps=[('cvx',...","{'cvx__stop_words': None, 'lr__C': 1.3, 'lr__p...",0.917717,0.861356


#### Final Model

Our final model is a logistic regression with:
* no stop words
* penalty l2 (ridge)
* C = 1.3

The model performance was:
* training: ~ 0.918
* test: ~ 0.861

The baseline for our model is the most frequent occurence which is ~0.66, so our model performs better than the baseline

In [87]:
ow_cod_df['subreddit_ow'].value_counts(normalize=True).to_frame()

Unnamed: 0,subreddit_ow
0,0.65979
1,0.34021


#### Model Interpretation

First, we know that based on how the trianing score was much higher than our test score, the model we have is overfitted. However, a test score ofd 0.861 is good so we still keep this as our best model.
Next, we look at coefficients in our model for interpretation:

In [94]:
# defining just one case of model
logreg_pipe_cv = Pipeline([
    ('cvx', CountVectorizer(stop_words=None)),
    ('lr', LogisticRegression(max_iter=10_000, random_state=901, C=1.3, penalty='l2'))
])

In [95]:
logreg_pipe_cv.fit(X_train, y_train)

Pipeline(steps=[('cvx', CountVectorizer()),
                ('lr',
                 LogisticRegression(C=1.3, max_iter=10000, random_state=901))])

In [96]:
gs_logreg_cv.score(X_train, y_train), gs_logreg_cv.score(X_test, y_test)

(0.9177170035671819, 0.8613555291319858)

In [122]:
logreg_coefs = pd.DataFrame(zip(gs_logreg_cv.best_estimator_.named_steps['cvx'].get_feature_names_out(), np.transpose(gs_logreg_cv.best_estimator_.named_steps['lr'].coef_[0])), columns=['features', 'coef']) 

# source: https://stackoverflow.com/questions/57924484/finding-coefficients-for-logistic-regression-in-python

In [124]:
logreg_coefs['abs_coef'] = abs(logreg_coefs['coef'])

In [134]:
# create dataframe of most common words
cv_unigrams = CountVectorizer(max_features=10000, stop_words='english')
unigrams = cv_unigrams.fit_transform(ow_cod_df['spacy_lemmatized'])
unigrams = pd.DataFrame(unigrams.todense(), columns=cv_unigrams.get_feature_names_out())

unigrams = unigrams.sum().sort_values(ascending = False).to_frame()

In [143]:
unigrams = unigrams.join(logreg_coefs.set_index('features')).rename(columns={0: 'frequency'}).copy()

In [145]:
unigrams['importance'] = unigrams['frequency'] * unigrams['abs_coef']

In [156]:
unigrams.sort_values(by='importance', ascending=False).head(30)

Unnamed: 0,frequency,coef,abs_coef,importance
weapon,4198,-1.737435,1.737435,7293.750456
gun,3922,-1.758101,1.758101,6895.271494
tank,2404,2.570142,2.570142,6178.621558
play,17381,0.322415,0.322415,5603.887647
support,3165,1.681789,1.681789,5322.861228
rank,2827,1.800907,1.800907,5091.163829
skin,4416,1.071068,1.071068,4729.837153
comp,1315,3.584932,3.584932,4714.185602
miplayer,1625,-2.669387,2.669387,4337.753111
game,27527,-0.14748,0.14748,4059.692512


The table above gives the coefficient weighted by the frequency of a word as a heuristic for word importance in classification. Words with negative coefficients are words used to classify COD MW2 posts and positive ones for Overwatch 2.

As examples for interpretation:
* A one-word increase in occurance of "weapon" in a post means that the post being for Overwatch 2 subreddit is is ~0.18 times as likely
* A one-word increase in occurance of "tank" in a post means that the post being for Overwatch 2 subreddit is is 13 times as likely

(note: since we removed the occurence of "ult", the word "multiplayer" turned into "miplayer")

From the table, we can see that:

1. The following words are more commonly associated with Overwatch 2 subreddit posts:
* tank
* support
* rank
* comp/ competative
* role
* heal


2. The following words are more commonly associated with COD MW2 subreddit posts:
* multiplayer
* loadout
* activision
* gun
* weapon

For conclusions/remarks, please see readme