# Models for Smash and Warhammer

In [187]:
#imports for modeling
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction import text

In [110]:
#silence future warnings becuase they're annoying
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [79]:
#the two subreddits the model learns to tell apart
#(each name is later joined with '.csv' to locate that subreddit's data file)
sub1, sub2 = 'smashbros', 'warhammer40k'

In [112]:
#get data: one CSV per subreddit, file name is the subreddit name plus '.csv'
data1 = pd.read_csv(sub1+'.csv')
data2 = pd.read_csv(sub2+'.csv')

#creating column indicating which sub a post is from (1 = sub1, 0 = sub2)
data1['subreddit'] = 1
data2['subreddit'] = 0

#stack both subs into a single frame
#ignore_index=True gives every row its own label; without it each CSV's row
#labels are kept as-is, so the combined frame (and X / y below) would contain
#duplicate index labels and label-based lookups would return several rows
df = pd.concat([data1, data2], ignore_index=True)

#defining features and target variable
X = df['data']  #presumably the raw post text -- confirm against the CSV schema
y = df['subreddit']

#train test split to be used for all models
#stratify=y keeps the class proportions equal in both parts; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [188]:
#scikit-learn's built-in English stop words extended with a few extra tokens to drop
custom = text.ENGLISH_STOP_WORDS.union([
    'amp', 'new', 'like',
    'got', 'know', 'just',
    've', 'don', 'think',
])

# Models
1. [Logistic Regression](#model1) Best Accuracy: 94.78% (TF-IDF)
2. [KNN](#model2) Best Accuracy: 71.49% (TF-IDF)
3. [Multinomial Naive Bayes](#model3) Best Accuracy: 95.38% (TF-IDF)

### Attempt 1: Logistic Regression
<a id='model1'></a>

In [133]:
#two-step pipeline: turn the text into token features, then fit a logistic regression
#the step names ('vec', 'lr') are the prefixes used to address parameters in a grid search
pipe = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('lr', LogisticRegression()),
])

In [142]:
#hyperparameter search space for the logistic regression pipeline
pipe_params = {
    #swap the whole 'vec' step between raw counts and TF-IDF weighting
    'vec': [
        CountVectorizer(stop_words='english'),
        TfidfVectorizer(stop_words='english'),
    ],
    'vec__max_features': [3300, 3400],  #tried [3000, 3100, 3200,]
    'vec__min_df': [2, 3],
    'vec__max_df': [0.9],
    'vec__ngram_range': [(1, 3)],
    'lr__C': [6, 7],  #tried [1,5]
}

In [143]:
#3-fold cross-validated search over pipe_params; fit() returns the fitted search object
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3).fit(X_train, y_train)

#best cross-validation score, then the score of the best model on the train and test splits
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))

#last expression of the cell: display the winning parameter combination
gs.best_params_

cvs: 0.9229738780977896
train score: 0.9926322839919625
test score: 0.9477911646586346


{'lr__C': 6,
 'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.9, max_features=3300,
                 min_df=2, ngram_range=(1, 3), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words='english', strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 'vec__max_df': 0.9,
 'vec__max_features': 3300,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 3)}

Best score on the testing set was 94.78% accuracy, using TF-IDF. It is worth noting that an n-gram range of (1,3) yielded a worse cross-validation score but a better test-set accuracy.

### Attempt 2: k-Nearest Neighbors
<a id='model2'></a>

In [150]:
#two-step pipeline: turn the text into token features, then classify with k-nearest neighbors
#the step names ('vec', 'knn') are the prefixes used to address parameters in a grid search
pipe = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('knn', KNeighborsClassifier()),
])

In [173]:
#hyperparameter search space for the k-nearest neighbors pipeline
pipe_params = {
    #swap the whole 'vec' step between raw counts and TF-IDF weighting
    'vec': [
        CountVectorizer(stop_words='english'),
        TfidfVectorizer(stop_words='english'),
    ],
    'vec__max_features': [3500, 3600],
    'vec__min_df': [2],
    'vec__max_df': [0.9],
    'vec__ngram_range': [(1, 1)],
    'knn__n_neighbors': [35],
}

In [174]:
#3-fold cross-validated search over pipe_params; fit() returns the fitted search object
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3).fit(X_train, y_train)

#best cross-validation score, then the score of the best model on the train and test splits
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))

#last expression of the cell: display the winning parameter combination
gs.best_params_

cvs: 0.87073007367716
train score: 0.7273945077026122
test score: 0.714859437751004


{'knn__n_neighbors': 35,
 'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.9, max_features=3500,
                 min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words='english', strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 'vec__max_df': 0.9,
 'vec__max_features': 3500,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 1)}

Managed to get training and test scores around 71%; however, any slight deviation in the hyperparameters drops accuracy to the 50s. This model is worse than logistic regression in basically every way possible.

### Attempt 3: Multinomial Naive Bayes
<a id='model3'></a>

Using Gaussian Naive Bayes is too computationally expensive, as it requires a dense matrix rather than a sparse one. Attempting Multinomial Naive Bayes instead to see what happens; since the matrix is sparse anyway, it violates fewer assumptions.

In [175]:
#two-step pipeline: turn the text into token features, then fit a multinomial naive Bayes model
#the step names ('vec', 'nb') are the prefixes used to address parameters in a grid search
pipe = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('nb', MultinomialNB()),
])

In [192]:
#hyperparameter search space for the multinomial naive Bayes pipeline
#NOTE(review): the recorded grid-search output for this model shows a stop-word
#set that contains 'amp' (i.e. the `custom` set), while this grid uses the
#built-in 'english' list -- re-run and confirm which one is intended
pipe_params = {
    #swap the whole 'vec' step between raw counts and TF-IDF weighting
    'vec': [
        CountVectorizer(stop_words='english'),
        TfidfVectorizer(stop_words='english'),
    ],
    'vec__max_features': [3300, 3400, 3500],
    'vec__min_df': [2, 3],
    'vec__max_df': [0.9],
    'vec__ngram_range': [(1, 1), (1, 2)],
}

In [190]:
#3-fold cross-validated search over pipe_params; fit() returns the fitted search object
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3).fit(X_train, y_train)

#best cross-validation score, then the score of the best model on the train and test splits
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))

#last expression of the cell: display the winning parameter combination
gs.best_params_

cvs: 0.9390488948425988
train score: 0.9886135298057602
test score: 0.9538152610441767


{'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.9, max_features=3400,
                 min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True,
                 stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                       'afterwards', 'again', 'against', 'all',
                                       'almost', 'alone', 'along', 'already',
                                       'also', 'although', 'always', 'am',
                                       'among', 'amongst', 'amoungst', 'amount',
                                       'amp', 'an', 'and', 'another', 'any',
                                       'anyhow', 'anyone', 'anything', 'anyway', ...}),
                 strip_accents=None, sublinear_tf=False,
                 token_pattern='(?u)\\b\\w\\w+\

Achieved an accuracy of 95.38% on the test set, which so far is the best model. The score on the training set is also very high, so the model is potentially a little overfit.