# Models for Smash and Warhammer

In [40]:
#imports for modeling
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction import text

In [2]:
#silence future warnings becuase they're annoying
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
#subreddits to use for model
sub1 = 'Warhammer40k'
sub2 = 'smashbros'
sub3 = 'paintball'

In [4]:
#get data
data1 = pd.read_csv(sub1+'.csv')
data2 = pd.read_csv(sub2+'.csv')
data3 = pd.read_csv(sub3+'.csv')

#creating column indicating which sub a post is from
data1['subreddit'] = 1
data2['subreddit'] = 0
data3['subreddit'] = 2
df = pd.concat([data1,data2,data3])

#defining features and target variable
X = df['data']
y = df['subreddit']

#train test split to be used for all models
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Models
1. [Logistic Regression](#model1) Best Accuracy: 90.02% (TF-IDF)
2. [KNN](#model2) Best Accuracy: 34.05% (TF-IDF)
3. [Multinomial Naive Bayes](#model3) Best Accuracy: 90.02% (TF-IDF)

### Hyperparamter Tuning
Most hyperparamters were found by experimentation in gridsearch using bisection search algorithm. This method assumes that effects of a given hyperparameter are linear, which may not always be the case, but does always converge to a local optimizer regardless  

reference: https://en.wikipedia.org/wiki/Bisection_method

In essence, 
- test between 2 values, that are arbitrary percieved bounds
- keep the one that gives a higher score and replace the other with the average of the 2 values
- test between previous best and average as new values
- repeat until convergence

To make the model more robust I tested 3 values at each iteration, due to uncertainity about effects of each parameter on the model: low, high, and mean

(Didn't go into the 50's or decimals, so stopped algorithm early to keep things even)

### Using TF-IDF with Multinomial

Justification for using TF-IDF with multinomial naive bayes:
“The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.” https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/![image.png](attachment:image.png)

Using Guassian Naive Bayes requires a full matrix as opposed to a sparse matrix and the conversion from the vectorizer output to a dataframe was too computationally expensive

In [61]:
#from Pedram
custom = text.ENGLISH_STOP_WORDS.union(['amp', 'new', 'like', 'got','know','just', 've','don', 'think'])

### Attempt 1: Logistic Regression
<a id='model1'></a>

In [6]:
pipe = Pipeline([
    ('vec', CountVectorizer()),
    ('lr', LogisticRegression())
])

In [7]:
pipe_params = {
    'vec': [CountVectorizer(), TfidfVectorizer()],
    'vec__stop_words': ['english',custom]
    'vec__max_features': [2500,3000, 3500], #tried [3000, 3100, 3200,]
    'vec__min_df': [2],
    'vec__max_df': [.85],
    'vec__ngram_range': [(1,1),(1,2)],
    'lr__C': [1,5], #tried [1,5]
}

In [8]:
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))
gs.best_params_

cvs: 0.8802502234137622
train score: 0.9705093833780161
test score: 0.8994638069705094


{'lr__C': 1,
 'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.85, max_features=3500,
                 min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True,
                 stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                       'afterwards', 'again', 'against', 'all',
                                       'almost', 'alone', 'along', 'already',
                                       'also', 'although', 'always', 'am',
                                       'among', 'amongst', 'amoungst', 'amount',
                                       'amp', 'an', 'and', 'another', 'any',
                                       'anyhow', 'anyone', 'anything', 'anyway', ...}),
                 strip_accents=None, sublinear_tf=False,
                 token_pattern='(

Best score on testing set was 90.03% accuracy, using custom stop words. Changing stop words cause less improvement with this model than naive bayes

### Attempt 2: k-Nearest Neighbors
<a id='model2'></a>

In [9]:
pipe = Pipeline([
    ('vec', CountVectorizer(stop_words='english')),
    ('knn', KNeighborsClassifier())
])

In [10]:
pipe_params = {
    'vec': [CountVectorizer(stop_words='english'), TfidfVectorizer(stop_words='english')],
    'vec__max_features': [3000, 3100],
    'vec__min_df': [2],
    'vec__max_df': [.9],
    'vec__ngram_range': [(1,1),(1,2)],
    'knn__n_neighbors': [35]
}

In [11]:
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))
gs.best_params_

cvs: 0.7479892761394102
train score: 0.34405719392314565
test score: 0.34048257372654156


{'knn__n_neighbors': 35,
 'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.9, max_features=3100,
                 min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words='english', strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 'vec__max_df': 0.9,
 'vec__max_features': 3100,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 1)}

Pretty terrible in comparison to the other models

### Attempt 3: Multinomial Naive Bayes
<a id='model3'></a>

Using Guassian is too computationally expensive as it requires a dense matrix not a sparse one. Attempting multinomial to see what happens as the matrix is sparse anyways it violates fewer assumptions

In [30]:
pipe = Pipeline([
    ('vec', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [72]:
pipe_params = {
    'vec': [CountVectorizer(), TfidfVectorizer()],
    'vec__stop_words': [custom, 'english'],
    'vec__max_features': [4200],
    'vec__min_df': [1, 2],
    'vec__max_df': [.9],
    'vec__ngram_range': [(1,1)],
    'nb__alpha': [1,3,5]
}

In [71]:
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))
gs.best_params_

cvs: 0.8927613941018767
train score: 0.9700625558534406
test score: 0.9075067024128687


{'nb__alpha': 1,
 'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.9, max_features=4200,
                 min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True,
                 stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                       'afterwards', 'again', 'against', 'all',
                                       'almost', 'alone', 'along', 'already',
                                       'also', 'although', 'always', 'am',
                                       'among', 'amongst', 'amoungst', 'amount',
                                       'amp', 'an', 'and', 'another', 'any',
                                       'anyhow', 'anyone', 'anything', 'anyway', ...}),
                 strip_accents=None, sublinear_tf=False,
                 token_pattern

Best score after adjusting stop words was 90.75%

### Bayesian Optimization
Attempting to use bayesian hyper paramter optimization to improve score further with less guessing and checking.  
https://en.wikipedia.org/wiki/Hyperparameter_optimization#Bayesian_optimization

Using this guide: https://www.districtdatalabs.com/parameter-tuning-with-hyperopt

Didn't exactly work

In [28]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [52]:
def test(params):
    pipe = Pipeline([
        ('vec', TfidfVectorizer(stop_words='english',max_df=.9, ngram_range=(1,1))),
        ('nb', MultinomialNB())
        ])
    return cross_val_score(pipe, X_train, y_train, cv=5).mean()

space = {
    'vec__max_features': hp.choice('vec_max_features',range(2000, 8000)),
    'vec__min_df': hp.choice('vec__min_df', range(1,4)),
    'nb_alpha': hp.choice('nb_alpha', range(1,80))
}

def f(params):
    acc = test(params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space, algo=tpe.suggest, max_evals=300, trials=trials)
print(best)

100%|██████████| 300/300 [06:04<00:00,  1.75s/it, best loss: -0.9039269734739535]
{'nb_alpha': 67, 'vec__min_df': 2, 'vec_max_features': 1918}


In [54]:
pipe = Pipeline([
    ('vec', CountVectorizer(stop_words=custom, max_df=.9, min_df=2, ngram_range=(1,1), max_features= 1918)),
    ('nb', MultinomialNB(alpha=67))
])
pipe.fit(X_train, y_train)
print('train score:', pipe.score(X_train, y_train))
print('test score:', pipe.score(X_test, y_test))

train score: 0.8480786416443253
test score: 0.8378016085790885


Several attempts didn't really seem help the model at all, so back to guess and check

# Interpreting the model
Looking at coefficients to see which words are the most indicative of a post belonging in a certain subreddit

### Using Naive Bayes

In [15]:
tf = TfidfVectorizer(stop_words=custom, max_features=4000, min_df=2, max_df=.9, ngram_range=(1,1))
tf.fit(X_train)
X_df = tf.transform(X_train)

In [16]:
nb = MultinomialNB()
nb.fit(X_df, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
smash = pd.DataFrame()
smash['features'] = tf.get_feature_names()
smash['prob'] = nb.feature_log_prob_[0]
smash.sort_values(by='prob',ascending=False, inplace=True)
smash.head(n=8)

Unnamed: 0,features,prob
3200,smash,-5.093695
1732,https,-5.620131
3705,ultimate,-5.70928
787,com,-5.798964
1487,game,-6.067114
587,bros,-6.159111
702,character,-6.220956
3622,tournament,-6.274434


In [18]:
w40k = pd.DataFrame()
w40k['features'] = tf.get_feature_names()
w40k['prob'] = nb.feature_log_prob_[1]
w40k.sort_values(by='prob',ascending=False, inplace=True)
w40k.head(n=8)

Unnamed: 0,features,prob
1385,finished,-5.744091
380,army,-5.782549
3886,welcome,-5.942247
3913,wip,-5.948089
698,chaos,-5.988201
865,contrast,-6.07229
2693,primaris,-6.077437
2492,paint,-6.113726


In [19]:
paint = pd.DataFrame()
paint['features'] = tf.get_feature_names()
paint['prob'] = nb.feature_log_prob_[2]
paint.sort_values(by='prob',ascending=False, inplace=True)
paint.head(n=8)

Unnamed: 0,features,prob
2493,paintball,-5.191856
1605,gun,-5.785647
3478,tank,-5.94643
2169,marker,-5.949233
2084,looking,-6.027609
1553,good,-6.267589
461,barrel,-6.280737
1157,dye,-6.321951


# Posts that were predicted wrong

In [20]:
tf = TfidfVectorizer(stop_words=custom, max_features=4000, min_df=2, max_df=.9, ngram_range=(1,1))
tf.fit(X_train)
X_df = tf.transform(X_train)

In [21]:
nb = MultinomialNB()
nb.fit(X_df, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
X_unk = tf.transform(X_test)

In [23]:
pred = nb.predict(X_unk)

In [24]:
wrong = pd.DataFrame(X_test[y_test != pred])

In [25]:
for i in wrong.index:
    print(wrong['data'][i])

Freeflow Caliber Minis- July 5th only: 
Help identifying this please: My best friend passed away from cancer in october and I took some vacation time to help clean up his room. He left this tank to me in his will and I’m not sure what game system it’s from. It’s in a hovering base and it’s definitely made of metal. Any help would be appreciated.https://imgur.com/gallery/N2wMIRq
Scout tank, ready to outflank!: 
I'm pretty sure this is the fastest comeback i've ever made: 
217    After a hiatus (school, work, laziness), here ...
217         Tetanus shots , tetanus shots for everyone: 
Name: data, dtype: object
First video and 3rd time playing! Ready to play more!: 
217    After a hiatus (school, work, laziness), here ...
217         Tetanus shots , tetanus shots for everyone: 
Name: data, dtype: object
PSI Shinespike: 
Worth it to put havoks in a rhino?: 
Wall off retirement: 
Installed dual Fans on my CMD: 
What's this piece of art called, and where can I find the full thing digitally?: