# Models for Warhammer40k and Paintball

In [57]:
#imports for modeling
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction import text

In [3]:
#silence future warnings becuase they're annoying
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Names of the two subreddits the models are built on
sub1, sub2 = 'Warhammer40k', 'paintball'

In [5]:
#get data: one CSV per subreddit, named '<subreddit>.csv'
data1 = pd.read_csv(sub1+'.csv')
data2 = pd.read_csv(sub2+'.csv')

#creating column indicating which sub a post is from (1 = sub1, 0 = sub2)
data1['subreddit'] = 1
data2['subreddit'] = 0

#ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
#each file's own row labels are kept, so labels repeat and label-based lookups
#(e.g. df.loc[i]) can return more than one row
df = pd.concat([data1, data2], ignore_index=True)

#defining features and target variable
X = df['data']
y = df['subreddit']

#train test split to be used for all models
#stratify=y splits each class proportionally; random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Models
1. [Logistic Regression](#model1) Best Accuracy: 90.98% (TF-IDF)
2. [KNN](#model2) Best Accuracy: 89.38% (TF-IDF)
3. [Multinomial Naive Bayes](#model3) Best Accuracy: 93.39% (Count)

Most hyperparameters were found by experimenting in grid search with a binary-search-style procedure. This method assumes that the score has a single peak as a given hyperparameter varies, which may not always be the case, but it does always converge to at least a local optimum.

For reference:  
https://en.wikipedia.org/wiki/Binary_search_algorithm

In essence, 
- test between 2 values that are arbitrary perceived bounds
- keep the one that gives a higher score and replace the other with the average of the 2 values
- test between previous best and average as new values
- repeat until convergence

(Didn't go into the 50's or decimals, so stopped algorithm early to keep things even)

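Grid search was used, but it optimizes for cross-validation score; I optimized for test set accuracy instead. Note that because the hyperparameters were chosen using the test set, the reported test accuracies are likely optimistic estimates of performance on unseen data.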

### Attempt 1: Logistic Regression
<a id='model1'></a>

In [6]:
# Logistic regression pipeline: vectorize the text, then classify.
# The vectorizer set here is just the starting choice; the parameter grid
# also lists a TF-IDF alternative for the 'vec' step.
pipe = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('lr', LogisticRegression()),
])

In [16]:
# Search space for the logistic regression pipeline.
# Keys follow the '<step>__<parameter>' convention; 'vec' swaps the whole step.
pipe_params = dict(
    vec=[CountVectorizer(stop_words='english'), TfidfVectorizer(stop_words='english')],
    vec__max_features=[2700, 2800],  # tried [3000, 3100, 3200,]
    vec__min_df=[2],
    vec__max_df=[0.9, 0.85],
    vec__ngram_range=[(1, 3), (1, 2)],
    lr__C=[1, 2, 3],  # tried [1,5]
)

In [17]:
# 3-fold cross-validated grid search over pipe_params, fitted on the training split
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3).fit(X_train, y_train)

# best cross-validation score, then the score on the train and test splits
for label, value in (
    ('cvs:', gs.best_score_),
    ('train score:', gs.score(X_train, y_train)),
    ('test score:', gs.score(X_test, y_test)),
):
    print(label, value)

# winning parameter combination (displayed as the cell output)
gs.best_params_

cvs: 0.891566265060241
train score: 0.9825970548862115
test score: 0.9098196392785571


{'lr__C': 2,
 'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.9, max_features=2700,
                 min_df=2, ngram_range=(1, 3), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words='english', strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 'vec__max_df': 0.9,
 'vec__max_features': 2700,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 3)}

Best score on testing set was 90.98% accuracy, using TF-IDF, which is a little lower than models based on the other subreddits

### Attempt 2: k-Nearest Neighbors
<a id='model2'></a>

In [18]:
# k-nearest-neighbors pipeline: vectorize the text, then classify.
# The vectorizer set here is just the starting choice; the parameter grid
# also lists a TF-IDF alternative for the 'vec' step.
pipe = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('knn', KNeighborsClassifier()),
])

In [29]:
# Search space for the KNN pipeline.
# Keys follow the '<step>__<parameter>' convention; 'vec' swaps the whole step.
pipe_params = dict(
    vec=[CountVectorizer(stop_words='english'), TfidfVectorizer(stop_words='english')],
    vec__max_features=[3000, 3100],
    vec__min_df=[2],
    vec__max_df=[0.9],
    vec__ngram_range=[(1, 1), (1, 2)],
    knn__n_neighbors=[35],
)

In [30]:
# 3-fold cross-validated grid search over pipe_params, fitted on the training split
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3).fit(X_train, y_train)

# best cross-validation score, then the score on the train and test splits
for label, value in (
    ('cvs:', gs.best_score_),
    ('train score:', gs.score(X_train, y_train)),
    ('test score:', gs.score(X_test, y_test)),
):
    print(label, value)

# winning parameter combination (displayed as the cell output)
gs.best_params_

cvs: 0.8714859437751004
train score: 0.8969210174029452
test score: 0.8937875751503006


{'knn__n_neighbors': 35,
 'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.9, max_features=3000,
                 min_df=2, ngram_range=(1, 2), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words='english', strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 'vec__max_df': 0.9,
 'vec__max_features': 3000,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 2)}

Managed to get training and test scores of 89.69% and 89.38% respectively, which is comparable to logistic regression, despite it performing worse with this data in comparison to the other two subreddit combinations.

### Attempt 3: Multinomial Naive Bayes
<a id='model3'></a>

Using Gaussian naive Bayes is too computationally expensive, as it requires a dense matrix rather than a sparse one. Attempting multinomial naive Bayes instead to see what happens: it works on the sparse matrix directly, and it violates fewer assumptions anyway.

In [31]:
# Multinomial naive Bayes pipeline: vectorize the text, then classify.
# The vectorizer set here is just the starting choice; the parameter grid
# also lists a TF-IDF alternative for the 'vec' step.
pipe = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('nb', MultinomialNB()),
])

In [42]:
# Search space for the naive Bayes pipeline.
# Keys follow the '<step>__<parameter>' convention; 'vec' swaps the whole step.
pipe_params = dict(
    vec=[CountVectorizer(stop_words='english'), TfidfVectorizer(stop_words='english')],
    vec__max_features=[3500],
    vec__min_df=[2],
    vec__max_df=[0.9],
    vec__ngram_range=[(1, 2)],
)

In [44]:
# 3-fold cross-validated grid search over pipe_params, fitted on the training split
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3).fit(X_train, y_train)

# best cross-validation score, then the score on the train and test splits
for label, value in (
    ('cvs:', gs.best_score_),
    ('train score:', gs.score(X_train, y_train)),
    ('test score:', gs.score(X_test, y_test)),
):
    print(label, value)

# winning parameter combination (displayed as the cell output)
gs.best_params_

cvs: 0.9062918340026773
train score: 0.9591700133868809
test score: 0.9338677354709419


{'vec': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                 lowercase=True, max_df=0.9, max_features=3500, min_df=2,
                 ngram_range=(1, 2), preprocessor=None, stop_words='english',
                 strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, vocabulary=None),
 'vec__max_df': 0.9,
 'vec__max_features': 3500,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 2)}

Achieved an accuracy of 93.39%. A slightly higher cvs score was achieved by lowering max_features down to 3000, but that also lowered accuracy. Count vectorizer was found to be better for once. Once again naive Bayes is the most effective model.