# Models for Smash and Paintball

In [102]:
#imports for modeling
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction import text

In [2]:
#silence future warnings becuase they're annoying
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [95]:
#quick check of the (rows, columns) size of the first subreddit's frame
#NOTE(review): `data1` is only created by the later "get data" cell, so on a fresh
#kernel (Restart & Run All) this cell raises NameError - it needs to run after that cell
data1.shape

(991, 2)

In [3]:
#names of the two subreddits the classifiers tell apart;
#each name is also the stem of the CSV file that holds that sub's posts
sub1, sub2 = 'smashbros', 'paintball'

In [4]:
#get data: one CSV per subreddit, file name = subreddit name + '.csv'
data1 = pd.read_csv(sub1+'.csv')
data2 = pd.read_csv(sub2+'.csv')

#creating column indicating which sub a post is from (1 = sub1, 0 = sub2)
data1['subreddit'] = 1
data2['subreddit'] = 0

#ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
#frame keeps its own row labels, which overlap, so label-based lookups such as
#df.loc[i] or tracing X_test.index back to a post would be ambiguous
df = pd.concat([data1, data2], ignore_index=True)

#defining features and target variable
X = df['data']
y = df['subreddit']

#train test split to be used for all models
#stratify=y keeps the class proportions the same in both splits; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [103]:
#sklearn's built-in English stop words extended with a few extra filler tokens
#NOTE(review): every pipeline visible in this notebook passes stop_words='english',
#so `custom` is not used by them - confirm whether it was meant to be
custom = text.ENGLISH_STOP_WORDS | frozenset([
    'amp', 'new', 'like', 'got', 'know', 'just', 've', 'don', 'think',
])

# Models
1. [Logistic Regression](#model1) Best Accuracy: 91.93% (TF-IDF)
2. [KNN](#model2) Best Accuracy: 86.29% (TF-IDF)
3. [Multinomial Naive Bayes](#model3) Best Accuracy: 92.54% (TF-IDF)


### Attempt 1: Logistic Regression
<a id='model1'></a>

In [5]:
#two-step pipeline: text vectorizer -> logistic regression
#the 'vec' step here is only a starting point; the grid search swaps it via the 'vec' entry in pipe_params
pipe = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('lr', LogisticRegression()),
])

In [47]:
#search space for the logistic regression pipeline
#keys follow sklearn's '<step>__<parameter>' convention; plain 'vec' replaces the whole vectorizer step
pipe_params = dict(
    vec=[
        CountVectorizer(stop_words='english'),
        TfidfVectorizer(stop_words='english'),
    ],
    vec__max_features=[2500, 3000, 3500],  #tried [3000, 3100, 3200,]
    vec__min_df=[2],
    vec__max_df=[.8, .85],
    vec__ngram_range=[(1, 1), (1, 2)],
    lr__C=[1, 5],  #tried [1,5]
)

In [48]:
#3-fold grid search over every combination in pipe_params
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)

#best mean cross-validation score, then the best model's score on the train and test splits
for label, score in [
    ('cvs:', gs.best_score_),
    ('train score:', gs.score(X_train, y_train)),
    ('test score:', gs.score(X_test, y_test)),
]:
    print(label, score)

#winning parameter combination, shown as the cell output
gs.best_params_

cvs: 0.9422043010752689
train score: 0.9852150537634409
test score: 0.9193548387096774


{'lr__C': 1,
 'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.8, max_features=3000,
                 min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words='english', strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 'vec__max_df': 0.8,
 'vec__max_features': 3000,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 1)}

Best score on testing set was 91.93% accuracy, using TF-IDF

### Attempt 2: k-Nearest Neighbors
<a id='model2'></a>

In [49]:
#two-step pipeline: text vectorizer -> k-nearest neighbors classifier
#the 'vec' step here is only a starting point; the grid search swaps it via the 'vec' entry in pipe_params
pipe = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('knn', KNeighborsClassifier()),
])

In [69]:
#search space for the kNN pipeline: only the vectorizer type varies,
#every other setting is pinned to a single value
pipe_params = dict(
    vec=[
        CountVectorizer(stop_words='english'),
        TfidfVectorizer(stop_words='english'),
    ],
    vec__max_features=[3200],
    vec__min_df=[2],
    vec__max_df=[.9],
    vec__ngram_range=[(1, 1)],
    knn__n_neighbors=[35],
)

In [70]:
#3-fold grid search over the kNN parameter grid
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)

#best mean cross-validation score, then the best model's score on the train and test splits
for label, score in [
    ('cvs:', gs.best_score_),
    ('train score:', gs.score(X_train, y_train)),
    ('test score:', gs.score(X_test, y_test)),
]:
    print(label, score)

#winning parameter combination, shown as the cell output
gs.best_params_

cvs: 0.8723118279569892
train score: 0.8958333333333334
test score: 0.8629032258064516


{'knn__n_neighbors': 35,
 'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.9, max_features=3200,
                 min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words='english', strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 'vec__max_df': 0.9,
 'vec__max_features': 3200,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 1)}

Managed to get a test score of 86.29% (training score 89.58%); a max features value of 3100 gave a slightly higher cross-validation score but lower accuracy. Interestingly, scores in the lower 80s were common no matter what was changed among the hyperparameters, whereas in the Smash and Warhammer model the accuracy with kNN was more sporadic and also lower, despite logistic regression yielding better scores.

### Attempt 3: Multinomial Naive Bayes
<a id='model3'></a>

Using Gaussian Naive Bayes is too computationally expensive, as it requires a dense matrix rather than a sparse one. Attempting multinomial Naive Bayes instead to see what happens: the matrix is sparse anyway, and it violates fewer assumptions.

In [71]:
#two-step pipeline: text vectorizer -> multinomial naive Bayes
#the 'vec' step here is only a starting point; the grid search swaps it via the 'vec' entry in pipe_params
pipe = Pipeline(steps=[
    ('vec', CountVectorizer(stop_words='english')),
    ('nb', MultinomialNB()),
])

In [93]:
#search space for the naive Bayes pipeline: vectorizer type, vocabulary size and n-gram range vary
pipe_params = dict(
    vec=[
        CountVectorizer(stop_words='english'),
        TfidfVectorizer(stop_words='english'),
    ],
    vec__max_features=[3900, 4000],
    vec__min_df=[2],
    vec__max_df=[.9],
    vec__ngram_range=[(1, 2), (1, 3)],
)

In [94]:
#3-fold grid search over the naive Bayes parameter grid
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)

#best mean cross-validation score, then the best model's score on the train and test splits
for label, score in [
    ('cvs:', gs.best_score_),
    ('train score:', gs.score(X_train, y_train)),
    ('test score:', gs.score(X_test, y_test)),
]:
    print(label, score)

#winning parameter combination, shown as the cell output
gs.best_params_

cvs: 0.9435483870967742
train score: 0.9811827956989247
test score: 0.9254032258064516


{'vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=0.9, max_features=3900,
                 min_df=2, ngram_range=(1, 2), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words='english', strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None),
 'vec__max_df': 0.9,
 'vec__max_features': 3900,
 'vec__min_df': 2,
 'vec__ngram_range': (1, 2)}

Achieved an accuracy of 92.54% on the test set, which so far makes this the best of the three models, but it is still quite a bit worse than with the Smash/Warhammer model.