In [1]:
#imports

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#read in cleaned data

df = pd.read_csv('../Data/cleaned.csv')

df.head()

Unnamed: 0,title,subreddit
0,women of reddit when you are in a relationship...,1
1,do you have a hot take what is it,1
2,why do you cry so much over little things,1
3,as an adult how do you make more female friends,1
4,which 3rd party reddit app do you use and what...,1


In [8]:
#baseline score

df['subreddit'].value_counts(normalize=True)

0    0.5
1    0.5
Name: subreddit, dtype: float64

In [3]:
#set variables for modeling with random forrest classifier

X = df['title']

y = df['subreddit']

#train test split for model evaluation

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [28]:
#build pipeline for random gridsearch

pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier() )
])

In [38]:
pipe_params = {
    'cvec__max_features':[5_000, 7_000, 10_000],
    'cvec__stop_words'  :[None, 'english'],
    'cvec__ngram_range' :[(1,1), (1,2)],
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth'   : [None, 1, 3, 5]
}

In [39]:
rs = RandomizedSearchCV(estimator = pipe,
                     param_distributions = pipe_params,
                     n_jobs = -1,
                     cv = 5)

In [40]:
rs.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                             ('rf', RandomForestClassifier())]),
                   n_jobs=-1,
                   param_distributions={'cvec__max_features': [5000, 7000,
                                                               10000],
                                        'cvec__ngram_range': [(1, 1), (1, 2)],
                                        'cvec__stop_words': [None, 'english'],
                                        'rf__max_depth': [None, 1, 3, 5],
                                        'rf__n_estimators': [100, 150, 200]})

In [44]:
print(f'training score:  {rs.score(X_train, y_train)}')

print(f'testing score: {rs.score(X_test, y_test)}')

training score:  0.9919111111111111
testing score: 0.7221333333333333


In [43]:
rs.best_params_

{'rf__n_estimators': 200,
 'rf__max_depth': None,
 'cvec__stop_words': None,
 'cvec__ngram_range': (1, 1),
 'cvec__max_features': 10000}

In [None]:
# This is an extremely overfit model on my training data. Not performing as well on unseen data. 
# The best parameters it selected are interesting to note. No max depth and no stop words perform best. 
# This will not be my production model because I am not happy with how overfit it is.

In [None]:
# Try extra trees classifer below to see if adding any extra randomization helps

In [4]:
#set variables for modeling

X = df['title']

y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [5]:
cvec = CountVectorizer(max_features = 10_000)

In [6]:
X_train = cvec.fit_transform(X_train)

X_test = cvec.transform(X_test)

In [8]:
et = ExtraTreesClassifier(n_estimators=200)

In [9]:
et.fit(X_train, y_train)

ExtraTreesClassifier(n_estimators=200)

In [10]:
print(f'training score:  {et.score(X_train, y_train)}')

print(f'testing score: {et.score(X_test, y_test)}')

training score:  0.9923703703703703
testing score: 0.7236


In [None]:
# The results from both randomized models are pretty similar. Extra trees also results in an overfit model.
# I will not be selecting either of these as my production model.