In [1]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, loguniform
from sklearn.preprocessing import StandardScaler

In [24]:
#read in cleaned data

df = pd.read_csv('../Data/cleaned.csv')

In [25]:
df.head()

Unnamed: 0,title,subreddit
0,women of reddit when you are in a relationship...,1
1,do you have a hot take what is it,1
2,why do you cry so much over little things,1
3,as an adult how do you make more female friends,1
4,which 3rd party reddit app do you use and what...,1


In [26]:
df.shape

(90000, 2)

In [5]:
#baseline score

df['subreddit'].value_counts(normalize=True)

0    0.5
1    0.5
Name: subreddit, dtype: float64

In [6]:
#set variables for modeling

X = df['title']

y = df['subreddit']

In [7]:
#train test split for model evaluation

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [42]:
#build pipeline for random gridsearch

pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('logreg', LogisticRegression(solver = 'saga', max_iter=2000))
])

In [43]:
pipe_params = {
    'cvec__max_features':[7_000, 10_000],
    'cvec__stop_words'  :[None, 'english'],
    'cvec__ngram_range' :[(1,1), (1,2)],
    'logreg__C'         : (uniform(.01, 1)),
    'logreg__penalty'   :['none', 'l1','l2']
}



In [44]:
rs = RandomizedSearchCV(estimator = pipe,
                     param_distributions = pipe_params,
                     n_jobs = -1,
                     cv = 5)

In [45]:
rs.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                             ('logreg',
                                              LogisticRegression(max_iter=2000,
                                                                 solver='saga'))]),
                   n_jobs=-1,
                   param_distributions={'cvec__max_features': [7000, 10000],
                                        'cvec__ngram_range': [(1, 1), (1, 2)],
                                        'cvec__stop_words': [None, 'english'],
                                        'logreg__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000015D3E057A00>,
                                        'logreg__penalty': ['none', 'l1',
                                                            'l2']})

In [46]:
print(f'training score:  {rs.score(X_train, y_train)}')

print(f'testing score: {rs.score(X_test, y_test)}')

training score:  0.8069481481481482
testing score: 0.7277333333333333


In [47]:
rs.best_params_

{'cvec__max_features': 10000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None,
 'logreg__C': 0.9075883331227518,
 'logreg__penalty': 'l2'}

In [None]:
# This shows the regression model is very overfit on my training data and not performing as well on unseen data. 
# It is doing better than the baseline, but I will select another model for production.
# It's interesting to see the best paramerters selected in the grid search- I did not expect no stop words to do the best.