In [22]:
import pandas as pd
import numpy as np

# SKlearn imports:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score, balanced_accuracy_score, RocCurveDisplay, recall_score, precision_score, recall_score, accuracy_score, ConfusionMatrixDisplay, roc_auc_score, precision_score, confusion_matrix
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import  MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Read in the scraped post data; the cells below use its 'title' and
# 'subreddit' columns. The path is relative to the notebook's working directory.
df = pd.read_csv('./data/titles.csv')

# Logistic regression from the title

In [40]:
# Feature: the raw post title text.
# Target: the subreddit the post came from (Chess or AnarchyChess).
X, y = df['title'], df['subreddit']

In [25]:
# Split once with a fixed seed so every model below is fit and scored on the
# same rows. No test_size is passed, so scikit-learn's default split is used.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
)

In [26]:
# Baseline: bag-of-words counts (English stop words removed, vocabulary capped
# at 1000 features) fed into an unpenalized logistic regression.
#
# `penalty=None` replaces the string "none", which scikit-learn deprecated in
# 1.2 and rejects from 1.4 onward (so this cell needs scikit-learn >= 1.2).
# With no penalty the coefficients are unconstrained; compare the train and
# test accuracy in the next cells to check for overfitting.
cvect_pipe = make_pipeline(
    CountVectorizer(stop_words='english', max_features=1000),
    LogisticRegression(max_iter=1000, penalty=None),
)

# Last expression: fit returns the fitted pipeline, which the notebook displays.
cvect_pipe.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000, stop_words='english')),
                ('logisticregression',
                 LogisticRegression(max_iter=1000, penalty='none'))])

In [27]:
# Accuracy on the training split (the rows the pipeline was fit on).
cvect_pipe.score(X_train, y_train)

0.9866666666666667

In [28]:
# Accuracy on the held-out test split; a large gap versus the training score
# above indicates overfitting.
cvect_pipe.score(X_test, y_test)

0.644

In [29]:
# Pull the fitted steps out of the pipeline and pair every vocabulary word with
# its logistic-regression weight (these words help identify which subreddit a
# title belongs to).
# NOTE(review): for a two-class target the sign is relative to
# cvect_pipe['logisticregression'].classes_[1] -- confirm which subreddit that is.
coefs = cvect_pipe['logisticregression'].coef_

word_coefs = pd.DataFrame(
    {
        'words': cvect_pipe['countvectorizer'].get_feature_names_out(),
        'coefs': coefs[0],  # coef_ is 2-D; row 0 holds the per-word weights
    }
)

# Most negative weights first (sort_values is ascending by default).
word_coefs.sort_values('coefs').head(15)

Unnamed: 0,words,coefs
513,question,-549.780017
49,beautiful,-480.358397
316,missed,-474.93833
507,puzzles,-447.92601
69,books,-441.656092
838,titled,-429.512434
514,questions,-425.247453
271,learning,-413.705266
158,equal,-412.919584
537,ratings,-410.774303


In [30]:
# Largest positive weights first: the words pushing hardest toward the
# opposite class from the table above.
(
    word_coefs
    .sort_values('coefs', ascending=False)
    .head(15)
)

Unnamed: 0,words,coefs
834,tinder,555.196679
20,according,552.16022
684,simulated,530.853516
203,going,463.07524
239,incredible,456.593394
305,mates,450.112569
798,talks,430.452816
615,run,428.422373
173,finally,421.143744
19,50,416.710574


# Logistic Regression with Ngram

In [31]:
# Same unpenalized logistic regression, but the vectorizer now emits unigrams
# and bigrams (ngram_range=(1, 2)), still capped at 1000 features.
#
# `penalty=None` replaces the string "none", which scikit-learn deprecated in
# 1.2 and rejects from 1.4 onward (so this cell needs scikit-learn >= 1.2).
cvect_pipe_ngram = make_pipeline(
    CountVectorizer(stop_words='english', max_features=1000, ngram_range=(1, 2)),
    LogisticRegression(max_iter=1000, penalty=None),
)

# Last expression: fit returns the fitted pipeline, which the notebook displays.
cvect_pipe_ngram.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000, ngram_range=(1, 2),
                                 stop_words='english')),
                ('logisticregression',
                 LogisticRegression(max_iter=1000, penalty='none'))])

In [32]:
# Accuracy of the n-gram pipeline on the training split.
cvect_pipe_ngram.score(X_train, y_train)

0.968

In [33]:
# Accuracy of the n-gram pipeline on the held-out test split.
cvect_pipe_ngram.score(X_test, y_test)

0.636

# Naive Bayes

In [34]:
# Multinomial Naive Bayes on the same bag-of-words features as the baseline
# (English stop words removed, at most 1000 features), with default settings.
cvect_pipe2 = make_pipeline(
    CountVectorizer(
        stop_words='english',
        max_features=1000,
    ),
    MultinomialNB(),
)

# Last expression: fit returns the fitted pipeline, which the notebook displays.
cvect_pipe2.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=1000, stop_words='english')),
                ('multinomialnb', MultinomialNB())])

In [35]:
# Accuracy of the Naive Bayes pipeline on the training split.
cvect_pipe2.score(X_train, y_train)

0.88

In [36]:
# Accuracy of the Naive Bayes pipeline on the held-out test split.
cvect_pipe2.score(X_test, y_test)

0.632

# Random Forest

In [37]:
# Random forest on a wider feature set: up to 5000 features drawn from
# unigrams through trigrams, 1000 trees, and a fixed seed for repeatable fits.
forest_pipe = make_pipeline(
    CountVectorizer(
        stop_words='english',
        max_features=5000,
        ngram_range=(1, 3),
    ),
    RandomForestClassifier(
        n_estimators=1000,
        random_state=123,
    ),
)

# Last expression: fit returns the fitted pipeline, which the notebook displays.
forest_pipe.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=5000, ngram_range=(1, 3),
                                 stop_words='english')),
                ('randomforestclassifier',
                 RandomForestClassifier(n_estimators=1000, random_state=123))])

In [38]:
# Accuracy of the random-forest pipeline on the training split.
forest_pipe.score(X_train, y_train)

0.9933333333333333

In [39]:
# Accuracy of the random-forest pipeline on the held-out test split.
forest_pipe.score(X_test, y_test)

0.664