In [87]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import ConfusionMatrixDisplay

In [88]:
df = pd.read_csv('data/data_cleaned.csv')

In [89]:
df.head()

Unnamed: 0,title,text,auth,time,subreddit,new_line_chars,space_chars,title_words,text_words
0,the moths of time,the moths of time consume your image; everythi...,lizerdqweenchlo,1643240027,OCPoetry,22,134,4,138
1,Haunted Houses,Floorboards creak Under little kid feet As a n...,richardcrack,1643238939,OCPoetry,164,253,2,334
2,Forest of Eden,&amp;#x200B; He could never quite find What ma...,mgmgmgmgm,1643237039,OCPoetry,42,69,3,79
3,The deepest fluctuation of creativity,With due regard at your behest I'll smear the ...,puredreadful,1643234027,OCPoetry,16,34,5,41
4,A Heart Divided,"If I were two instead of just one, I could mak...",robbsmith711,1643233772,OCPoetry,21,328,3,337


### Model 1

* Text only
* Tfidf vectorizer
* logistic regression

In [90]:
X = df['text']
y = df['subreddit']

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [92]:
pipe1 = make_pipeline(
    TfidfVectorizer(max_features = 1000),
    LogisticRegression(max_iter = 10_000, solver = 'saga', warm_start = True)
)

In [104]:
def run_model(model):
    model.fit(X_train, y_train)
    print(f"Training score: {model.score(X_train, y_train):.3f}")
    print(f"Testing score: {model.score(X_test, y_test):.3f}")

In [105]:
def top_coefs_lr(lr_pipe):
    lr_coefs = lr_pipe.named_steps['logisticregression'].coef_
    features = lr_pipe.named_steps['tfidfvectorizer'].get_feature_names_out()
    lr_coefs_series = pd.Series(lr_coefs[0], index = features)

    print("=== Top features for OCPoetry ===")
    print(lr_coefs_series.sort_values().head(10))

    print("=== Top features for shortscarystories ===")
    print(lr_coefs_series.sort_values().tail(10))

In [106]:
run_model(pipe1)

Training score: 0.928
Testing score: 0.922


In [107]:
top_coefs_lr(pipe1)

=== Top features for OCPoetry ===
feedback   -11.735772
poem        -9.142683
poetry      -6.569022
heart       -4.942911
like        -3.860337
wrote       -3.826875
tears       -3.576072
sun         -3.557608
yet         -3.486862
hold        -3.463704
dtype: float64
=== Top features for shortscarystories ===
door           3.847913
humans         3.869404
started        3.948045
stories        4.101870
immediately    4.147857
any            4.312675
going          4.421943
had            4.426253
was            4.836823
horror         5.106104
dtype: float64


We should add some stop words that make this task too easy by giving away the answer directly: poem / poetry and horror.

In [112]:
stop_words = ['poem', 'poetry', 'horror']

In [113]:
pipe2 = make_pipeline(
    TfidfVectorizer(max_features = 1000, stop_words=stop_words),
    LogisticRegression(max_iter = 10_000, solver = 'saga', warm_start = True)
)

In [114]:
run_model(pipe2)

Training score: 0.925
Testing score: 0.919


In [115]:
top_coefs_lr(pipe2)

=== Top features for OCPoetry ===
feedback   -12.283781
wrote       -4.954768
heart       -4.947145
like        -3.966319
tears       -3.609974
sun         -3.542323
yet         -3.510190
hold        -3.432957
lost        -3.144258
lies        -3.104025
dtype: float64
=== Top features for shortscarystories ===
few            3.780036
humans         3.867178
door           3.945334
started        3.952303
immediately    4.162948
any            4.225543
going          4.476883
had            4.478135
stories        4.570350
was            4.896199
dtype: float64


In [116]:
stop_words.append('feedback')

In [117]:
pipe3 = make_pipeline(
    TfidfVectorizer(max_features = 1000, stop_words=stop_words),
    LogisticRegression(max_iter = 10_000, solver = 'saga', warm_start = True)
)

In [118]:
run_model(pipe3)

Training score: 0.922
Testing score: 0.917


In [119]:
top_coefs_lr(pipe3)

=== Top features for OCPoetry ===
wrote      -5.390857
heart      -5.061636
like       -3.998035
tears      -3.640115
sun        -3.627781
yet        -3.614490
hold       -3.327616
lost       -3.180938
thoughts   -3.179826
lies       -3.143316
dtype: float64
=== Top features for shortscarystories ===
few            3.794451
people         3.808744
started        3.819131
humans         4.005990
door           4.064819
immediately    4.235897
had            4.541562
going          4.559610
stories        4.688474
was            4.918857
dtype: float64
