In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB

In [4]:
df = pd.read_csv('data/data_cleaned.csv')

In [5]:
df.head()

Unnamed: 0,title,text,auth,time,subreddit,new_line_chars,space_chars,title_words,text_words
0,the moths of time,the moths of time consume your image; everythi...,lizerdqweenchlo,1643240027,OCPoetry,22,134,4,138
1,Haunted Houses,Floorboards creak Under little kid feet As a n...,richardcrack,1643238939,OCPoetry,164,253,2,334
2,Forest of Eden,&amp;#x200B; He could never quite find What ma...,mgmgmgmgm,1643237039,OCPoetry,42,69,3,79
3,The deepest fluctuation of creativity,With due regard at your behest I'll smear the ...,puredreadful,1643234027,OCPoetry,16,34,5,41
4,A Heart Divided,"If I were two instead of just one, I could mak...",robbsmith711,1643233772,OCPoetry,21,328,3,337


### Text only models

#### Model 1
* Text only
* Tfidf vectorizer
* logistic regression

In [6]:
X = df['text']
y = df['subreddit']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [8]:
pipe1 = make_pipeline(
    TfidfVectorizer(max_features = 1000),
    LogisticRegression(max_iter = 10_000, solver = 'saga', warm_start = True)
)

In [9]:
def run_model(model):
    model.fit(X_train, y_train)
    print(f"Training score: {model.score(X_train, y_train):.3f}")
    print(f"Testing score: {model.score(X_test, y_test):.3f}")

In [10]:
def top_coefs_lr(lr_pipe):
    lr_coefs = lr_pipe.named_steps['logisticregression'].coef_
    features = lr_pipe.named_steps['tfidfvectorizer'].get_feature_names_out()
    lr_coefs_series = pd.Series(lr_coefs[0], index = features)

    print("=== Top features for OCPoetry ===")
    print(lr_coefs_series.sort_values().head(10))

    print("=== Top features for shortscarystories ===")
    print(lr_coefs_series.sort_values().tail(10))

In [11]:
run_model(pipe1)

Training score: 0.928
Testing score: 0.918


In [12]:
top_coefs_lr(pipe1)

=== Top features for OCPoetry ===
feedback   -11.724951
poem        -9.241269
poetry      -6.097608
wrote       -4.341873
heart       -4.295404
tears       -3.688988
like        -3.624248
yet         -3.623374
say         -3.468405
sun         -3.282752
dtype: float64
=== Top features for shortscarystories ===
them           3.639391
killed         3.701553
immediately    3.708712
any            3.781644
started        3.811578
people         4.036952
stories        4.070552
had            4.167798
was            5.121245
horror         5.253520
dtype: float64


We should add some stop words that make this task too easy by giving away the answer directly: poem / poetry and horror.

In [13]:
stop_words = ['poem', 'poetry', 'horror']

In [14]:
pipe2 = make_pipeline(
    TfidfVectorizer(max_features = 1000, stop_words=stop_words),
    LogisticRegression(max_iter = 10_000, solver = 'saga', warm_start = True)
)

In [15]:
run_model(pipe2)

Training score: 0.926
Testing score: 0.916


In [16]:
top_coefs_lr(pipe2)

=== Top features for OCPoetry ===
feedback   -12.234946
wrote       -5.398393
heart       -4.329347
tears       -3.743638
like        -3.696180
yet         -3.572906
say         -3.449298
sun         -3.276940
love        -3.223296
words       -3.192076
dtype: float64
=== Top features for shortscarystories ===
killed         3.580922
any            3.679735
them           3.702238
door           3.726862
immediately    3.747294
started        3.830573
people         4.005491
had            4.213359
stories        4.629439
was            5.195165
dtype: float64


In [17]:
stop_words.append('feedback')

In [18]:
pipe3 = make_pipeline(
    TfidfVectorizer(max_features = 1000, stop_words=stop_words),
    LogisticRegression(max_iter = 10_000, solver = 'saga', warm_start = True)
)

In [19]:
run_model(pipe3)

Training score: 0.923
Testing score: 0.913


In [20]:
top_coefs_lr(pipe3)

=== Top features for OCPoetry ===
wrote   -5.860394
heart   -4.436735
tears   -3.775466
like    -3.746685
yet     -3.617657
love    -3.402516
say     -3.398942
sun     -3.363530
words   -3.147683
mine    -3.056418
dtype: float64
=== Top features for shortscarystories ===
police         3.615472
going          3.629421
door           3.762062
started        3.793882
immediately    3.813340
them           3.832777
people         4.085598
had            4.333543
stories        4.599836
was            5.200013
dtype: float64


#### Naive Bayes Model

In [21]:
pipe4 = make_pipeline(
    TfidfVectorizer(max_features = 1000, stop_words=stop_words),
    MultinomialNB()
)

In [22]:
run_model(pipe4)

Training score: 0.873
Testing score: 0.871


In [32]:
pipe4.named_steps['multinomialnb'].classes_

array(['OCPoetry', 'shortscarystories'], dtype='<U17')

In [37]:
def top_features_nb(nb_pipe):
    nb_log_odds = nb_pipe.named_steps['multinomialnb'].feature_log_prob_
    features = nb_pipe.named_steps['tfidfvectorizer'].get_feature_names_out()
    classes = nb_pipe.named_steps['multinomialnb'].classes_
    nb_log_odds_df = pd.DataFrame(nb_log_odds.T, index = features, columns = classes)

    print(nb_log_odds_df.head())
    print("=== Top features for OCPoetry ===")
    print(nb_log_odds_df.sort_values().head(10))

    # print("=== Top features for shortscarystories ===")
    # print(nb_log_odds_series.sort_values().tail(10))

In [38]:
top_features_nb(pipe4)

        OCPoetry  shortscarystories
10     -8.457721          -7.583030
able   -8.135377          -7.430771
about  -6.287562          -5.902122
above  -7.152572          -7.580400
across -7.208203          -7.111546


In [39]:
from sklearn import __version__
__version__

'1.0.1'

New feature ideas
* Whitespace
* Part of speech (adjective, verb)
* sentiment analysis

In [None]:
###### Can add sentiment analysis as feature.