In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, roc_auc_score, recall_score,
    precision_score, f1_score, RocCurveDisplay)
from imblearn.metrics import specificity_score

from nltk.stem import WordNetLemmatizer
import re



from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor



import matplotlib.pyplot as plt


In [4]:
df = pd.read_csv('../data/source.csv')
df.head(2)

Unnamed: 0,subreddit,selftext,title,upvote_ratio,score,id,author,num_comments,created_utc,retrieved_utc,updated_utc,post_date,post_time
0,bitcoin,you're only going to get hurt when this ends,i mean this in the kindest way possible can ev...,1.0,1,10bvjbg,rolo951,0,2023-01-14 18:29:08,1673720964,1673720964,2023-01-14,18:29:08
1,bitcoin,what was your fuck it i'm into this moment for...,all in,1.0,1,10bur6i,soliton-gaydar,0,2023-01-14 17:58:04,1673719099,1673719099,2023-01-14,17:58:04


In [5]:
df.subreddit.value_counts()

bitcoin     2807
ethereum     689
Name: subreddit, dtype: int64

In [6]:
print(df.subreddit.value_counts())
df.subreddit.value_counts(normalize = True)

bitcoin     2807
ethereum     689
Name: subreddit, dtype: int64


bitcoin     0.802918
ethereum    0.197082
Name: subreddit, dtype: float64

# Lemmatize

In [7]:
lm = WordNetLemmatizer()

In [2]:

lm = WordNetLemmatizer()


def lemmatize(series):
    for i in series:
        wordlist = []
        words = i.split()
        
        for k in words:
            wordlist.append(lm.lemmatize(k))
    return (wordlist)
    
lems = lemmatize(df['selftext'])
lems

NameError: name 'WordNetLemmatizer' is not defined

# Model

In [403]:
X1 = df.title
x2 = df.selftext
y = df.subreddit

In [404]:
y.value_counts(normalize=True)

1    0.799841
0    0.200159
Name: subreddit, dtype: float64

In [405]:
X_train, X_test, y_train, y_test = train_test_split(X1,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

In [406]:
# Pipeline accepts multiple transformers, but only one vectorizer. See cgpt results:

'''The error message is caused by the fact that you are trying to fit two different vectorizers (TfidfVectorizer and CountVectorizer) 
in the same pipeline, but only providing one input (X_train) to the pipeline. This is causing the pipeline to raise an error as it doesn't 
know which vectorizer to apply to the input. 

You can fix this by either removing one of the vectorizers or providing separate inputs to the pipeline for each vectorizer.'''


nb_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    #('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [407]:
nb_pipe_params = {
    'tvec__max_features': range(600,800,5),
    'tvec__stop_words': [None, 'english'],
    'tvec__ngram_range': [(1,1), (1,2), (1,3)],
    'nb__alpha': [.005, .01, .05, .1]
    #'cvec__max_features': range(2500, 3500, 100),
    #'cvec__min_df': range(1,5),
    #'cvec__max_df': [i/100 for i in range(0, 100, 5)],
    #'cvec__ngram_range': [(1,1),(1, 2), (1,3)]
}

In [408]:
# Create the model
level1_estimators = [
    ('nb', nb_pipe),
    ('knn_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('knn', KNeighborsRegressor())
    ])),
    ('bag', BaggingRegressor())
]

stacked_model = StackingRegressor(estimators = level1_estimators,
                                 final_estimator = LogisticRegression())

In [409]:
stacked_model = RandomizedSearchCV(stacked_model, 
                  nb_pipe_params, 
                    cv = 5) 

In [410]:
stacked_rs.fit(X_train, y_train)

ValueError: Invalid parameter tvec for estimator StackingRegressor(estimators=[('nb',
                               Pipeline(steps=[('tvec', TfidfVectorizer()),
                                               ('nb', MultinomialNB())])),
                              ('knn_pipe',
                               Pipeline(steps=[('ss', StandardScaler()),
                                               ('knn',
                                                KNeighborsRegressor())])),
                              ('bag', BaggingRegressor())],
                  final_estimator=LogisticRegression()). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
print(stacked_rs.score(X_train, y_train))
stacked_rs.score(X_test,y_test)

In [None]:
rs = RandomizedSearchCV(nb_pipe, 
                  nb_pipe_params, 
                    cv = 5) 

In [None]:
rs.fit(X_train, y_train)

In [None]:
print(rs.score(X_train, y_train))
rs.score(X_test, y_test)

In [None]:
rs.best_params_

In [None]:
rs.score(X_test, y_test)

In [None]:
preds = rs.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, preds)

display = ConfusionMatrixDisplay(confusion_matrix = cm,
                             display_labels = rs.classes_)

display.plot();

In [None]:
# Basic ROC

# cf ROC
# create an ax object
ax = plt.gca()

# use RocCurveDisplay for both estimators
RocCurveDisplay.from_estimator(rs, X_test, y_test, ax=ax, name='logistic regression')

# add 'worst case scenario' line
plt.plot([0,1], [0,1], label='null hypothesis/mean', linestyle='--', color='gray')

# necessary to label the baseline
plt.legend();