In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.metrics import specificity_score
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import (BaggingClassifier, BaggingRegressor,
    GradientBoostingRegressor, RandomForestRegressor,
    StackingClassifier, StackingRegressor)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, roc_auc_score, recall_score,
    precision_score, f1_score, RocCurveDisplay)
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [None]:
# Read the source CSV into a DataFrame and preview its first two rows.
df = pd.read_csv(filepath_or_buffer='../data/source.csv')
df.head(n=2)

In [None]:
# Number of rows per subreddit label.
df['subreddit'].value_counts()

In [None]:
# Print the absolute counts, then display the relative share of each subreddit.
print(df['subreddit'].value_counts(normalize=False))
df['subreddit'].value_counts(normalize=True)

# Lemmatize

In [None]:
# Created in this cell so the helper below works on a fresh kernel, independent of
# the order in which later cells are run.
lm = WordNetLemmatizer()


def lemmatize_text(text):
    """Split ``text`` on whitespace and lemmatize each token.

    Parameters
    ----------
    text : str
        Raw text to lemmatize.

    Returns
    -------
    list of str
        One lemma per whitespace-separated token, in the original order.
    """
    # The original referenced a `w_tokenizer` object that is never defined in this
    # notebook; plain whitespace splitting is used instead.
    return [lm.lemmatize(token) for token in text.split()]


# Tiny throwaway frame to sanity-check the helper. It lives under its own name so the
# DataFrame loaded into `df` is not overwritten.
demo_df = pd.DataFrame(['this was cheesy', 'she likes these books', 'wow this is great'], columns=['text'])
demo_df['text_lemmatized'] = demo_df['text'].apply(lemmatize_text)
demo_df

In [None]:

# Shared WordNet lemmatizer instance; the lemmatizing helpers in this notebook call lm.lemmatize().
lm = WordNetLemmatizer()


def lemmatizer(series, lemmatize=None):
    """Return a new Series in which every word of every entry is lemmatized.

    The input is not modified. Each entry is split on whitespace, every word is
    passed through ``lemmatize``, and the results are re-joined with single spaces
    so the output can be fed straight into a text vectorizer.

    Parameters
    ----------
    series : pandas.Series
        Text entries to process. Missing values (None / NaN) are allowed.
    lemmatize : callable, optional
        Function mapping one word to its lemma. Defaults to ``lm.lemmatize``, the
        notebook-level WordNet lemmatizer.

    Returns
    -------
    pandas.Series
        Same index as ``series``; each value is the space-joined lemmatized text.
        Missing values become the empty string.
    """
    if lemmatize is None:
        # Looked up at call time so the notebook-level `lm` only has to exist
        # when the default is actually used.
        lemmatize = lm.lemmatize

    def _lemmatize_entry(text):
        # Missing entries cannot be split; map them to an empty document.
        if pd.isna(text):
            return ''
        return ' '.join(lemmatize(word) for word in str(text).split())

    # The original looped over the rows, wrote into the global `df` through chained
    # indexing, and returned None, which wiped the column it was assigned to.
    return series.apply(_lemmatize_entry)

    
    



In [None]:
# Overwrite the selftext column with whatever lemmatizer() returns for it.
df['selftext'] = lemmatizer(df['selftext'])

0       None
1       None
2       None
3       None
4       None
        ... 
3491    None
3492    None
3493    None
3494    None
3495    None
Name: selftext, Length: 3496, dtype: object

# Model

In [None]:
# Candidate feature columns (title, selftext) and the target column (subreddit).
X1, x2, y = df['title'], df['selftext'], df['subreddit']

In [None]:
# Share of each class in the target `y` (proportions rather than raw counts).
y.value_counts(normalize=True)

In [None]:
# Split the title column and target, stratified on the target; the fixed seed
# keeps the partition repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X1, y, stratify=y, random_state=42)

In [None]:
# Text -> TF-IDF features -> Multinomial Naive Bayes.
#
# Only one vectorizer can sit in a sequential Pipeline: each step is fed the output
# of the step before it, so a CountVectorizer placed after TfidfVectorizer would
# receive a sparse TF-IDF matrix instead of the raw strings it expects, and fitting
# fails. To use both vectorizers on the same text, run them side by side (e.g. with
# sklearn.pipeline.FeatureUnion) rather than chaining them.
nb_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

In [None]:
# Search space for nb_pipe; keys follow scikit-learn's '<step>__<parameter>' convention.
nb_pipe_params = dict(
    tvec__max_features=range(600, 800, 5),
    tvec__stop_words=[None, 'english'],
    tvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    nb__alpha=[.005, .01, .05, .1],
)

In [None]:
# Create the model
level1_estimators = [
    ('nb', nb_pipe),
    ('knn_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('knn', KNeighborsRegressor())
    ])),
    ('bag', BaggingRegressor())
]

stacked_model = StackingRegressor(estimators = level1_estimators,
                                 final_estimator = LogisticRegression())

In [None]:
# nb_pipe_params is keyed relative to nb_pipe ('tvec__...', 'nb__...'). Inside the
# stack that pipeline is registered under the estimator name 'nb', so every key
# needs an extra 'nb__' prefix to be a valid parameter of stacked_model.
stacked_rs_params = {f'nb__{name}': values for name, values in nb_pipe_params.items()}

# Bound to `stacked_rs` (the name the following cells fit and score) instead of
# rebinding `stacked_model`; seeded so the sampled candidates are reproducible.
stacked_rs = RandomizedSearchCV(stacked_model,
                                stacked_rs_params,
                                cv=5,
                                random_state=42)

In [None]:
# Run the randomized search for the stacked model on the training split.
stacked_rs.fit(X=X_train, y=y_train)

In [None]:
# Training score is printed; the test score is shown as the cell output.
print(stacked_rs.score(X=X_train, y=y_train))
stacked_rs.score(X=X_test, y=y_test)

In [None]:
# Randomized search over nb_pipe_params with 5-fold cross-validation.
# random_state fixes which parameter combinations are sampled, so the best
# parameters and scores are reproducible across runs (the search was unseeded before).
rs = RandomizedSearchCV(nb_pipe,
                        nb_pipe_params,
                        cv=5,
                        random_state=42)

In [None]:
# Run the randomized search for the Naive Bayes pipeline on the training split.
rs.fit(X=X_train, y=y_train)

In [None]:
# Training score is printed; the test score is shown as the cell output.
print(rs.score(X=X_train, y=y_train))
rs.score(X=X_test, y=y_test)

In [None]:
# Parameter combination that scored best during the randomized search.
rs.best_params_

In [None]:
# Score of the refit best estimator on the held-out test split.
rs.score(X=X_test, y=y_test)

In [None]:
# Predicted labels for the test split, from the best estimator found by the search.
preds = rs.predict(X=X_test)

In [None]:
# Confusion matrix of the test-set predictions.
# `labels` pins the row/column order to rs.classes_ so it is guaranteed to match
# the display labels passed below.
cm = confusion_matrix(y_test, preds, labels=rs.classes_)

# Named `cm_display` rather than `display` so IPython's built-in display()
# function is not shadowed for the rest of the notebook.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                    display_labels=rs.classes_)

cm_display.plot();

In [None]:
# Basic ROC curve for the tuned model on the test split.

# Create an explicit figure/axes pair to draw on.
fig, ax = plt.subplots()

# `rs` wraps nb_pipe (TF-IDF + MultinomialNB), so the curve is labelled as such;
# it was previously mislabelled as logistic regression.
RocCurveDisplay.from_estimator(rs, X_test, y_test, ax=ax, name='multinomial naive bayes')

# Add the 'worst case scenario' diagonal as a reference line.
ax.plot([0, 1], [0, 1], label='null hypothesis/mean', linestyle='--', color='gray')

# Redraw the legend so the reference line is listed alongside the ROC curve.
ax.legend();