In [137]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, RocCurveDisplay)
from imblearn.metrics import specificity_score

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re



from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier



import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [138]:
df_selftext = pd.read_csv('../data/cleaned_selftext.csv')
df_notext = pd.read_csv('../data/cleaned_notext.csv')
df_selftext.head(2)

Unnamed: 0,subreddit,text
0,0,using electric water boiler to mine bitcoin th...
1,0,btc to usdt hello i am looking to swap a good ...


In [139]:
print(df_selftext.subreddit.value_counts())
df_notext.subreddit.value_counts()

0    4017
1     787
Name: subreddit, dtype: int64


0    9938
1    2059
Name: subreddit, dtype: int64

# Adding stop words to not make it too easy

In [140]:
stop = list(stopwords.words('english'))
stop.extend('btc eth bitcoin ethereum lightning vitalik metamask nft nfts'.split())


# TTS

In [141]:
X = df_selftext.text
y = df_selftext.subreddit

In [142]:
# baseline
y.value_counts(normalize=True)

0    0.836178
1    0.163822
Name: subreddit, dtype: float64

In [143]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

# Model 1: NB

In [144]:
# Pipeline accepts multiple transformers, but only one vectorizer. See cgpt results:

'''The error message is caused by the fact that you are trying to fit two different vectorizers (TfidfVectorizer and CountVectorizer) 
in the same pipeline, but only providing one input (X_train) to the pipeline. This is causing the pipeline to raise an error as it doesn't 
know which vectorizer to apply to the input. 

You can fix this by either removing one of the vectorizers or providing separate inputs to the pipeline for each vectorizer.'''


nb_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [145]:
nb_pipe_params = {
    'tvec__max_features': range(600,800,5),
    'tvec__stop_words': [stop],
    'tvec__ngram_range': [(1,1), (1,2), (1,3),(2,3),(3,3)],
    'nb__alpha': [.01, .05, .1, .25, .5, .1]
}

In [146]:
nb_rs = RandomizedSearchCV(nb_pipe, 
                  nb_pipe_params, 
                    cv = 5) 

In [147]:
nb_rs.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                             ('nb', MultinomialNB())]),
                   param_distributions={'nb__alpha': [0.01, 0.05, 0.1, 0.25,
                                                      0.5, 0.1],
                                        'tvec__max_features': range(600, 800, 5),
                                        'tvec__ngram_range': [(1, 1), (1, 2),
                                                              (1, 3), (2, 3),
                                                              (3, 3)],
                                        'tvec__stop_words': [['i', 'me', 'my',
                                                              'myself', 'we',
                                                              'our', 'ours',
                                                              'ourselves',
                                                              'you', 

# Model 2: LR

In [148]:
logr_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('logr', LogisticRegression())
])

In [149]:
logr_pipe_params = {
    'tvec__max_features': range(600,800,5),
    'tvec__stop_words': [stop],
    'tvec__ngram_range': [(1,1), (1,2), (1,3),(2,3),(3,3)],
    'logr__C': [.01, .1 ,.5 , 1.0, 2, 5, 10]
}

In [150]:
logr_rs = RandomizedSearchCV(logr_pipe, 
                  logr_pipe_params, 
                    cv = 5) 

In [None]:
logr_rs.fit(X_train, y_train)

In [129]:
print(logr_rs.score(X_train, y_train))
logr_rs.score(X_test, y_test)

0.9042464612822648


0.8834304746044963

# Model 3: knn

In [None]:
# mean = False here for memory error. see individual model for model with mean = True

knn = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('knn', KNeighborsClassifier())
])

In [None]:
knn_params = {
    'tvec__max_features': range(100,1000,100),
    'tvec__stop_words': [stop],
    'tvec__ngram_range': [(1,1), (1,2), (1,3),(2,3),(3,3)],
    'knn__n_neighbors': range(1,20)
    
}

In [None]:
knn_rs = RandomizedSearchCV(knn, knn_params, cv = 4)

knn_rs.fit(X_train, y_train)

In [None]:
print(knn_rs.score(X_train, y_train))
knn_rs.score(X_test, y_test)

# Model 4: RF Boosted

In [134]:
ada = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ada', AdaBoostClassifier(base_estimator = RandomForestClassifier()))
])

In [135]:
ada_params = {
    'tvec__max_features': range(650,750,5),
    'tvec__stop_words': [stop],
    'tvec__ngram_range': [(1,1), (1,2), (1,3),(2,3),(3,3)],
    'ada__n_estimators': range(100,200, 5)
}

In [136]:
ada_rf_rs = RandomizedSearchCV(ada, ada_params, cv = 4)

ada_rf_rs.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
print(ada_rf_rs.score(X_train, y_train))
ada_rf_rs.score(X_test, y_test)

# Model 5: Gradient boosting DT

In [None]:
gb_pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier())
])

In [None]:
gb_pipe_params = {
    'tvec__max_features': range(600,800,5),
    'tvec__stop_words': [stop],
    'tvec__ngram_range': [(1,1), (1,2), (1,3),(2,3),(3,3)],
    'gb__n_estimators': range(50,200,50),
    'gb__max_features': range(50, 80)
}

In [None]:
gb_rs = RandomizedSearchCV(gb_pipe, 
                  gb_pipe_params, 
                    cv = 5) 

In [None]:
gb_rs.fit(X_train, y_train)

In [None]:
print(gb_rs.score(X_train, y_train))
gb_rs.score(X_test, y_test)

# Model 6: Stack

In [None]:
stack_estimators = [
    ('nb_pipe', nb_rs.best_estimator_),
    ('ada_pipe', ada_rf_rs.best_estimator_),
    ('gb_pipe', gb_rs.best_estimator_),
    ('logr_pipe', logr_rs.best_estimator_)   
]

stack = StackingClassifier(estimators=stack_estimators, final_estimator=LogisticRegression())

cross_val_score(stack, X_train, y_train).mean()


In [None]:
stack.fit(X_train, y_train)

In [None]:
print(stack.score(X_train, y_train))
stack.score(X_test, y_test)

# Make list of models

In [None]:
models = [nb_rs, logr_rs, knn_rs, ada_rf_rs, gb_rs, stack]
modelsstr =  ['nb_rs', 'logr_rs', 'knn_rs', 'ada_rf_rs', 'gb_rs', 'stack']

# Generate predictions for models

In [None]:
def preds_gen(estimator):
    return estimator.predict(X_test)

preds_array = []

for i in models:
    preds_array.append(preds_gen(i))

In [None]:
def confusion_matrices(model, preds):
    
    cm = confusion_matrix(y_test, preds)

    display = ConfusionMatrixDisplay(confusion_matrix = cm,
                                 display_labels = model.classes_)
    display.plot();



# Generate confusion matrices

In [None]:
for i in preds_array:
    tn, fp, fn, tp = confusion_matrix(y_test, i).ravel()
    print(tn, fp, fn, tp)

In [None]:
d = []
for i in preds_array:
    tn, fp, fn, tp = confusion_matrix(y_test, i).ravel()
    
    d.append(
        {
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp
        }
    )

cf_df = pd.DataFrame(d)
cf_df['model'] = pd.Series(modelsstr)


In [None]:
cf_df['sensitivity'] = (cf_df['tp'] / (cf_df['tp'] + cf_df['fn']))*100
cf_df['specificity'] = (cf_df['tn'] / (cf_df['tn'] + cf_df['fp']))*100
cf_df['precision'] = (cf_df['tp'] / (cf_df['tp'] + cf_df['fp']))*100
cf_df['neg_predicitve_val'] = (cf_df['tn'] / (cf_df['tn'] + cf_df['fn']))*100
cf_df['accuracy'] = ((cf_df['tp'] + cf_df['tn']) / (cf_df['tp'] + cf_df['tn'] + cf_df['fp'] + cf_df['fn']))*100
cf_df['f1'] = (2*cf_df['tp'] /  (2*cf_df['tp'] + cf_df['fp'] + cf_df['fn']))*100

In [None]:
cf_df['test_score'] = [i.score(X_test, y_test) for i in models]
cf_df['fit'] = [((i.score(X_test, y_test) - i.score(X_train, y_train))/i.score(X_test, y_test))*100 for i in models]

In [None]:
cf_df

In [None]:
cols = [i for i in cf_df.columns if i not in ['tn', 'tp', 'fn', 'fp', 'model']]

In [None]:
cf_df[cols] = cf_df[cols].apply(lambda x: pd.Series.round(x, 2))
cf_df

In [None]:
cf_df[['model', 'sensitivity', 'fit', 'accuracy']].sort_values(by = 'accuracy', ascending = False).plot(x='model', kind='bar')
plt.title('key model performance metrics')
plt.tight_layout();
plt.savefig('../images/modelperf.png')

In [None]:
cf_df[['model', 'f1', 'fp', 'fn']].sort_values(by = 'f1', ascending = False).plot(x='model', kind='bar')
plt.title('model F1 and misclassification')
plt.tight_layout();
plt.savefig('../images/misclassification.png')

In [None]:
cf_df.set_index('model').sort_values(by = 'f1', ascending = False)

In [None]:
count = 0
for i,z in zip(models, preds_array):
    
    title = modelsstr[count]
    
    cm = confusion_matrix(y_test, z)
    
    disp = ConfusionMatrixDisplay.from_estimator(
        i,
        X_test,
        y_test,
        display_labels=i.classes_,
        cmap=plt.cm.Blues,
    )
    disp.ax_.set_title(title)
    count+=1
    print(disp.confusion_matrix)

In [None]:
#save for pres

cm = confusion_matrix(y_test, preds_array[-1])

disp = ConfusionMatrixDisplay.from_estimator(
    stack,
    X_test,
    y_test,
    display_labels=stack.classes_,
    cmap=plt.cm.Blues,
)
disp.ax_.set_title(title)
plt.savefig('../images/stack_confusion')
print(disp.confusion_matrix)

# Plot some ROCs

In [None]:
# notes
def ROC(model, name):

    ax = plt.gca()

    # use RocCurveDisplay for both estimators
    RocCurveDisplay.from_estimator(model, X_test, y_test, ax=ax, name=name)

    # add 'worst case scenario' line
    #plt.plot([0,1], [0,1], label='null hypothesis/mean', linestyle='--', color='gray')

    # necessary to label the baseline
    plt.legend();

In [None]:
for i in zip(models, modelsstr):
    print(ROC(*i))
    #plt.show();
    #plt.savefig('../images/ROC')

# Preds analysis

In [None]:
def pred_dfs(model):

    pred_df = pd.DataFrame(model.predict_proba(X_test),columns=['bitcoin', 'ethereum'])

    pred_df['true_values'] = y_test.values

    return pred_df

In [None]:
nb_preds, logr_preds, knn_preds, ada_preds, gb_preds, stack_preds = [pred_dfs(i) for i in models]

pred_arrays = [nb_preds, logr_preds, knn_preds, ada_preds, gb_preds, stack_preds]

In [None]:
#notes
def class_from_prob(probabilities, threshold):

    return [0 if prob < threshold else 1 for prob in probabilities]


In [None]:
def gen_tables(models, modelsstr):
    counter = 0
    for i in pred_arrays:
        
        threshold_list = [round(i*.01,2) for i in range(0,101,)]
        speclist = []
        senslist = []
        f1list = []

        for threshold in threshold_list:
            predicted_classes = class_from_prob(i['ethereum'], threshold)
            spec = specificity_score(y_test, predicted_classes)
            sens = recall_score(y_test, predicted_classes, pos_label=1)
            speclist.append(spec)
            senslist.append(sens)
            F1 = f1_score(y_test, predicted_classes)
            f1list.append(F1)
        
        
        %matplotlib inline
        fig = plt.figure(figsize = (10,5));
        ax1 = fig.add_subplot(111);

        ax1.scatter(x = threshold_list, y = speclist, s=10, c='b', marker="s", label='specificity');
        ax1.scatter(x = threshold_list,y = senslist, s=10, c='r', marker="o", label='sensitivity');
        ax1.scatter(x = threshold_list,y = f1list, s=10, c='g', marker="o", label='F1 score');
        plt.legend(loc='upper left')
        plt.xlabel('Threshold');
        plt.ylabel('Score Value');
        plt.title(f'{modelsstr[counter]}')
        counter +=1 
        plt.show();
        plt.tight_layout();

In [None]:
gen_tables(models, modelsstr)

# More details (Stack estimator)

In [None]:
pred_df = pd.DataFrame(stack.predict_proba(X_test),columns=['bitcoin', 'ethereum'])

pred_df['true_values'] = y_test.values

pred_df.head()


In [None]:
df_selftext = df_selftext.join(pred_df)

In [None]:
df_selftext.sort_values(by = 'bitcoin')[0:100]

In [None]:
threshold_list = [round(i*.01,2) for i in range(0,101,)]
speclist = []
senslist = []
f1list = []

for threshold in threshold_list:
    predicted_classes = class_from_prob(pred_df['ethereum'], threshold)
    spec = specificity_score(y_test, predicted_classes)
    sens = recall_score(y_test, predicted_classes, pos_label=1)
    speclist.append(spec)
    senslist.append(sens)
    F1 = f1_score(y_test, predicted_classes)
    f1list.append(F1)
    
    
%matplotlib inline
fig = plt.figure(figsize = (10,5));
ax1 = fig.add_subplot(111);

ax1.scatter(x = threshold_list, y = speclist, s=10, c='b', marker="s", label='specificity');
ax1.scatter(x = threshold_list,y = senslist, s=10, c='r', marker="o", label='sensitivity');
ax1.scatter(x = threshold_list,y = f1list, s=10, c='g', marker="o", label='F1 score');
plt.legend(loc='upper left')
plt.xlabel('Threshold');
plt.ylabel('Score Value');
plt.tight_layout();
plt.savefig('../images/threshold')

# False negatives

In [None]:
df_selftext[(df_selftext.true_values == 1) & (df_selftext.bitcoin > .5)].sort_values(by = 'bitcoin', ascending = False)

In [None]:
# uncomment for df
for i in df_selftext[(df_selftext.true_values == 1) & (df_selftext.bitcoin > .5)]['text']:
    print(i, '\n')

# False Positives

In [None]:
FP = pred_df.loc[(pred_df['ethereum'] > .5) & (pred_df.true_values == 0)].index.values.astype(int)

In [None]:
for i in df_selftext.iloc[FP].text:
    print(i,'\n')