In [None]:
# Load the dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Read spam.csv (path is relative to the notebook's working directory) into a DataFrame.
spam = pd.read_csv("../input/spam-email/spam.csv")

In [None]:
# Explore the data: display the first rows of the frame
spam.head()

In [None]:
# Class balance of the 'Category' column: horizontal bars (left panel) and a pie chart (right panel).
recuento = spam['Category'].value_counts()
fig, (ax_bars, ax_pie) = plt.subplots(1, 2, figsize=(12, 5))
ax_bars.barh(recuento.index, recuento.values, color=['blue', 'orange'])
ax_pie.pie(recuento.values, labels=recuento.index, autopct='%.2f%%')
plt.show()

In [None]:
# Stratified hold-out split (test_size=0.2) that keeps the 'Category' proportions in both parts.
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields a single (train, test) pair of positional indices.
train_index, test_index = next(split.split(spam, spam['Category']))
print(train_index, test_index)
strat_train_set = spam.iloc[train_index]
strat_test_set = spam.iloc[test_index]

In [None]:
# Compare the class proportions of the full data, the training split and the test split.
pct_total = spam['Category'].value_counts() / len(spam)
pct_train = strat_train_set['Category'].value_counts() / len(strat_train_set)
pct_test = strat_test_set['Category'].value_counts() / len(strat_test_set)

# NOTE(review): the lookups below are positional (0 -> labelled 'ham', 1 -> labelled 'spam');
# this presumes value_counts() lists 'ham' first in every subset — confirm against the data.
shares = (pct_total, pct_train, pct_test)
spam_pct = [share.values[1] for share in shares]
ham_pct = [share.values[0] for share in shares]

cat = ['spam', 'ham']
tipo = ['total', 'train', 'test']
# Long-format table: one row per (class, subset) pair, spam rows first.
dic = {
    'spam/ham': ['spam'] * 3 + ['ham'] * 3,
    'type': tipo * 2,
    'pct': spam_pct + ham_pct,
}
df = pd.DataFrame(dic)


import seaborn as sns
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='type', y='pct', hue='spam/ham')
plt.show()

In [None]:
# Preparing data for algorithms
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Text branch: token counts from CountVectorizer, limited by max_features=5000.
pipeline_text = Pipeline([
    ('text_extraction', CountVectorizer(max_features=5000)),
])

# 'Category' is ordinal-encoded (first output column); 'Message' goes through the text branch.
full_pipeline = ColumnTransformer([
    ('encoder', OrdinalEncoder(), ['Category']),
    ('text_extraction', pipeline_text, 'Message'),
])

In [None]:
# Preparing train_set: fit the column transformer on the training split and transform it in one step.
train_set_prepared = full_pipeline.fit_transform(strat_train_set)

In [None]:
# Column 0 is produced by the 'encoder' step (listed first in full_pipeline): the encoded label.
# All remaining columns are the text-extraction features.
train_set_prepared_labels = train_set_prepared[:, 0].copy()
train_set_prepared_predictors = train_set_prepared[:, 1:].copy()
# Row counts come straight from .shape; densifying both matrices just to call len() is wasteful.
train_set_prepared_labels.shape[0], train_set_prepared_predictors.shape[0]

In [None]:
# Evaluating different models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# The estimators that draw random numbers during fit are seeded with the same value as the
# stratified split (42), so cross-validation scores are reproducible across kernel restarts.
forest_clf = RandomForestClassifier(random_state=42)
sgd_clf = SGDClassifier(random_state=42)
svc_clf = SVC()
kn_clf = KNeighborsClassifier()
nb_clf = GaussianNB()
lg_clf = LogisticRegression()

In [None]:
from sklearn.model_selection import cross_val_score

# Explicit name -> estimator mapping; replaces eval() on variable-name strings.
estimators = {
    'lg_clf': lg_clf,
    'nb_clf': nb_clf,
    'forest_clf': forest_clf,
    'sgd_clf': sgd_clf,
    'svc_clf': svc_clf,
    'kn_clf': kn_clf,
}
# Kept as a list of names in the same order as before; later plotting cells iterate over it.
models = list(estimators)
dic_models_scores = {}

# Densify once up front; the loop previously rebuilt both dense arrays for every model.
X_train_dense = train_set_prepared_predictors.toarray()
y_train = train_set_prepared_labels.toarray().ravel()

for model in models:
    scores = cross_val_score(estimators[model], X_train_dense, y_train,
                             cv=5, scoring="accuracy")

    # Per-fold scores under the model name, summary statistics under suffixed keys.
    dic_models_scores[model] = list(scores)
    dic_models_scores[f'{model}_means'] = scores.mean()
    dic_models_scores[f'{model}_std'] = scores.std()

In [None]:
# Display the collected scores: per-fold lists plus '<model>_means' and '<model>_std' entries.
dic_models_scores

In [None]:
# Mean cross-validated accuracy of each model as a bar chart.
plt.figure(figsize=(8,5))
plt.title('Precisión de los modelos')
plt.grid(axis = 'y')
plt.yticks([0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 1.0])
# Look the means up from `models` so the bars cannot drift out of order with the labels.
mean_scores = [dic_models_scores[f'{model}_means'] for model in models]
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.barplot(x=models, y=mean_scores)

In [None]:
# One line per model: its accuracy on each of the 5 cross-validation folds.
plt.figure(figsize=(10, 5))
folds = range(1, 6)
for name in models:
    plt.plot(folds, dic_models_scores[name], label=name)

plt.title('5 cross-validation with 6 models')
plt.legend()
plt.show()

In [None]:
# Inspect the fitted vocabulary and the label encoding
vectorizer = full_pipeline.named_transformers_['text_extraction']['text_extraction']
# Newer scikit-learn exposes get_feature_names_out() and no longer ships get_feature_names();
# use whichever the installed version provides so the cell runs on both.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
voc = vectorizer.vocabulary_
# Evaluated but not displayed (only the last expression of a cell is shown).
full_pipeline.named_transformers_['encoder'].categories_
# .shape works on the sparse matrix directly; no dense copy is needed to count columns.
len(words), train_set_prepared_predictors.shape[1] # Words == Columns

In [None]:
# Fine-tune SGD: exhaustive search over penalty x loss with 5-fold CV, scored by accuracy.
from sklearn.model_selection import GridSearchCV

# NOTE(review): newer scikit-learn releases spell the logistic loss 'log_loss' instead of 'log' —
# confirm against the installed version before re-running.
param_grid = [{'penalty' : ['l2', 'l1', 'elasticnet'],
               'loss' : ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'], 'max_iter': [2000]}]

grid_search = GridSearchCV(sgd_clf, param_grid, cv = 5, scoring = "accuracy")
# The search is fitted on dense copies of the training predictors and labels.
grid_search.fit(train_set_prepared_predictors.toarray(), train_set_prepared_labels.toarray().ravel())

In [None]:
# Fine-tune lg: search over C and solver for the logistic regression, 5-fold CV, scored by accuracy.
from sklearn.model_selection import GridSearchCV

# max_iter is fixed to a single value (200), so it is not actually searched.
param_grid2 = [{'C' : [0.8, 1, 1.2],
               'solver': ['lbfgs', 'liblinear'],
               'max_iter': [200]}]

grid_search2 = GridSearchCV(lg_clf, param_grid2, cv = 5, scoring = "accuracy")
# The search is fitted on dense copies of the training predictors and labels.
grid_search2.fit(train_set_prepared_predictors.toarray(), train_set_prepared_labels.toarray().ravel())

In [None]:
# Fine-tune random forest: 5-fold CV grid search scored by accuracy.
from sklearn.model_selection import GridSearchCV

# 'auto' was removed from max_features: for a forest classifier it is an alias of 'sqrt'
# (so it only repeated identical fits) and recent scikit-learn releases reject it outright.
param_grid3 = [{'n_estimators' : [95,100,150], # much time
                'criterion' : ['gini', 'entropy'],
                'max_features' : ['sqrt', 'log2']}]


grid_search3 = GridSearchCV(forest_clf, param_grid3, cv = 5, scoring = "accuracy")
grid_search3.fit(train_set_prepared_predictors.toarray(), train_set_prepared_labels.toarray().ravel())

In [None]:
# Which words are most important to the model's predictions
import sys
# IPython shell escape: install wordcloud with the interpreter that runs this kernel.
!{sys.executable} -m pip install wordcloud
from wordcloud import WordCloud
# (importance, word) pairs from grid_search3's best estimator, sorted in descending order.
feature_importance = sorted(zip(grid_search3.best_estimator_.feature_importances_, words), reverse = True)

In [None]:
# Word -> importance mapping fed to the word cloud.
# Built directly from the (importance, word) pairs: the previous detour through
# dict(feature_importance) keyed entries by importance, so words sharing the same
# importance value overwrote one another and were silently dropped.
new_dict = {word: importance for importance, word in feature_importance}

word_cloud = WordCloud(collocations = False, background_color = 'white').fit_words(new_dict)
# Saved to disk so the following markdown cell can embed the image.
word_cloud.to_file('Importance.jpg')

<img src='Importance.jpg' width="600" height="400">

In [None]:
# Bar chart of the first 20 (importance, word) pairs: words on the x axis, importances as heights.
x = [word for _, word in feature_importance]
y = [importance for importance, _ in feature_importance]
plt.figure(figsize=(15, 5))
plt.bar(x[:20], y[:20])

In [None]:
# Final models: the best estimator found by each of the three grid searches
forest_final = grid_search3.best_estimator_  # random forest search
sgd_final = grid_search.best_estimator_      # SGD search
lg_final = grid_search2.best_estimator_      # logistic regression search

In [None]:
# other metrics for models: cross-validated class predictions of the three tuned estimators
from sklearn.model_selection import cross_val_predict

# Explicit name -> estimator mapping; replaces eval() on variable-name strings.
final_estimators = {'sgd_final': sgd_final, 'lg_final': lg_final, 'forest_final': forest_final}

# Densify once; the loop previously rebuilt both dense arrays for every model.
X_train_dense = train_set_prepared_predictors.toarray()
y_train = train_set_prepared_labels.toarray().ravel()

dic_preds = {}
for model, estimator in final_estimators.items():
    dic_preds[model] = cross_val_predict(estimator, X_train_dense, y_train, cv=3)

In [None]:
from sklearn.metrics import confusion_matrix

# One confusion matrix per model: training labels against its cross-validated predictions.
dic_cm = {
    name: confusion_matrix(train_set_prepared_labels.toarray().ravel(), y_pred)
    for name, y_pred in dic_preds.items()
}

# Print each matrix under a '<model> cm' header.
for model, matrix in dic_cm.items():
    print(f'{model} cm')
    print(matrix)
    print('\n')

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Precision, recall and F1 (in that order) of every model's cross-validated predictions.
metric_fns = (precision_score, recall_score, f1_score)
dic_scores = {
    model: [metric(train_set_prepared_labels.toarray().ravel(), dic_preds[model])
            for metric in metric_fns]
    for model in dic_cm
}

# Rows are the metrics, columns are the models; shown both as a grayscale image and as a table.
df = pd.DataFrame(dic_scores, index=['Precision', 'Recall', 'F1 score'])
plt.matshow(df, cmap=plt.cm.gray)
df

In [None]:
# Precision, recall and thresholds: collect a continuous score per training sample and model
from sklearn.metrics import precision_recall_curve

# Explicit name -> estimator mapping; replaces eval() on variable-name strings.
final_estimators = {'sgd_final': sgd_final, 'lg_final': lg_final, 'forest_final': forest_final}

# Densify once instead of on every cross_val_predict call.
X_train_dense = train_set_prepared_predictors.toarray()
y_train = train_set_prepared_labels.toarray().ravel()

dic_decision = {}
for model, estimator in final_estimators.items():
    # Pick the scoring method from what the estimator offers. The previous bare `except:`
    # fell back to predict_proba on *any* failure, hiding unrelated errors.
    if hasattr(estimator, 'decision_function'):
        method = 'decision_function'
    else:
        print(model, 'usa predict_proba')
        method = 'predict_proba'
    decisions = cross_val_predict(estimator, X_train_dense, y_train, cv=3, method=method)
    dic_decision[model] = decisions

In [None]:
# Display the stored decision scores / class probabilities for each final model.
dic_decision

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall against the decision threshold on the current axes.

    The last element of `precisions` and of `recalls` is dropped before plotting,
    so both are expected to be one element longer than `thresholds`.
    """
    curves = (
        (precisions, 'b--', 'precision'),
        (recalls, 'g-', 'recall'),
    )
    for values, line_format, legend_name in curves:
        plt.plot(thresholds, values[:-1], line_format, label=legend_name)
    plt.grid(axis='both')
    
# One figure per final model: precision, recall and F1 as functions of the decision threshold.
for model in ['sgd_final', 'lg_final', 'forest_final']:
    decisions = dic_decision[model]
    # Probability output is 2-D; keep column 1 as the score. Testing the shape replaces
    # `model is 'forest_final'`, an identity comparison against a string literal whose
    # result is not guaranteed (and which raises a SyntaxWarning on current Python).
    if np.ndim(decisions) == 2:
        decisions = decisions[:, 1]

    precisions, recalls, thresholds = precision_recall_curve(train_set_prepared_labels.toarray().ravel(), decisions)

    # F1 at every point of the curve; set to 0 where precision + recall == 0 instead of 0/0 -> nan.
    denominator = precisions + recalls
    f1score = np.divide(2 * precisions * recalls, denominator,
                        out=np.zeros_like(denominator, dtype=float), where=denominator > 0)

    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    # Like precision and recall, F1 has one more entry than thresholds; drop the last one.
    plt.plot(thresholds, f1score[:-1], 'r-', label = "f1-score")
    plt.title(f'{model}, recall vs precision vs f1_score')
    plt.legend()
    plt.show()
    

In [None]:
 # Display the precision, recall and threshold arrays currently bound in the kernel.
 precisions, recalls, thresholds

In [None]:
def roc_curves(fpr, tpr, thresholds, model, area):
    """Plot one ROC curve plus the diagonal reference line on the current axes.

    `model` and `area` only appear in the legend label; `thresholds` is accepted
    but not used by the plot.
    """
    legend_label = f'{model} --> {area}'
    diagonal = [0, 1]
    plt.plot(fpr, tpr, label=legend_label)
    plt.plot(diagonal, diagonal)
    plt.grid(axis='both')

In [None]:
# ROC curves of the three final models on one figure, with the AUC in each legend entry
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Explicit name -> estimator mapping; replaces eval() on variable-name strings.
final_estimators = {'sgd_final': sgd_final, 'lg_final': lg_final, 'forest_final': forest_final}

# Densify once instead of on every call inside the loop.
X_train_dense = train_set_prepared_predictors.toarray()
y_train = train_set_prepared_labels.toarray().ravel()

plt.figure(figsize=(18,6))
for model, estimator in final_estimators.items():
    # Choose the scoring method from the estimator's capabilities rather than from its name.
    method = 'decision_function' if hasattr(estimator, 'decision_function') else 'predict_proba'
    y_scores = cross_val_predict(estimator, X_train_dense, y_train, cv=3, method=method)

    # Probability output is 2-D; column 1 is used as the score, as before.
    if y_scores.ndim == 2:
        y_scores = y_scores[:, 1]

    fpr, tpr, thresholds = roc_curve(y_train, y_scores)
    area = roc_auc_score(y_train, y_scores)
    roc_curves(fpr, tpr, thresholds, model, area)

# The title is the same for every curve, so it is set once after the loop.
plt.title('roc curve')
plt.legend()
plt.show()

In [None]:
# Display the false-positive rates (rounded to 3 decimals), true-positive rates and thresholds currently bound.
np.round(fpr, 3), tpr, thresholds

In [None]:
# More recall? --> less precision
# Recompute the forest's precision/recall curve in this cell. By now `thresholds` has been
# rebound to the output of roc_curve, which does not line up with `recalls`, so indexing it
# with a position found in `recalls` selected an unrelated value.
forest_scores = dic_decision['forest_final']
if np.ndim(forest_scores) == 2:
    forest_scores = forest_scores[:, 1]
precisions, recalls, thresholds = precision_recall_curve(
    train_set_prepared_labels.toarray().ravel(), forest_scores)

# First position whose recall is <= 0.9, mapped to the threshold at the same position.
thresholds_90recall = thresholds[np.argmax(recalls <= 0.9)]
recalls

In [None]:
# Display the threshold selected for the 0.9-recall operating point.
thresholds_90recall # With that probability, we will obtain 0.9 recall

In [None]:
# Reuse the cross-validated probabilities of the tuned forest (forest_final) that the
# precision/recall curve was built from. The cell previously re-ran cross_val_predict on the
# untuned forest_clf, i.e. a different model from the one the threshold belongs to.
decisions = dic_decision['forest_final']

In [None]:
# Force to 0.9 recall: classify as positive every sample whose column-1 score reaches the threshold
y_train_true = train_set_prepared_labels.toarray().ravel()
preds = decisions[:, 1] >= thresholds_90recall
preds_ = [int(flag) for flag in preds]
precision_score(y_train_true, preds_), recall_score(y_train_true, preds_)

In [None]:
# Predictions with test set
# Transform (no re-fit) the hold-out split with the fitted pipeline and wrap the dense result in a DataFrame.
test_set_prepared = pd.DataFrame(full_pipeline.transform(strat_test_set).toarray())

# Column 0 holds the encoded label; every other column is a predictor.
test_set_prepared_labels = test_set_prepared.iloc[:, 0]
test_set_prepared_pred = test_set_prepared.drop(columns=0)

In [None]:
# For each final model store its test-set predictions under '<model>' and
# its [precision, recall, f1] under '<model> scores'.
final_estimators = {'sgd_final': sgd_final, 'lg_final': lg_final, 'forest_final': forest_final}

dic_final_pred = {}
for model, estimator in final_estimators.items():
    pred = estimator.predict(test_set_prepared_pred)
    dic_final_pred[model] = pred
    p = precision_score(test_set_prepared_labels, pred)
    r = recall_score(test_set_prepared_labels, pred)
    f = f1_score(test_set_prepared_labels, pred)
    scores = [p, r, f]
    dic_final_pred[f'{model} scores'] = scores

In [None]:
# Display the test-set predictions and the score lists collected per model.
dic_final_pred

In [None]:
def combinating_model_predictions(dic, pred_true):
    """Combine the three final classifiers by voting and score the combined prediction.

    Parameters
    ----------
    dic : mapping
        Must contain, under the keys 'sgd_final', 'lg_final' and 'forest_final',
        equally long sequences of per-sample predictions.
    pred_true : sequence
        True labels; passed as the first argument to the metric functions.

    Returns
    -------
    tuple
        (pred_final, [precision, recall, f1]), where pred_final is the list of
        combined per-sample predictions.
    """
    preds = [list(dic[model]) for model in ['sgd_final', 'lg_final', 'forest_final']]

    pred_final = []
    for pred1, pred2, pred3 in zip(*preds):
        # The first model's vote stands when at least one other model agrees with it;
        # otherwise the second model's vote is taken (with two classes it then equals
        # the third model's vote, i.e. the majority).
        if pred1 == pred2 or pred1 == pred3:
            pred_final.append(pred1)
        else:
            pred_final.append(pred2)

    p = precision_score(pred_true, pred_final)
    r = recall_score(pred_true, pred_final)
    f = f1_score(pred_true, pred_final)

    # Return the combined vote. The function used to return `pred`, which is not defined
    # here, so callers received whatever a global of that name happened to hold.
    return pred_final, [p, r, f]

In [None]:
# Combine the three models' test-set predictions and score them against the test labels.
pred, scores = combinating_model_predictions(dic_final_pred, test_set_prepared_labels)
scores # Same as lg_final

In [None]:
# Some individual predictions with the 3 classifiers: start with the last 10 raw test rows
strat_test_set[-10:]

In [None]:
# The same last 10 rows after the pipeline transformation (column 0 is the encoded label).
test_set_prepared[-10:]

In [None]:
# SGD model's predictions for the last 10 test rows.
sgd_final.predict(test_set_prepared_pred[-10:])

In [None]:
# Logistic regression's predictions for the last 10 test rows.
lg_final.predict(test_set_prepared_pred[-10:])

In [None]:
# Random forest's predictions for the last 10 test rows.
forest_final.predict(test_set_prepared_pred[-10:])

In [None]:
# Random forest's class probabilities for the same 10 rows.
forest_final.predict_proba(test_set_prepared_pred[-10:])

In [None]:
# More predictions on hand-picked rows
# Rebind instead of inplace=True: strat_test_set was sliced out of `spam`, and an in-place
# reset mutates that slice and makes the cell non-idempotent. `drop` now receives a real
# boolean; the string 'True' only worked because non-empty strings are truthy.
strat_test_set = strat_test_set.reset_index(drop=True)
filas = [3,12,24,36,40,44,83] # spam 3, 23, 35, 40, 44, 65, 67, 83, 89, 94
strat_test_set.iloc[filas]

In [None]:
# Print the text of every selected row, each followed by blank lines.
for message in strat_test_set.iloc[filas]['Message']:
    print(message)
    print('\n')

In [None]:
 # Prepared predictor rows at the same positions as the messages selected in `filas`.
 prep = test_set_prepared_pred.iloc[filas]

In [None]:
# Random forest's predictions for the selected rows.
forest_final.predict(prep)

In [None]:
# Random forest's class probabilities for the selected rows, as a table with labelled columns.
pd.DataFrame(forest_final.predict_proba(prep), columns = ['no spam', 'spam'])