In [None]:
import numpy as np
import pandas as pd 
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import requests
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV


# List every file available under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Load the SMS spam collection CSV from the Kaggle input directory.
# The file is decoded as latin-1 (NOTE(review): presumably because it is not valid UTF-8 - confirm).
dataset = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv", encoding = "latin-1")

In [None]:
# Preview the first rows of the freshly loaded data.
dataset.head()

In [None]:
# Remove the three "Unnamed" columns, which are not used anywhere in the analysis.
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
dataset = dataset.drop(columns=unused_columns)

In [None]:
# Preview the data after dropping the unused columns.
dataset.head()

In [None]:
# Give the two remaining columns readable, meaningful names:
# "v1" holds the label and "v2" holds the message text.
readable_names = {"v1": "target", "v2": "sms"}
dataset = dataset.rename(columns=readable_names)

In [None]:
# Preview the data with the renamed columns.
dataset.head()

In [None]:
# Number of characters in each raw message; later injected into the feature matrix.
dataset["length"] = dataset["sms"].str.len()

In [None]:
# Preview the data with the new "length" column.
dataset.head()

In [None]:
# Class balance: bar chart of the label counts, plus the exact numbers as text.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x="target", data=dataset, ax=ax)
print(dataset["target"].value_counts())

In [None]:
# Spam messages tend to be longer than ham messages.
# sns.distplot is deprecated, so the equivalent density histogram + KDE overlay
# is drawn with sns.histplot. The x-axis is clipped to 0-200 characters.
fig, ax = plt.subplots(figsize=(15, 7))
for label in ("ham", "spam"):
    sns.histplot(
        dataset.loc[dataset["target"] == label, "length"],
        bins=100, kde=True, stat="density", alpha=0.4,
        label=label.capitalize(), ax=ax,
    )
ax.set_xlim(0, 200)
ax.legend()  # labels are only shown once a legend is requested explicitly

In [None]:
# Work on a copy so the original `dataset` (with the raw message texts) stays untouched.
manip_dataset = dataset.copy()

In [None]:
# Preview the working copy.
manip_dataset.head()

In [None]:
import re
import nltk
nltk.download('stopwords') #download non relevant words
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer # Stemming is taking the root of every word (containing what it means)

# Build the loop-invariant helpers once instead of once per message (the stop-word
# set was previously rebuilt for every single word).
stemmer = SnowballStemmer("english")
all_stopwords = stopwords.words("english")
stop_words = set(all_stopwords)          # set gives O(1) membership tests
non_letters = re.compile("[^a-zA-Z]")    # anything that is NOT a-z / A-Z

sms = [] # will contain all the different sms cleaned
for raw_text in manip_dataset["sms"]:
    # Replace non-letters by spaces, lowercase, and split into words.
    words = non_letters.sub(" ", raw_text).lower().split()
    # Keep only the words that are not stop words, and stem them.
    stemmed = [stemmer.stem(word) for word in words if word not in stop_words]
    # Re-join the stemmed words with single spaces and store the cleaned message.
    sms.append(' '.join(stemmed))


In [None]:
# Show the first 5 cleaned (stop words removed, stemmed) messages.
sms[:5]

In [None]:
# Replace the raw texts with the cleaned ones in a single column assignment.
# The former element-wise `manip_dataset["sms"][i] = ...` was chained assignment:
# it triggers SettingWithCopyWarning and does not modify the frame at all when
# pandas copy-on-write is enabled.
manip_dataset["sms"] = sms

In [None]:
# Preview the working copy, whose "sms" column now holds the cleaned texts.
manip_dataset.head()

In [None]:
# Character count of each cleaned message, to compare against the raw "length".
manip_dataset["after_length"] = manip_dataset["sms"].str.len()

In [None]:
# Preview the working copy with the new "after_length" column.
manip_dataset.head()

In [None]:
# Compare per-class length distributions after cleaning (left) and before cleaning (right).
# Author's observation: the classes are more distinct on the original length, so
# the after_length attribute is not used.
# sns.distplot is deprecated; sns.histplot draws the same density histogram + KDE.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for axis, column in zip(ax, ("after_length", "length")):
    for label in ("ham", "spam"):
        sns.histplot(
            manip_dataset.loc[manip_dataset["target"] == label, column],
            bins=100, kde=True, stat="density", alpha=0.4,
            label=label.capitalize(), ax=axis,
        )
    axis.set_xlim(0, 200)
    axis.set_title(column)
    axis.legend()

In [None]:
# "after_length" was only needed for the comparison plot; remove it again.
manip_dataset = manip_dataset.drop(columns = ["after_length"])

In [None]:
# Preview the working copy without "after_length".
manip_dataset.head()

In [None]:
# Reorder the columns so the two features come first and the label last.
manip_dataset = manip_dataset.reindex(columns = ["sms", "length", "target"])

In [None]:
# Preview the reordered columns.
manip_dataset.head()

In [None]:
# Encode the string labels as integers.
# The following cells select spam with target == 1 and ham with target == 0.
target_encoder = LabelEncoder()
manip_dataset["target"] = target_encoder.fit_transform(manip_dataset["target"])
manip_dataset.head()

In [None]:
# Concatenate the stemmed words of every spam message (target == 1) into one
# space-separated string for the word cloud; each message is followed by a space.
spam_messages = manip_dataset.loc[manip_dataset["target"] == 1, "sms"]
spam_words = "".join(" ".join(str(message).split()) + " " for message in spam_messages)

In [None]:
# Concatenate the stemmed words of every ham message (target == 0) into one
# space-separated string for the word cloud; each message is followed by a space.
ham_messages = manip_dataset.loc[manip_dataset["target"] == 0, "sms"]
ham_words = "".join(" ".join(str(message).split()) + " " for message in ham_messages)

In [None]:
# Download the picture used as the word-cloud mask and build the spam word cloud.
from io import BytesIO

# A timeout keeps the notebook from hanging forever on a dead server, and
# raise_for_status() fails loudly instead of handing an HTML error page to PIL.
mask_response = requests.get('http://www.clker.com/cliparts/O/i/x/Y/q/P/yellow-house-hi.png', timeout=30)
mask_response.raise_for_status()
# Read from the fully downloaded body rather than the undecoded raw stream.
pic = np.array(Image.open(BytesIO(mask_response.content)))
spam_wordcloud = WordCloud(width = 800, height = 800,
                      background_color ='white', mask = pic,
                      min_font_size = 10).generate(spam_words)

In [None]:
# Build the ham word cloud: white background, `pic` as mask, minimum font size 10.
ham_cloud_options = {
    "width": 800,
    "height": 800,
    "background_color": 'white',
    "mask": pic,
    "min_font_size": 10,
}
ham_wordcloud = WordCloud(**ham_cloud_options).generate(ham_words)

In [None]:
# Show the word cloud of the most frequent stemmed words in spam messages.
spam_fig, spam_ax = plt.subplots(figsize=(8, 8), facecolor='white', edgecolor='blue')
spam_ax.imshow(spam_wordcloud)
spam_ax.axis("off")          # the cloud is an image; ticks add nothing
spam_fig.tight_layout(pad=0)

plt.show()

In [None]:
# Show the word cloud of the most frequent stemmed words in ham messages.
ham_fig, ham_ax = plt.subplots(figsize=(8, 8), facecolor='white', edgecolor='blue')
ham_ax.imshow(ham_wordcloud)
ham_ax.axis("off")           # the cloud is an image; ticks add nothing
ham_fig.tight_layout(pad=0)

plt.show()

#### Since I'm new to word clouds and hadn't built one before, I adapted the implementation from the article below. Check it out if you're interested!
https://medium.com/@harinisureshla/wordclouds-basics-of-nlp-5b60be226414

In [None]:
# Separate the encoded label from the remaining feature columns.
y = manip_dataset["target"]
X = manip_dataset.drop("target", axis=1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer capped at 4001 features. The author's intent is 4000 word
# features plus one extra slot, because a later cell stores the message length in the
# last column of the feature matrix.
# NOTE(review): the vectorizer itself fills every column with a vocabulary term, so the
# cap alone does not leave a spare column - confirm against the length-injection cell.
cv = CountVectorizer(max_features = 4001)

In [None]:
# Fit the vectorizer on the cleaned messages and build the dense bag-of-words
# matrix (one row per message, one column per vocabulary term).
X_sms = cv.fit_transform(sms).toarray()

In [None]:
# Add the raw message length as the last feature column.
# Previously the length was written INTO the last bag-of-words column, which
# silently threw away the counts of a real vocabulary term; it is appended instead.
n_vocab = len(cv.vocabulary_)  # number of bag-of-words columns from the fitted vectorizer
# Slicing to the first n_vocab columns keeps this cell idempotent: re-running it
# replaces the length column rather than appending a second one.
X_sms = np.column_stack([X_sms[:, :n_vocab], X["length"].to_numpy()])

In [None]:
# Display the first five feature vectors (the last entry of each row is the message length).
X_sms[:5]

In [None]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_sms,
    y,
    random_state=42,
    test_size=0.20,
)

In [None]:
# Function to print metrics
def print_metrics(y_test,y_pred):
    """Print the standard classification metrics for a set of predictions.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, aligned with ``y_test``.

    Prints, in order: confusion matrix, accuracy, precision, recall and F1 score.
    Returns nothing.
    """
    reports = (
        ("The confusion matrix is : \n", confusion_matrix),
        ("The accuracy score is : \n", accuracy_score),
        ("The precision is : \n", precision_score),
        ("The recall is : \n", recall_score),
        ("The f1 score is : \n", f1_score),
    )
    # Each metric is computed right before it is printed, one after another.
    for heading, metric in reports:
        print(heading, metric(y_test, y_pred), "\n")

In [None]:
# Create function running models and printing scores
def run_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its metrics on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for the printed metrics.

    Returns nothing; the scores are printed via ``print_metrics``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print_metrics(y_test, predictions)

In [None]:
# Create function running randomized search on model given in the parameters
def run_grid_model(model, grid, X_train, y_train, X_test, y_test,
                   n_iter=100, cv=2, random_state=42):
    """Tune ``model`` with a randomized hyper-parameter search and print test metrics.

    Parameters
    ----------
    model : estimator to tune.
    grid : dict mapping parameter names to the candidate values to sample from.
    X_train, y_train : data the search is cross-validated and fitted on.
    X_test, y_test : held-out data used for the printed metrics.
    n_iter : number of parameter settings sampled (default 100, the previous fixed value).
    cv : number of cross-validation folds (default 2, the previous fixed value).
    random_state : seed for the parameter sampling (default 42, the previous fixed value).

    Prints the best parameter combination found, then the metrics of the search's
    predictions on the test split. Returns nothing.
    """
    search = RandomizedSearchCV(estimator = model, param_distributions = grid, n_iter = n_iter, cv = cv, verbose=2, random_state=random_state, n_jobs = -1)
    # Fit the random search model
    search.fit(X_train, y_train)
    # With the default cv=2 this prints exactly the former "CV-2" message.
    print(f"Best params of the randomized CV-{cv} model is : {search.best_params_} \n-----------------------------------")
    y_pred = search.predict(X_test)
    print_metrics(y_test,y_pred)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings.
nb_model = GaussianNB()
run_model(nb_model, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with default settings and a fixed seed.
ada_model = AdaBoostClassifier(random_state=42)
run_model(ada_model, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with default settings and a fixed seed.
gb_model = GradientBoostingClassifier(random_state=42)
run_model(gb_model, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default settings and a fixed seed.
rf_model = RandomForestClassifier(random_state=42)
run_model(rf_model, X_train, y_train, X_test, y_test)

#### As the metrics above show, our best model, the RandomForestClassifier, detected spam SMS messages with 97.9% accuracy.
#### Every message it flagged as spam really was spam (precision = 100%), but it missed approximately 15% of all spam messages (recall = 84.66%).

In [None]:
# Create the random grid: candidate values sampled by the randomized search
# over the random forest hyper-parameters.
random_grid = {'n_estimators': [int(x) for x in np.linspace(200, 2000, num = 7)],  # 200 .. 2000 trees
               # 'auto' was dropped: for classifiers it was just an alias of 'sqrt'
               # (so the grid held the same setting twice) and recent scikit-learn
               # releases reject it. 'log2' gives the search a real alternative.
               'max_features': ['sqrt', 'log2'],
               'max_depth': [int(x) for x in np.linspace(5, 100, num= 7)],  # 5 .. 100 levels
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

In [None]:
# Tune the random forest with a randomized search over `random_grid`
# and print the metrics of the tuned model on the test split.
rf_to_tune = RandomForestClassifier(random_state=42)
run_grid_model(rf_to_tune, random_grid, X_train, y_train, X_test, y_test)

#### After running RandomizedSearchCV on the RandomForestClassifier (the best-performing model with default parameters), we reached 98.2% accuracy and increased recall to 86.6%. Precision stayed the same at 100%.