In [None]:
import numpy as np
import pandas as pd 
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
#Import dataset
dataset = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv", encoding = "latin-1")

In [None]:
dataset.head()

In [None]:
dataset = dataset.drop(columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])


In [None]:
dataset.head()

In [None]:
dataset = dataset.rename(columns = {"v1" : "target", "v2" : "sms"})

In [None]:
dataset.head()


In [None]:
dataset["length"] = dataset["sms"].str.len()

In [None]:
dataset.head()


In [None]:
# Class balance: bar chart of the target values plus the exact counts
plt.figure(figsize=(10, 5))
sns.countplot(x="target", data=dataset)
print(dataset.target.value_counts())

In [None]:
# Spam messages tend to be longer than ham messages.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is the
# replacement suggested by seaborn. The labels are passed to the plot itself
# and a legend is drawn so the two classes can be told apart.
plt.figure(figsize=(15,7))
plt.xlim(0,200)
sns.histplot(dataset.loc[dataset["target"] == "ham", "length"],
             bins=100, kde=True, stat="density", kde_kws={"cut": 3},
             alpha=0.4, label="Ham")
sns.histplot(dataset.loc[dataset["target"] == "spam", "length"],
             bins=100, kde=True, stat="density", kde_kws={"cut": 3},
             alpha=0.4, label="Spam")
plt.legend();

In [None]:
#Create copy dataset to manipulate
manip_dataset = dataset.copy()


In [None]:
manip_dataset.head()


In [None]:
 manip_dataset["sms"][0]

In [None]:
import re
import nltk
nltk.download('stopwords') #download non relevant words
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer # Stemming is taking the root of every word (containing what it means)

# The stemmer and the stop-word lookup do not depend on the message, so they
# are built once instead of once per message (and, for the set, once per word).
stemmer = SnowballStemmer("english")
all_stopwords = stopwords.words("english")
stopword_set = set(all_stopwords)

# Read the raw text from `dataset` rather than `manip_dataset`: a later cell
# overwrites manip_dataset["sms"] with the cleaned text, so reading from the
# untouched frame keeps this cell idempotent when it is re-run.
sms = [] # cleaned messages, one string per row, in row order
for raw_message in dataset["sms"]:
    # keep letters only, lower-case, then split on whitespace
    words = re.sub("[^a-zA-Z]", " ", raw_message).lower().split()
    # drop stop words and stem what is left
    stems = [stemmer.stem(word) for word in words if word not in stopword_set]
    sms.append(' '.join(stems))

In [None]:
type(sms)

In [None]:
# See the first 5 stemmed messages
sms[:5]

In [None]:
sms


In [None]:
# Replace the raw text with the cleaned text in one column assignment.
# The previous element-wise form, manip_dataset["sms"][i] = ..., is chained
# assignment: it triggers SettingWithCopyWarning and does not modify the frame
# at all when pandas copy-on-write is active.
manip_dataset["sms"] = sms

In [None]:
manip_dataset.head()

In [None]:
manip_dataset["after_length"] = manip_dataset["sms"].str.len()

In [None]:
manip_dataset.head()

In [None]:
# Compare the ham/spam distributions of after_length with the earlier plot of
# the raw length. The author judged the raw length to separate the classes
# better, so after_length is not kept as a feature.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is the
# replacement suggested by seaborn, and the legend makes the labels visible.
plt.figure(figsize=(15,7))
plt.xlim(0,200)
sns.histplot(manip_dataset.loc[manip_dataset["target"] == "ham", "after_length"],
             bins=100, kde=True, stat="density", kde_kws={"cut": 3},
             alpha=0.4, label="Ham")
sns.histplot(manip_dataset.loc[manip_dataset["target"] == "spam", "after_length"],
             bins=100, kde=True, stat="density", kde_kws={"cut": 3},
             alpha=0.4, label="Spam")
plt.legend();

In [None]:
manip_dataset = manip_dataset.drop(columns = ["after_length"])
manip_dataset.head()

In [None]:
# Reindexing our columns
manip_dataset = manip_dataset.reindex(columns = ["sms", "length", "target"])


In [None]:
manip_dataset.head()

In [None]:
# Encoding the target values
target_encoder = LabelEncoder()
manip_dataset["target"] = target_encoder.fit_transform(manip_dataset["target"])
manip_dataset.head()

In [None]:
X = manip_dataset.drop(columns = ["target"])
y = manip_dataset["target"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 4001) # the one is because i'll use the last one for the length

In [None]:
# Create our vectors for our bag-of-words model
X_sms = cv.fit_transform(sms).toarray()

In [None]:
len(X_sms[0])

In [None]:
X_sms

In [None]:
# Store each message's raw length in the last column of its vector.
# One vectorized column assignment replaces the row-by-row Python loop and its
# per-row Series lookups; the values written are the same.
# NOTE(review): this overwrites whatever term count CountVectorizer placed in
# the final column - presumably the alphabetically last kept term rather than
# the least frequent one. Confirm that losing it is acceptable.
X_sms[:, -1] = X["length"].to_numpy()

In [None]:
type(X_sms)

In [None]:
# Display first five vectors
X_sms[:5]

In [None]:
# Split our data to train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sms, y, test_size=0.20, random_state=42)

In [None]:
# Helper that trains a model and reports its test-set metrics
def run_model(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training split and print its scores on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and labels passed to ``fit``
    X_test, y_test : held-out features passed to ``predict`` and the labels
        the predictions are scored against

    Prints the confusion matrix, accuracy, precision, recall and F1 score, in
    that order. The model is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
    )

    # (heading, scorer) pairs, in the order they are reported
    report = (
        ("The confusion matrix is : \n", confusion_matrix),
        ("The accuracy score is : \n", accuracy_score),
        ("The precision is : \n", precision_score),
        ("The recall is : \n", recall_score),
        ("The f1 score is : \n", f1_score),
    )
    for heading, scorer in report:
        print(heading, scorer(y_test, y_pred), "\n")

In [None]:
from sklearn.naive_bayes import GaussianNB
run_model(GaussianNB(),X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
run_model(AdaBoostClassifier(),X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
run_model(GradientBoostingClassifier(),X_train, y_train, X_test, y_test)


In [None]:
from sklearn.ensemble import RandomForestClassifier
run_model(RandomForestClassifier(),X_train, y_train, X_test, y_test)

In [None]:
model1=RandomForestClassifier()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

In [None]:
y_pred

In [None]:
 ex='Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate) T&Cs apply 08452810075over18s'

In [None]:
ex

In [None]:
# Stemming is taking the root of every word (containing what it means)
sms_ex = [] # will contain all the different reviews cleaned

string = re.sub("[^a-zA-Z]", " ", ex) # replaces anything NOT in a-z or A-Z by a space, in the variable 
string = string.lower()
string = string.split()
stemmer = SnowballStemmer("english")
all_stopwords = stopwords.words("english")
#if the word is not in the stopwords vocabulary then go ahead with the word iter and stem it
string = [stemmer.stem(word) for word in string if not word in set(all_stopwords)]
string = ' '.join(string) # joins the words again with a space in between them
sms_ex.append(string) # add the review to our corpus

In [None]:
sms_ex

In [None]:
type(sms_ex)

In [None]:
sms_copy=sms.copy()

In [None]:
sms_copy.append(sms_ex)

In [None]:
sms_copy[len(sms_copy)-1]

In [None]:
type(sms_copy)

In [None]:
type(sms)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv_new = CountVectorizer(max_features = 4001) # the one is because i'll use the last one for the length

In [None]:
X_sms_new = cv_new.fit_transform(sms).toarray()

In [None]:
X_sms_new

In [None]:
for i in range(0,len(X_sms)):
    X_sms_new[i][-1] = X["length"][i]

In [None]:
len(X_sms_new[0])

In [None]:
prediction= model1.predict(X_sms_new)

In [None]:
prediction[len(prediction)-1]