In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and pandas/seaborn usage warnings
# raised by later cells, so API misuse will not be visible in the output.
warnings.filterwarnings('ignore')

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Data

In [None]:
# Read the raw CSV (decoded as latin-1) and keep it untouched so the cleaned
# frame below is always derived from the same starting point.
sms_raw = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv", encoding='latin-1')

# Discard the three unnamed columns and give the two remaining ones readable names.
sms_named = sms_raw.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
sms_named.columns = ['Label', 'SMS']

# Build the working frame as a fresh object instead of adding columns to a
# column-subset of another frame (which can raise SettingWithCopyWarning):
#   SMS   - message text
#   Label - 0 for 'ham', 1 for 'spam'
#   len   - number of characters in the message
df = pd.DataFrame({
    'SMS': sms_named['SMS'],
    'Label': sms_named['Label'].map({'ham': 0, 'spam': 1}),
})
df['len'] = df['SMS'].str.len()
df.head()

# Data Visualization

In [None]:
# Bar chart of how many messages fall in each class (0 = ham, 1 = spam).
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series no longer produces a
# per-class count.
sns.countplot(x='Label', data=df)
plt.xlabel('Label')
plt.title('Number of ham and spam messages')
plt.show()

In [None]:
# Overlaid histograms of message length for ham vs. spam.
# Both classes share one set of 40 bin edges computed over all messages, so
# bar widths are identical and the two distributions can be compared directly.
length_bins = np.histogram_bin_edges(df['len'], bins=40)

fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(df.loc[df['Label'] == 0, 'len'], bins=length_bins,
        color='red', label='Ham messages', alpha=0.5)
ax.hist(df.loc[df['Label'] == 1, 'len'], bins=length_bins,
        color='blue', label='Spam messages', alpha=0.5)
ax.legend()
ax.set_xlabel('Message Length')
ax.set_ylabel('Frequency')
plt.show()

## Word Clouds

In [None]:

import wordcloud
from nltk.corpus import stopwords
# Independent copies of the rows for each class: label 0 is ham, label 1 is spam.
data_ham = df.loc[df['Label'].eq(0)].copy()
data_spam = df.loc[df['Label'].eq(1)].copy()

def show_wordcloud(data_spam_or_ham, title):
    """Draw a word cloud built from every message in the given frame.

    Parameters
    ----------
    data_spam_or_ham : DataFrame
        Must contain an 'SMS' column; each value is converted to ``str``
        and all messages are joined with single spaces.
    title : str
        Text displayed above the figure.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    corpus = ' '.join(data_spam_or_ham['SMS'].astype(str))
    # Words in wordcloud's built-in stop list are left out of the cloud.
    ignored_words = set(wordcloud.STOPWORDS)

    cloud = wordcloud.WordCloud(
        stopwords=ignored_words,
        background_color='lightgrey',
        colormap='viridis',
        width=800,
        height=600,
    ).generate(corpus)

    plt.figure(figsize=(10,7), frameon=True)
    plt.imshow(cloud)
    plt.axis('off')
    plt.title(title, fontsize=20)
    plt.show()

In [None]:
# Word cloud of the legitimate (ham) messages.
show_wordcloud(data_ham, title="Ham messages")


In [None]:
# Word cloud of the spam messages.
show_wordcloud(data_spam, title="Spam messages")


In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the raw message strings, targets are the numeric labels.
X, y = df['SMS'].values, df['Label'].values

# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Model-1 : Naive Bayes

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words encoder; max_features caps the vocabulary at the 3700 terms
# with the highest frequency in the training messages.
cv = CountVectorizer(max_features=3700)

# The vocabulary is learned from the training messages only and then reused
# for the test messages.
# NOTE: X_train / X_test are rebound from raw text to dense count matrices
# here, so this cell cannot be re-run without re-running the split cell first.
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

# Fit multinomial Naive Bayes on the token counts and predict the held-out set.
clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report the confusion matrix, accuracy and per-class metrics, one per line.
cm = confusion_matrix(y_test, y_pred)
print(
    cm,
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

# Model-2 : Artificial Neural Network (ANN)

In [None]:
import tensorflow as tf

# Seed TensorFlow's global random generator so that weight initialisation and
# batch shuffling are repeatable from run to run (same value as the
# random_state used for the train/test split). Exact bit-for-bit equality can
# still depend on hardware and library versions.
tf.random.set_seed(0)

# Small fully connected binary classifier: two hidden ReLU layers of 6 units
# each, followed by a single sigmoid unit.
ann = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=6, activation='relu'),     # Hidden layer
    tf.keras.layers.Dense(units=6, activation='relu'),     # Hidden layer
    tf.keras.layers.Dense(units=1, activation='sigmoid'),  # Output layer
])

# Adam optimiser with binary cross-entropy loss; accuracy is tracked per epoch.
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the vectorised training messages for 10 epochs in batches of 32.
ann.fit(X_train, y_train, batch_size=32, epochs=10)

In [None]:
# ann.predict returns one probability per message as a 2-D column. Flatten it
# and threshold at 0.5 so y_pred is a 1-D array of 0/1 integers, the same
# format as y_test, before handing it to the sklearn metrics.
y_prob = ann.predict(X_test).ravel()
y_pred = (y_prob > 0.5).astype(int)

cm = confusion_matrix(y_test, y_pred)
print(cm,'\n')
print(accuracy_score(y_test, y_pred),'\n')
print(classification_report(y_test, y_pred))

# Model save and reload

- For future use, we can save the model as a .pkl file.

In [None]:
import joblib

# Persist the fitted classifier to the working directory.
# NOTE: the fitted CountVectorizer `cv` is not saved here; raw text must be
# transformed with that same vectorizer before it can be passed to the
# reloaded model.
joblib.dump(clf, 'NB_Spam_Model.pkl')

# Reload
# The file is opened in a `with` block so the handle is closed as soon as the
# model has been read. Pickle-based files can execute arbitrary code when
# loaded, so only load model files from a trusted source.
with open('NB_Spam_Model.pkl', 'rb') as NB_Spam_Model:
    clf = joblib.load(NB_Spam_Model)