In [20]:
# import important modules
import numpy as np
import pandas as pd

# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB # classifier 

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    plot_confusion_matrix,
)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# text preprocessing modules
from string import punctuation 

# text preprocessing modules
from nltk.tokenize import word_tokenize

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re #regular expression

# Download dependency
for dependency in (
    "brown",
    "names",
    "wordnet",
    "averaged_perceptron_tagger",
    "universal_tagset",
):
    nltk.download(dependency)
    
import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)
from sklearn.decomposition import NMF


[nltk_data] Downloading package brown to
[nltk_data]     /Users/snizhanakurylyuk/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     /Users/snizhanakurylyuk/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/snizhanakurylyuk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/snizhanakurylyuk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/snizhanakurylyuk/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [21]:
df = pd.read_csv('SPAM text message 20170820 - Data.csv', encoding="latin-1")

In [22]:
df = df.drop_duplicates()
df = df.reset_index(inplace = False)[['Category','Message']]
print(df.shape)


(5157, 2)


In [23]:
df['labels'] = df['Category'].map({'ham':0, 'spam':1})
df

Unnamed: 0,Category,Message,labels
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5152,spam,This is the 2nd time we have tried 2 contact u...,1
5153,ham,Will Ã¼ b going to esplanade fr home?,0
5154,ham,"Pity, * was in mood for that. So...any other s...",0
5155,ham,The guy did some bitching but I acted like i'd...,0


In [24]:
spam_messages= df.loc[df.Category=="spam"]["Message"]
not_spam_messages= df.loc[df.Category=="ham"]["Message"]

print("spam count: " +str(len(df.loc[df.Category=="spam"])))
print("not spam count: " +str(len(df.loc[df.Category=="ham"])))

not_spam_messages

spam count: 641
not spam count: 4516


0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
6       Even my brother is not like to speak with me. ...
                              ...                        
5150                                         Huh y lei...
5153                Will Ã¼ b going to esplanade fr home?
5154    Pity, * was in mood for that. So...any other s...
5155    The guy did some bitching but I acted like i'd...
5156                           Rofl. Its true to its name
Name: Message, Length: 4516, dtype: object

# Text Preprocessing 

In [25]:
def preprocessing_text(texts):
    df["Clean_Message"] = df["Message"].str.lower() #puts everything in lowercase
    df["Clean_Message"] = df["Message"].replace(r'http\S+', '', regex=True) # removing any links 
    df["Clean_Message"] = df["Message"].replace(r'www.[^ ]+', '', regex=True)
    df["Clean_Message"] = df["Message"].replace(r'[0-9]+', " ", regex = True) #removing numbers
    df["Clean_Message"] = df["Message"].replace (r'[!"#$%&()*+,-./:;<=>?@[\]^_`{|}~]', '', regex=True) #remove special characters and puntuation marks 
    df["Clean_Message"] = df["Message"].replace(r"[^A-Za-z]", " ", regex = True) #replace any item that is not a letter
    

    return texts

In [26]:
texts = preprocessing_text(df)
texts


Unnamed: 0,Category,Message,labels,Clean_Message
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only ...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I don t think he goes to usf he lives aro...
...,...,...,...,...
5152,spam,This is the 2nd time we have tried 2 contact u...,1,This is the nd time we have tried contact u...
5153,ham,Will Ã¼ b going to esplanade fr home?,0,Will b going to esplanade fr home
5154,ham,"Pity, * was in mood for that. So...any other s...",0,Pity was in mood for that So any other s...
5155,ham,The guy did some bitching but I acted like i'd...,0,The guy did some bitching but I acted like i d...


In [27]:
stop_words =  stopwords.words('english')

def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
    text = text.lower()
    
        
    #Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer() 
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    
    # Return a list of words
    return(text)

In [28]:
#clean the review
df["Clean_Message_2"]= df["Message"].apply(text_cleaning)

In [29]:
df

Unnamed: 0,Category,Message,labels,Clean_Message,Clean_Message_2
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only ...,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts 21s...
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I don t think he goes to usf he lives aro...,nah think go usf life around though
...,...,...,...,...,...
5152,spam,This is the 2nd time we have tried 2 contact u...,1,This is the nd time we have tried contact u...,2nd time tried contact u u pound prize claim e...
5153,ham,Will Ã¼ b going to esplanade fr home?,0,Will b going to esplanade fr home,b going esplanade fr home
5154,ham,"Pity, * was in mood for that. So...any other s...",0,Pity was in mood for that So any other s...,pity mood suggestion
5155,ham,The guy did some bitching but I acted like i'd...,0,The guy did some bitching but I acted like i d...,guy bitching acted like interested buying some...


# Modeling - TFIDF Vectorization


In [30]:
#TFIDF 

docs = df.Clean_Message_2
tfidf= TfidfVectorizer(stop_words= "english",
                       max_df = .4, 
                       min_df = 5, #maybe 5 or 6
                       max_features = 20000,
                       lowercase=True, 
                       analyzer='word',
                       ngram_range=(1,3),
                       dtype=np.float32)
doc_term_matrix = tfidf.fit_transform(docs) #should this be values?

#

In [31]:
X= df['Clean_Message_2']
y= df['labels']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state=42, shuffle = True, stratify = y)

In [33]:
from sklearn.neural_network import MLPClassifier
neural_net_pipeline = Pipeline([('vectorizer', tfidf), 
                                ('nn', MLPClassifier(hidden_layer_sizes=(700, 700)))])

In [36]:
neural_net_pipeline.fit(X_train, y_train)


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(dtype=<class 'numpy.float32'>, max_df=0.4,
                                 max_features=20000, min_df=5,
                                 ngram_range=(1, 3), stop_words='english')),
                ('nn', MLPClassifier(hidden_layer_sizes=(700, 700)))])

In [37]:
# Testing the Pipeline

y_pred = neural_net_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: {} %'.format(100 * accuracy_score(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       678
           1       0.94      0.88      0.91        96

    accuracy                           0.98       774
   macro avg       0.96      0.93      0.95       774
weighted avg       0.98      0.98      0.98       774

Accuracy: 97.80361757105943 %


In [38]:
from joblib import dump
dump(neural_net_pipeline, 'spam_classifier.joblib')

['spam_classifier.joblib']