In [None]:
cd//content/drive/MyDrive/Colab Notebooks/Assignment4

/content/drive/MyDrive/Colab Notebooks/Assignment4


In [None]:
!unzip archive.zip

Archive:  archive.zip
replace spam.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


**Importing libraries**

In [None]:
import pandas as pd
import numpy as np
import tensorflow
import seaborn as sns
import matplotlib.pyplot as plt
import keras
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Load the raw SMS dataset, decoding the file as ISO-8859-1 (Latin-1).
data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Assignment4/spam.csv",
    delimiter=",",
    encoding="ISO-8859-1",
)

In [None]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# (rows, columns) of the raw frame.
data.shape

(5572, 5)

Preprocessing

In [None]:
# Discard the three unnamed trailing columns; only v1 and v2 are used below.
data = data.drop(
    ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    axis="columns",
)

In [None]:
# Give the two remaining columns descriptive names.
data = data.rename(columns={"v1": "Category", "v2": "Message"})

In [None]:
# Confirm the dropped and renamed columns.
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Character count of every message, stored as a new column.
data["Message Length"] = data["Message"].map(len)

In [None]:
# Summary statistics of the message length, computed separately per class.
ham_desc = data.loc[data["Category"] == "ham", "Message Length"].describe()
spam_desc = data.loc[data["Category"] == "spam", "Message Length"].describe()

In [None]:
# Summary of every column, including the non-numeric ones.
data.describe(include="all")

Unnamed: 0,Category,Message,Message Length
count,5572,5572,5572.0
unique,2,5169,
top,ham,"Sorry, I'll call later",
freq,4825,30,
mean,,,80.118808
std,,,59.690841
min,,,2.0
25%,,,36.0
50%,,,61.0
75%,,,121.0


In [None]:
# Number of messages per class, to check for class imbalance.
data["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [None]:
# Row counts of the minority (spam) and majority (ham) classes.
minority_len = int((data["Category"] == "spam").sum())
majority_len = int((data["Category"] == "ham").sum())

In [None]:
# Row labels of the spam and ham messages in `data`.
minority_indices = data.loc[data["Category"] == "spam"].index
majority_indices = data.loc[data["Category"] == "ham"].index

In [None]:
# Draw as many ham rows as there are spam rows, without replacement.
# A seeded generator makes the undersampled subset reproducible across runs.
random_majority_indices = np.random.default_rng(42).choice(
    majority_indices, size=minority_len, replace=False
)

In [None]:
# All spam row labels followed by the sampled ham row labels.
undersampled_indices=np.concatenate([minority_indices,random_majority_indices])

In [None]:
# Select the balanced subset of rows by label.
df=data.loc[undersampled_indices]

In [None]:
# Shuffle the rows so spam and ham are interleaved; the fixed random_state
# makes the row order (and the later train/val/test split) reproducible.
df=df.sample(frac=1,random_state=42)

In [None]:
# Renumber the rows from 0; the previous row labels are kept in a new "index" column.
df=df.reset_index()

In [None]:
# Remove the "index" column left behind by reset_index().
df = df.drop(columns="index")

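The ham and spam classes are now balanced: the majority class (ham) was randomly undersampled to the size of the minority class (spam).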

In [None]:
# Verify that both classes now have the same number of rows.
df["Category"].value_counts()

ham     747
spam    747
Name: Category, dtype: int64

In [None]:
# Numeric target: 0 for ham, 1 for spam.
df["Label"]=df["Category"].map({"ham":0,"spam":1})

In [None]:
# Preview the balanced, labelled frame.
df.head()

Unnamed: 0,Category,Message,Message Length,Label
0,ham,Your gonna have to pick up a $1 burger for you...,108,0
1,ham,"Yo, any way we could pick something up tonight?",47,0
2,spam,Please CALL 08712402972 immediately as there i...,81,1
3,ham,"Not much, just some textin'. How bout you?",42,0
4,ham,O i played smash bros &lt;#&gt; religiously.,46,0


In [None]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk

In [None]:
# Fetch the NLTK stop-word lists used during text cleaning.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance used when cleaning messages.
stemmer=PorterStemmer()

**Text preprocessing and feature encoding**

In [None]:
   
# Build the cleaned training corpus: one normalised string per entry of df["Message"].

# Build the stop-word set once, instead of rebuilding it for every single token.
english_stopwords = set(stopwords.words("english"))

corpus = []
for message in df["Message"]:
    # Keep letters only: every other character (digits, punctuation, ...) becomes a space.
    letters_only = re.sub("[^a-zA-Z]", " ", message)

    # Lowercase, then split on whitespace into individual tokens.
    tokens = letters_only.lower().split()

    # Drop English stop words and stem the remaining tokens with the Porter stemmer.
    stems = [stemmer.stem(token) for token in tokens if token not in english_stopwords]

    # Re-join the stems into one space-separated string and store it.
    corpus.append(" ".join(stems))

In [None]:
from tensorflow.keras.preprocessing.text import one_hot

# Size of the hashing space passed to one_hot as `n`.
vocab_size = 10000

# Encode every cleaned message as a list of hashed word indices.
# NOTE(review): Keras documents one_hot as using Python's `hash`, which is not
# stable across interpreter sessions - confirm before reusing a trained model
# in a different session.
oneHot_doc = list(map(lambda text: one_hot(text, n=vocab_size), corpus))

In [None]:
# Message-length distribution of the balanced data, used to pick a padded sequence length.
df["Message Length"].describe()

count    1494.000000
mean      104.522758
std        57.435568
min         2.000000
25%        49.250000
50%       117.000000
75%       153.000000
max       632.000000
Name: Message Length, dtype: float64

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fixed sequence length fed to the network.
sentence_len=200
# Pad each encoded message at the front ("pre") so every row has sentence_len entries.
embedded_doc=pad_sequences(oneHot_doc,maxlen=sentence_len,padding="pre")

In [None]:
# Feature matrix: one row per message, one column per padded sequence position.
extract_features = pd.DataFrame(embedded_doc)
# Target vector: the 0/1 spam label.
target = df["Label"]

In [None]:
# Column-wise concat aligns on the row index; this relies on df having been
# renumbered from 0 by the earlier reset_index() call.
df_final=pd.concat([extract_features,target],axis=1)

In [None]:
#split the dataframe into dependent and independent variables
X=df_final.drop("Label",axis=1)
y=df_final["Label"]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 15% of the data as the final test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Split the remainder again: 15% of it becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.15,
    random_state=42,
)

Build the model

In [None]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [None]:
# Start from an empty sequential (layer-by-layer) model.
model=Sequential()

In [None]:
# Width of each learned word-embedding vector.
feature_num = 100

# Build the network in a single expression so that re-running this cell always
# yields a fresh three-layer model. Calling model.add() here instead would stack
# another copy of the layers onto the existing model on every re-run.
model = Sequential([
    # Maps each word index (< vocab_size) to a feature_num-dimensional vector.
    Embedding(input_dim=vocab_size, output_dim=feature_num, input_length=sentence_len),
    # 128-unit LSTM over the embedded sequence.
    LSTM(units=128),
    # Single sigmoid unit; its output is trained against the 0/1 Label (1 = spam).
    Dense(units=1, activation="sigmoid"),
])

Compile the model

In [None]:
from tensorflow.keras.optimizers import Adam

# Binary classification set-up: cross-entropy loss, Adam optimiser, accuracy metric.
model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

Fit the model

In [None]:
# Train for 10 epochs, reporting loss/accuracy on the validation split after each epoch.
model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f25f7320050>

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain boolean class predictions for the test set.
y_pred = model.predict(X_test) > 0.5



Test the model

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [None]:
# Fraction of test messages classified correctly, reported as a percentage.
score = accuracy_score(y_test, y_pred)
print(f"Test Score:{score * 100:.2f}%")

Test Score:96.44%


In [None]:
# Fetch NLTK's punkt tokenizer data.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def classify_message(model,message):
    """Classify one text message as spam or not spam and print the verdict.

    The message goes through the same cleaning steps as the training corpus
    (letters only, lowercased, English stop words removed, Porter-stemmed) so
    that its words hash to the same indices the model was trained on. The
    whole message is encoded, not just its last sentence.

    Parameters
    ----------
    model :
        Trained classifier exposing ``predict``; it is called with one padded
        sequence of length ``sentence_len``.
    message : str
        Raw text of the message (may contain several sentences).

    Returns
    -------
    None
        Prints "It is a spam" when the predicted score is above 0.5,
        otherwise "It is not a spam".
    """
    english_stopwords = set(stopwords.words("english"))

    # Same normalisation as the training corpus: keep letters, lowercase, tokenise.
    letters_only = re.sub("[^a-zA-Z]", " ", message)
    tokens = letters_only.lower().split()

    # Remove stop words token by token and stem what is left.
    stems = [stemmer.stem(token) for token in tokens if token not in english_stopwords]
    cleaned = " ".join(stems)

    # Hash the cleaned text and pad it to the fixed input length; an empty
    # message simply becomes an all-padding sequence.
    encoded = [one_hot(cleaned, n=vocab_size)]
    text = pad_sequences(encoded, maxlen=sentence_len, padding="pre")

    # The model is given a single sequence, so take the first (only) score.
    score = float(np.asarray(model.predict(text)).ravel()[0])

    # A score above 0.5 means spam (label 1); anything else is not spam.
    if score > 0.5:
        print("It is a spam")
    else:
        print("It is not a spam")

In [None]:
# Two hand-written sample messages for a quick manual check of the classifier.
message1="I am having a bad day and I would like to have a break today"
message2="This is to inform you had won a lottery and the subscription will end in a week so call us."

In [None]:
import nltk

In [None]:
# Classify the first sample message.
classify_message(model,message1)

It is not a spam


In [None]:
# Classify the second sample message.
classify_message(model,message2)

It is a spam
