In [None]:
import os, sys
from google.colab import drive

# Mount Google Drive so the dataset and any Drive-installed packages are reachable.
drive.mount('/content/drive')

nb_path = '/content/notebooks'

# os.symlink raises FileExistsError when the link is already there, which made
# this cell fail on every re-run within the same session. lexists() is also
# True for a dangling link, so the guard covers that case as well.
if not os.path.lexists(nb_path):
    os.symlink('/content/drive/My Drive/Colab Notebooks', nb_path)

# Avoid stacking duplicate entries on sys.path when the cell is re-executed.
if nb_path not in sys.path:
    sys.path.insert(0,nb_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
### Installing directly in Drive (one-time setup: uncomment the pip line below to run it)
#!pip install --target=$nb_path tensorflow-text
#print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
import warnings
warnings.filterwarnings('ignore')

# Loading Data Set

#### Source: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

In [None]:
# Read the raw SMS spam CSV from Drive; the encoding is passed explicitly
# as ISO-8859-1 rather than relying on the pandas default.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/spam.csv',
    encoding='ISO-8859-1',
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# EDA (Exploratory Data Analysis)

In [None]:
# Keep only the label (v1) and message text (v2) columns.
# .copy() makes new_df an independent frame, so later in-place changes do not
# operate on a slice of df (which would trigger SettingWithCopyWarning).
new_df = df[['v1', 'v2']].copy()
new_df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Give the columns descriptive names. rename() returns a new frame, which avoids
# mutating a slice of df in place (inplace=True brings no performance benefit).
new_df = new_df.rename(columns={'v1': 'Result', 'v2': 'Message'})
new_df.head()

Unnamed: 0,Result,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Number of messages per class label
new_df['Result'].value_counts()

ham     4825
spam     747
Name: Result, dtype: int64

In [None]:
# Spam-to-ham ratio, computed from the data rather than from hard-coded counts
class_counts = new_df['Result'].value_counts()
class_counts['spam'] / class_counts['ham']

0.15481865284974095

Only about 13% of the messages are spam (747 of 5,572, i.e. a spam-to-ham ratio of roughly 0.15) and the remaining 87% are ham; this causes a large class imbalance.

### Sampling Unbalanced data

In [None]:
# Rows labelled as spam
df_spam = new_df.loc[new_df['Result'].eq('spam')]
df_spam.shape

(747, 2)

In [None]:
# Rows labelled as ham (non-spam)
df_ham = new_df.loc[new_df['Result'].eq('ham')]
df_ham.shape

(4825, 2)

In [None]:
# Down-sample ham to the same number of rows as spam so both classes are balanced.
# A fixed random_state makes the drawn subset (and everything trained on it)
# reproducible across Restart & Run All.
df_ham_downsample = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_ham_downsample.shape

(747, 2)

In [None]:
# Stack the spam rows and the down-sampled ham rows into one balanced frame,
# then confirm the per-class counts.
new_df = pd.concat((df_spam, df_ham_downsample))
new_df.Result.value_counts()

spam    747
ham     747
Name: Result, dtype: int64

### Modify Target Variable to binary

In [None]:
# Binary target: 1 for spam, 0 for ham.
# A vectorised comparison replaces the row-by-row apply(lambda ...).
new_df['Spam'] = (new_df['Result'] == 'spam').astype('int64')

new_df.sample(10)  # display 10 random rows as a spot check

Unnamed: 0,Result,Message,Spam
2063,spam,"URGENT! Your mobile No *********** WON a å£2,0...",1
140,ham,"K, text me when you're on the way",0
1606,ham,Ok no prob... I'll come after lunch then...,0
5056,ham,Hey next sun 1030 there's a basic yoga course....,0
2377,spam,YES! The only place in town to meet exciting a...,1
2124,ham,Beautiful Truth against Gravity.. Read careful...,0
4541,spam,FreeMsg Hi baby wow just got a new cam moby. W...,1
4756,ham,"hey, looks like I was wrong and one of the kap...",0
2687,ham,Okie,0
3089,ham,Am going to take bath ill place the key in win...,0


# Model Building

In [None]:
# Train / test split
# Features are the raw message strings; the target is the 0/1 Spam flag.
X = new_df.Message
y = new_df.Spam

# stratify=y keeps the class proportions identical in both partitions;
# random_state pins the shuffle so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)

In [None]:
# Shapes of the four partitions, in the order X_train, X_test, y_train, y_test
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(1120,) (374,) (1120,) (374,)


### Defining Preprocess and BERT Model

In [None]:
# Hub handles for the text preprocessing model and the BERT encoder
preprocess_url = "https://www.kaggle.com/models/tensorflow/bert/frameworks/tensorFlow2/variations/en-uncased-preprocess/versions/3?tfhub-redirect=true"
encoder_url = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-l-12-h-768-a-12/versions/4"

# Wrap each handle as a Keras layer (preprocessor first, then encoder)
bert_preprocess, bert_model = (
    hub.KerasLayer(handle) for handle in (preprocess_url, encoder_url)
)

### Create model using TF Functional API

In [None]:
# --- BERT part: raw strings -> preprocessing layer -> encoder -------------
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='Messsages')
preprocess_input = bert_preprocess(text_input)
embedding_encoded = bert_model(preprocess_input)

# --- Classification head ---------------------------------------------------
# The encoder returns a dict; its 'pooled_output' entry feeds the head.
pooled = embedding_encoded['pooled_output']
dropout_layer = tf.keras.layers.Dropout(0.1, name='DropOut')
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='Output')

x = dropout_layer(pooled)
final_output = output_layer(x)

# --- Assemble the functional model from its input and output tensors -------
model = tf.keras.Model(inputs=text_input, outputs=final_output)

In [None]:
# Print the layer table (output shapes, parameter counts, connections) of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Messsages (InputLayer)      [(None,)]                    0         []                            
                                                                                                  
 keras_layer (KerasLayer)    {'input_word_ids': (None,    0         ['Messsages[0][0]']           
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_type_ids': (None,                                           
                              128)}                                                               
                                                                                              

In [None]:
# Binary classification setup: sigmoid output paired with binary cross-entropy,
# trained with Adam and reporting accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs. The returned History object is kept so the per-epoch
# loss/accuracy can be inspected later (history.history) instead of being
# discarded as a bare cell repr.
history = model.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7b3a9882fe20>

In [None]:
# Loss and accuracy on the held-out test split
model.evaluate(x=X_test, y=y_test)



[0.3497467041015625, 0.9144384860992432]

In [None]:
# predict() yields one sigmoid probability per test message in a column
# (one row per message, single column). Flatten it into a 1-D vector so it lines
# up with y_test; previously the flattened copy was stored under a misspelled,
# never-used name and the 2-D array was carried forward.
y_pred_arr = model.predict(X_test).flatten()



In [None]:
# Turn probabilities into hard labels at the 0.5 cut-off (1 = spam, 0 = ham).
# np.ravel guarantees a 1-D label vector whether y_pred_arr is (n, 1) or (n,).
# numpy is already imported in the imports cell at the top of the notebook.
y_predicted = np.where(np.ravel(y_pred_arr) > 0.5, 1, 0)

# Show only a short preview instead of dumping every prediction
y_predicted[:10]

array([[0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
    

### Model Evaluation

In [None]:
# confusion_matrix is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# Rows are the true classes and columns the predicted classes, in label order 0 (ham), 1 (spam).
cm = confusion_matrix(y_test, y_predicted)
cm

array([[166,  21],
       [ 11, 176]])

In [None]:
# Per-class precision / recall / F1 on the test split; the report is a
# pre-formatted string, so print() is used to keep its layout.
report = classification_report(y_test, y_predicted)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.89      0.91       187
           1       0.89      0.94      0.92       187

    accuracy                           0.91       374
   macro avg       0.92      0.91      0.91       374
weighted avg       0.92      0.91      0.91       374



###  Model Function Call

In [None]:
def spam_classifier(message, threshold=0.5):
  """Label one or more SMS messages as "Spam" or "Ham" with the trained model.

  Args:
    message: A single message string, or a sequence (e.g. list) of message
      strings. A bare string is wrapped into a one-element batch.
    threshold: Probability above which a message is labelled "Spam".
      Defaults to 0.5, the cut-off used in the evaluation cells.

  Returns:
    The string "Spam" or "Ham" when exactly one message is supplied (bare
    string or one-element sequence), otherwise a list with one label per
    message, in input order. An empty sequence yields an empty list.
  """
  # model.predict expects a batch, so wrap a lone string; sequences pass through untouched.
  batch = [message] if isinstance(message, str) else message
  if len(batch) == 0:
    return []

  # predict() returns one probability per message as an (n, 1) array; comparing
  # that whole array in an `if` raised ValueError for n > 1, so score each row.
  scores = np.asarray(model.predict(batch)).reshape(-1)
  labels = ["Spam" if score > threshold else "Ham" for score in scores]

  # Preserve the original contract for the single-message case.
  return labels[0] if len(labels) == 1 else labels


In [None]:
# Example call: a prize/urgency-style promotional message (one-element list)
spam_classifier(["Enter a chance to win $5000, hurry up, offer valid until march 31, 2021"])



'Spam'

In [None]:
# Example call: an ordinary conversational message (one-element list)
spam_classifier(["Hey Sam, Are you coming for a cricket game tomorrow"])



'Ham'