In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset , DataLoader

warnings.filterwarnings("ignore")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the SMS data set into a DataFrame and preview the first rows.
# NOTE(review): the absolute path looks environment-specific (presumably a
# Colab sample_data folder) - adjust it when running elsewhere.
df = pd.read_csv("/content/sample_data/sms_spam.csv")
df.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
len(df)

5574

In [None]:
df["text"][2]

"free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's"

In [None]:
ws = WordNetLemmatizer()

In [None]:
def split_word_review(data):
  """Lower-case, tokenize, filter and lemmatize every text in ``data``.

  Args:
    data: Integer-indexable collection of strings (``data[0]`` ..
      ``data[len(data) - 1]``), e.g. a list or a pandas Series with the
      default index.

  Returns:
    A list with one entry per text; each entry is the list of lemmatized
    tokens that are alphanumeric and not stop words.

  Side effects:
    Each ``data[i]`` is overwritten with its lower-cased version, exactly as
    the original implementation did.
  """
  # Build the stop-word set once. Evaluating stopwords.words() for every
  # token (as before) repeats the corpus lookup and a linear list search per
  # token; a set gives the same membership answers in constant time.
  # Called without a language, NLTK returns the stop words of every language.
  stop_words = set(stopwords.words())

  lemma_word = []
  for idx in range(len(data)):
    data[idx] = data[idx].lower()
    tokens = word_tokenize(data[idx])
    lemma_word.append([
        ws.lemmatize(token)
        for token in tokens
        if token not in stop_words and token.isalnum()
    ])
  return lemma_word

In [None]:
lemma_word = split_word_review(df["text"])

In [None]:
lemma_word[0]

['go',
 'jurong',
 'point',
 'available',
 'bugis',
 'great',
 'world',
 'buffet',
 'got',
 'amore']

In [None]:
df["lemma_word"] = lemma_word
df.head()

Unnamed: 0,type,text,lemma_word
0,ham,"go until jurong point, crazy.. available only ...","[go, jurong, point, available, bugis, great, w..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, joking, wif]"
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,u dun say so early hor... u c already then say...,"[dun, say, early, hor, already, say]"
4,ham,"nah i don't think he goes to usf, he lives aro...","[think, go, usf, life, around, though]"


In [None]:
# Unique lemmas in first-seen order. dict.fromkeys de-duplicates while
# preserving insertion order, avoiding the quadratic "word not in list" scan.
vocab = list(dict.fromkeys(
    word for words in df["lemma_word"] for word in words
))

In [None]:
vocab[:10]

['go',
 'jurong',
 'point',
 'available',
 'bugis',
 'great',
 'world',
 'buffet',
 'got',
 'amore']

In [None]:
# word_to_int (word_2_index) maps each vocabulary word to its integer index: {word: index}
# int_to_word (index_2_word) is the reverse lookup: {index: word}

In [None]:
def create_dict(words):
  """Build the two vocabulary lookup tables.

  Words are numbered from 1 in the order given, so index 0 is left unused
  by this function.

  Returns:
    (word_to_int, int_to_word): a word -> index dict and an index -> word dict.
  """
  word_to_int = {}
  int_to_word = {}
  for index, word in enumerate(words, start=1):
    word_to_int[word] = index
    int_to_word[index] = word

  return word_to_int , int_to_word

In [None]:
def create_dicr(words):
  """Comprehension-based variant of ``create_dict``.

  Words are numbered from 1 in the order given.

  Returns:
    (words_to_int, int_to_word): a word -> index dict and its inverse,
    an index -> word dict.
  """
  words_to_int = {w:i+1 for i,w in enumerate(words)}
  # .items() yields (word, index) pairs; swap them to get the inverse map.
  # The previous version kept the pair order and so rebuilt word -> index.
  int_to_word = {i:w for w,i in words_to_int.items()}

  return words_to_int , int_to_word

In [None]:
# a = ["abc","cve","tuv"]
# for i,z in enumerate(a):
#   print(i,z)

In [None]:
word_to_int , int_to_word = create_dict(vocab)

In [None]:
word_to_int["got"]

9

In [None]:
int_to_word[1]

'go'

In [None]:
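# Padding example (target length 5): shorter sentences are left-padded with ""
# so that every row ends up with the same length.
# ["", "i", "love", "my", "country"]
# ["", "", "i", "am", "boy"]
# ["my", "name", "is", "rehan", "khan"]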

In [None]:
max([len(i) for i in lemma_word])

73

In [None]:
lemma_word[1]

['ok', 'lar', 'joking', 'wif']

In [None]:
for i in lemma_word:
  print(i)
  break

['go', 'jurong', 'point', 'available', 'bugis', 'great', 'world', 'buffet', 'got', 'amore']


In [None]:
def pad_text(tokenize_words ,seq_length):
  """Force every token list to exactly ``seq_length`` entries.

  Lists that are long enough are cut to their first ``seq_length`` tokens;
  shorter ones get empty-string tokens added at the front.

  Returns:
    A NumPy array built from the resulting equal-length lists.
  """
  def fit(tokens):
    missing = seq_length - len(tokens)
    if missing <= 0:
      return tokens[:seq_length]
    return [""] * missing + tokens

  return np.array([fit(tokens) for tokens in tokenize_words])

In [None]:
len(vocab)
seq_length = 26

In [None]:
# Use the seq_length constant defined above instead of repeating the literal,
# so training data and preprocess_review always agree on the sequence length.
padded_sentences = pad_text(lemma_word, seq_length)

In [None]:
padded_sentences[3]

array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', 'dun', 'say', 'early', 'hor', 'already', 'say'],
      dtype='<U34')

In [None]:
# Register the empty-string padding token that pad_text inserts. create_dict
# numbers real words from 1, so index 0 is free for it.
word_to_int[''] = 0
int_to_word[0] = ''

In [None]:
# Translate every padded sentence into its row of vocabulary indices.
new_sent = np.array([
    [word_to_int[token] for token in sentence]
    for sentence in padded_sentences
])

In [None]:
len(new_sent)

5574

In [None]:
class SentimentLSTM(nn.Module):
  """Embedding -> LSTM -> dropout -> linear -> sigmoid classifier.

  Args:
    n_vocab: Number of rows in the embedding table.
    n_embed: Embedding dimension (LSTM input size).
    n_hidden: LSTM hidden size.
    n_output: Output features of the final linear layer.
    n_layers: Number of stacked LSTM layers.
    drop_p: Dropout probability applied to the LSTM outputs.
  """

  def __init__(self,n_vocab,n_embed,n_hidden,n_output,n_layers,drop_p=0.25):
    super().__init__()

    self.n_vocab = n_vocab
    self.n_embed = n_embed
    self.n_hidden = n_hidden
    self.n_output = n_output
    self.n_layers = n_layers
    self.drop_p = drop_p

    self.embedding = nn.Embedding(n_vocab , n_embed)
    self.lstm = nn.LSTM(n_embed,n_hidden,n_layers,batch_first = True)
    self.dropout = nn.Dropout(drop_p)
    self.fc = nn.Linear(n_hidden,n_output)
    self.sigmoid = nn.Sigmoid()


  def forward(self,input_words):
    """Score a batch of index sequences.

    Args:
      input_words: Integer tensor of vocabulary indices; the first dimension
        is the batch (the LSTM is built with ``batch_first=True``).

    Returns:
      (sigmoid_last, h): one sigmoid score per batch row, taken from the last
      column after flattening each row's per-step outputs, and the state
      tuple returned by the LSTM.
    """
    # Read the batch size from the input itself. The previous code used a
    # notebook-level ``batch_size`` global, which raised NameError when it
    # was undefined and mis-shaped the output when the batch differed.
    batch_size = input_words.size(0)

    embedded_words = self.embedding(input_words)
    lstm_out , h = self.lstm(embedded_words)
    lstm_out = self.dropout(lstm_out)
    # Flatten (batch, step) so the linear layer scores every time step.
    lstm_out = lstm_out.contiguous().view(-1,self.n_hidden)
    fc_out = self.fc(lstm_out)
    sigmoid_out = self.sigmoid(fc_out)
    # Back to one row per batch element; keep only the final score.
    sigmoid_out = sigmoid_out.view(batch_size,-1)
    sigmoid_last = sigmoid_out[:,-1]
    return sigmoid_last , h

  def init_hidden(self,batch_size):
    """Return a zeroed ``(h_0, c_0)`` pair shaped (n_layers, batch_size, n_hidden).

    The tensors take dtype and device from the model's parameters rather than
    being pinned to the CPU, so they stay usable if the model is moved.
    """
    weights = next(self.parameters())
    shape = (self.n_layers, batch_size, self.n_hidden)
    h = (weights.new_zeros(shape), weights.new_zeros(shape))

    return h

In [None]:
# Hyper-parameters, listed in SentimentLSTM's constructor order:
# n_vocab, n_embed, n_hidden, n_output, n_layers
n_vocab = len(word_to_int)  # one embedding row per entry in word_to_int
n_embed = 10  # original note: 50-150 (presumably alternative sizes to try)
n_hidden = 100
n_output = 1
n_layers = 1  # original note: 2 (presumably an alternative depth to try)

net = SentimentLSTM(n_vocab,n_embed,n_hidden,n_output,n_layers)

In [None]:
net

SentimentLSTM(
  (embedding): Embedding(7376, 10)
  (lstm): LSTM(10, 100, batch_first=True)
  (dropout): Dropout(p=0.25, inplace=False)
  (fc): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Encode the class column as integers: "ham" -> 1, anything else -> 0.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# compared the already-encoded integers with "ham" and reset every label to 0.
if not pd.api.types.is_integer_dtype(df["type"]):
  df["type"] = (df["type"] == "ham").astype(int)
df.head(10)

Unnamed: 0,type,text,lemma_word
0,1,"go until jurong point, crazy.. available only ...","[go, jurong, point, available, bugis, great, w..."
1,1,ok lar... joking wif u oni...,"[ok, lar, joking, wif]"
2,0,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,1,u dun say so early hor... u c already then say...,"[dun, say, early, hor, already, say]"
4,1,"nah i don't think he goes to usf, he lives aro...","[think, go, usf, life, around, though]"
5,0,freemsg hey there darling it's been 3 week's n...,"[freemsg, hey, darling, 3, week, word, back, l..."
6,1,even my brother is not like to speak with me. ...,"[even, brother, like, speak, treat, like, aid,..."
7,1,as per your request 'melle melle (oru minnamin...,"[request, melle, oru, minnaminunginte, nurungu..."
8,0,winner!! as a valued network customer you have...,"[winner, valued, network, customer, selected, ..."
9,0,had your mobile 11 months or more? u r entitle...,"[mobile, 11, month, r, entitled, update, lates..."


In [None]:
# Integer label array taken from the encoded "type" column (1 = ham, 0 = spam).
labels = np.array([int(i) for i in df["type"].values])

In [None]:
labels[:40]

array([1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])

In [None]:
# Train / validation / test split (80 / 10 / 10), taken in file order.
train_ratio = 0.8
valid_ratio = (1-train_ratio)/2
#test_ratio = (1 -(train_ratio+valid_ratio))

total = len(new_sent)
train_cutoff = int((total*train_ratio))
valid_cutoff = int((total*(1-valid_ratio)))  # e.g. total=50 -> 50*0.9 = 45


def _to_long(array):
  """Copy a NumPy integer array into an int64 tensor.

  torch.tensor keeps the values as integers; the legacy torch.Tensor(...)
  constructor converts to float32 first, which cannot hold large indices
  exactly.
  """
  return torch.tensor(array, dtype=torch.long)


# NumPy slices -> tensors
train_x, train_y = _to_long(new_sent[:train_cutoff]), _to_long(labels[:train_cutoff])
valid_x, valid_y = (_to_long(new_sent[train_cutoff:valid_cutoff]),
                    _to_long(labels[train_cutoff:valid_cutoff]))
test_x, test_y = _to_long(new_sent[valid_cutoff:]), _to_long(labels[valid_cutoff:])


train_data = TensorDataset(train_x,train_y)
valid_data = TensorDataset(valid_x ,valid_y)
test_data = TensorDataset(test_x,test_y)

batch_size = 1


# Only the training loader is shuffled; the evaluation metrics are sums over
# all samples, so a fixed order there changes nothing but reproducibility.
train_loader = DataLoader(train_data,batch_size = batch_size,shuffle=True)
valid_loader = DataLoader(valid_data,batch_size = batch_size,shuffle=False)
test_loader = DataLoader(test_data,batch_size = batch_size,shuffle=False)

In [None]:
for i , j in train_loader:
  print(i.size(), j)
  break

torch.Size([1, 26]) tensor([1])


In [None]:
lr = 0.001  # learning rate handed to Adam below

# BCELoss takes probabilities, which matches the sigmoid output of the model.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(),lr=lr)

In [None]:
def acc(pred,label):
  """Return how many predictions, once rounded, equal their label.

  The result is a plain Python number (a count, not a ratio).
  """
  matches = pred.round().eq(label)
  return matches.sum().item()

In [None]:
def train_sentiment(net,loss_fn,optimizer,epochs=10):
  """Train ``net`` and report train/validation metrics after every epoch.

  Reads the notebook-level ``train_loader`` and ``valid_loader`` and uses the
  notebook's ``acc`` helper to count correct predictions.

  Args:
    net: Model whose forward returns ``(scores, state)``.
    loss_fn: Loss called as ``loss_fn(scores, labels.float())``.
    optimizer: Optimizer over ``net``'s parameters.
    epochs: Number of passes over the training loader.

  Note:
    The model is left in eval mode when the function returns.
  """
  for epoch in range(epochs):
    train_loss = 0.0
    train_acc = 0.0
    valid_loss = 0.0
    valid_acc = 0.0

    net.train()
    for batch_x, batch_y in train_loader:
      optimizer.zero_grad()
      output, _ = net(batch_x)

      loss = loss_fn(output, batch_y.float())
      loss.backward()
      optimizer.step()

      train_loss += loss.item()
      train_acc += acc(output, batch_y)

    # Validate in eval mode (dropout off) and without autograd. The previous
    # loop stayed in train mode, so validation scores were perturbed by
    # dropout and a graph was built for every batch.
    net.eval()
    with torch.no_grad():
      for batch_x, batch_y in valid_loader:
        output, _ = net(batch_x)
        valid_loss += loss_fn(output, batch_y.float()).item()
        valid_acc += acc(output, batch_y)

    # Accuracy is averaged per sample, loss per batch.
    print("Epoch {} , Train_acc : {:.2f} , Train_loss : {:.2f} , Valid_acc : {:.2f} , valid_loss{:.2f}".format(
        epoch+1 , train_acc/len(train_loader.dataset) , train_loss/len(train_loader),
        valid_acc/len(valid_loader.dataset) , valid_loss/len(valid_loader)))
    

    

In [None]:
train_sentiment(net,loss_fn,optimizer,epochs=10)

Epoch 1 , Train_acc : 0.91 , Train_loss : 0.24 , Valid_acc : 0.95 , valid_loss0.16
Epoch 2 , Train_acc : 0.96 , Train_loss : 0.13 , Valid_acc : 0.95 , valid_loss0.13
Epoch 3 , Train_acc : 0.98 , Train_loss : 0.08 , Valid_acc : 0.97 , valid_loss0.12
Epoch 4 , Train_acc : 0.98 , Train_loss : 0.06 , Valid_acc : 0.97 , valid_loss0.11
Epoch 5 , Train_acc : 0.99 , Train_loss : 0.04 , Valid_acc : 0.97 , valid_loss0.12
Epoch 6 , Train_acc : 0.99 , Train_loss : 0.03 , Valid_acc : 0.95 , valid_loss0.15
Epoch 7 , Train_acc : 0.99 , Train_loss : 0.02 , Valid_acc : 0.97 , valid_loss0.12
Epoch 8 , Train_acc : 1.00 , Train_loss : 0.01 , Valid_acc : 0.96 , valid_loss0.20
Epoch 9 , Train_acc : 1.00 , Train_loss : 0.00 , Valid_acc : 0.97 , valid_loss0.19
Epoch 10 , Train_acc : 1.00 , Train_loss : 0.00 , Valid_acc : 0.97 , valid_loss0.26


In [None]:
torch.save(net.state_dict(),"model.pkl")

In [None]:
net

SentimentLSTM(
  (embedding): Embedding(7376, 10)
  (lstm): LSTM(10, 100, batch_first=True)
  (dropout): Dropout(p=0.25, inplace=False)
  (fc): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
net.load_state_dict(torch.load("model.pkl"))

<All keys matched successfully>

In [None]:
# Evaluate on the held-out test split.
net.eval()
test_acc = 0
test_loss = 0

# no_grad: nothing is back-propagated here, so skip building the graph.
# The loop variables are deliberately not called `input` / `labels`: at the
# top level of a notebook those names would overwrite the builtin and the
# module-level `labels` array used to build the data split.
with torch.no_grad():
  for batch_x, batch_y in test_loader:
    test_output, test_h = net(batch_x)
    loss = loss_fn(test_output, batch_y.float())
    test_loss += loss.item()
    test_acc += acc(test_output, batch_y)


print("Test Acuracy : ",test_acc/len(test_data))
print("Test Loss : ", test_loss/len(test_loader))


  

Test Acuracy :  0.9767025089605734
Test Loss :  0.17859448869621283


In [None]:
def preprocess_review(review):
  """Turn one raw message into the fixed-length index list the model expects.

  Applies the same steps as the training pipeline (``split_word_review``
  followed by left padding): lower-case, tokenize, drop stop words and
  non-alphanumeric tokens, lemmatize, then truncate or left-pad to
  ``seq_length`` and map tokens to vocabulary indices.

  Args:
    review: The message text.

  Returns:
    A list of ``seq_length`` integers. Padding positions and tokens missing
    from ``word_to_int`` both map to the index of the '' padding token.
  """
  stop_words = set(stopwords.words())
  # The previous version computed a filtered list and then discarded it,
  # feeding raw, un-lemmatized, case-sensitive tokens to the model.
  tokens = [
      ws.lemmatize(token)
      for token in word_tokenize(review.lower())
      if token not in stop_words and token.isalnum()
  ]

  if len(tokens) >= seq_length:
    tokens = tokens[:seq_length]
  else:
    # Pad with the '' token used for the training data, not the string '0',
    # which could itself be a vocabulary word.
    tokens = [''] * (seq_length - len(tokens)) + tokens

  pad_index = word_to_int['']
  # dict.get replaces the bare except: only a missing key falls back to padding.
  return [word_to_int.get(token, pad_index) for token in tokens]

In [None]:
def predict(review):
  """Classify one message with the notebook-level ``net`` and print the result.

  Args:
    review: The message text; it is encoded with ``preprocess_review``.

  Prints the predicted class (scores >= 0.5 are reported as ham) followed by
  the raw score. Returns nothing.
  """
  net.eval()

  # A single sample needs no DataLoader: build the (1, seq_length) batch
  # directly and skip gradient tracking for inference.
  indices = torch.tensor([preprocess_review(review)], dtype=torch.long)
  with torch.no_grad():
    output = net(indices)[0].item()

  msg = "This is a Ham email" if output >=0.5 else "This is a spam email"
  print(msg)
  print("prediction = " + str(output))

In [None]:
predict("free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's")

This is a spam email
prediction = 0.0002292121498612687
