In [77]:
import pandas as pd
import numpy as np

In [78]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [79]:
from sklearn.feature_extraction.text import CountVectorizer

In [80]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,TensorDataset,random_split

In [81]:
url='https://raw.githubusercontent.com/soumyajit4419/Deep_Learning_Projects/master/Sarcasm_Prediction_News_Headlines%20%5BBERT%20%E2%81%84%20LSTM%5D/dataset/headlines.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [82]:
df.shape

(26709, 3)

In [83]:
df.isnull().sum()

article_link    0
headline        0
is_sarcastic    0
dtype: int64

In [84]:
wordnet = WordNetLemmatizer()
def clean_text(text):
  text = re.sub('[^a-zA-Z]', ' ',text)
  text = text.lower()
  text = text.split(' ')
  text = [wordnet.lemmatize(word) for word in text if word not in stopwords.words('english')]
  text = ' '.join(text)
  return text

In [85]:
df['cleaned_text'] = df['headline'].apply(clean_text)

In [86]:
df.head()

Unnamed: 0,article_link,headline,is_sarcastic,cleaned_text
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,former versace store clerk sue secret black c...
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,roseanne revival catch thorny political mood...
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,mom starting fear son web series closest thing...
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,boehner want wife listen come alternative deb...
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,j k rowling wish snape happy birthday magical...


In [87]:
cv = CountVectorizer(max_df=0.99, min_df=0.005)

In [88]:
vect = cv.fit_transform(df['cleaned_text'])

In [89]:
vect = vect.toarray()

In [90]:
token2idx = cv.vocabulary_
vocab_size = len(token2idx)


idx2token = {id:token for token,id in token2idx.items()} 

In [120]:
inputs = torch.tensor(vect,dtype=torch.float32)
target = torch.tensor(df[['is_sarcastic']].values,dtype=torch.float32)

dataset = TensorDataset(inputs,target)

In [121]:
val_split = 0.2
val_size = int(len(dataset) * val_split)
train_size = len(dataset) - val_size

train_ds,val_ds = random_split(dataset,[train_size,val_size])

train_loader = DataLoader(train_ds,batch_size=128,shuffle=True,num_workers=3,pin_memory=True)
val_loader = DataLoader(val_ds,batch_size=128,shuffle=True,num_workers=3,pin_memory=True)

In [122]:
for input,labels in val_loader:
  print(inputs.shape,labels.shape)
  break

torch.Size([26709, 155]) torch.Size([128, 1])


In [123]:
# Defining the Model
class classification_model(nn.Module):

  def __init__(self):
    super().__init__()
    self.Linear1 = nn.Linear(vocab_size,128)
    self.Linear2 = nn.Linear(128,64)
    self.Linear3 = nn.Linear(64,1)

  def forward(self,xb):
    out = self.Linear1(xb)
    out = F.relu(out)
    out = self.Linear2(out)
    out = F.relu(out)
    out = self.Linear3(out)
    out = F.sigmoid(out)
    return out

In [124]:
# Initializing the model
model = classification_model()

In [125]:
# Moving data to GPU if available
def get_device():
  if torch.cuda.is_available():
    return torch.device('cuda')
  else:
    return torch.device('cpu')

device = get_device()
device

device(type='cpu')

In [126]:
model = model.to(device)

In [131]:
def fit(num_epochs):
  # Initailizing the optimizer
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  train_losses=[]

  # Training for some epochs
  for epoch in range(num_epochs):
    losses = []
    for input,target in train_loader:
      # Moving the batches of data to device
      input = input.to(device)
      target = target.to(device)

      # Predicting the output
      out = model(input)

      # Calculating the loss
      loss = F.binary_cross_entropy(out,target)

      # Updating the weights and bias
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
      losses.append(loss)

    # Epoch loss
    epoch_loss = torch.stack(losses).mean()
    
    #Appending all the epoch loss 
    train_losses.append(epoch_loss)

    print("Epoch: {} , Loss: {}".format(epoch+1,epoch_loss.item()))
  return train_losses

In [132]:
history = fit(10)



Epoch: 1 , Loss: 0.5766692757606506
Epoch: 2 , Loss: 0.5688551664352417
Epoch: 3 , Loss: 0.5611430406570435
Epoch: 4 , Loss: 0.550683319568634
Epoch: 5 , Loss: 0.5400444269180298
Epoch: 6 , Loss: 0.5289016366004944
Epoch: 7 , Loss: 0.518133819103241
Epoch: 8 , Loss: 0.5071132183074951
Epoch: 9 , Loss: 0.49874645471572876
Epoch: 10 , Loss: 0.4898260831832886


In [138]:
input,target = val_ds[0]
input

tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [150]:
for input,target in val_loader:
  preds = model(input)
  break



In [153]:
preds = preds>0.5
from sklearn.metrics import classification_report
print(classification_report(preds.detach(),target))


              precision    recall  f1-score   support

       False       0.77      0.58      0.66        81
        True       0.49      0.70      0.58        47

    accuracy                           0.62       128
   macro avg       0.63      0.64      0.62       128
weighted avg       0.67      0.62      0.63       128



In [147]:
pred = model(input)
print(pred.item())
print(target)

0.40181946754455566
tensor([1.])


