In [None]:
import numpy as np
import pandas as pd
import re 

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import Dataset

In [None]:
# Read both CSV splits and stack them row-wise into one frame. Each file's
# own row labels are kept, so index values repeat across the two parts.
csv_paths = ['/content/train.csv', "/content/test.csv"]
data = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths], axis=0)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89536 entries, 0 to 8127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweets  89534 non-null  object
 1   class   89527 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


In [None]:
data.head()

Unnamed: 0,tweets,class
0,Be aware dirty step to get money #staylight ...,figurative
1,#sarcasm for #people who don't understand #diy...,figurative
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative
3,@wilw Why do I get the feeling you like games?...,figurative
4,-@TeacherArthurG @rweingarten You probably jus...,figurative


In [None]:
# List the distinct labels. Rows with a missing label show up as the string
# 'nan' because the mixed str/float list is converted to a string array.
np.unique(list(data["class"]))

array(['figurative', 'irony', 'nan', 'regular', 'sarcasm'], dtype='<U32')

In [None]:
# Drop every row with a missing tweet or a missing label, then re-inspect.
# The name is rebound instead of using `inplace=True`, so the cell reads as an
# explicit "new frame from old frame" step and can be chained later.
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89527 entries, 0 to 8127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweets  89527 non-null  object
 1   class   89527 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


In [None]:
class Vectorizer():
    """Text cleaner combined with a TF-IDF featurizer.

    Texts are optionally scrubbed with a regular expression, lower-cased and
    stripped, and then handed to a scikit-learn ``TfidfVectorizer``.

    Attributes:
        clean_pattern: Regex whose matches are replaced by a single space, or
            ``None`` to skip the regex step.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        stopwords: Forwarded to ``TfidfVectorizer(stop_words=...)``.
        tfidf: The wrapped ``TfidfVectorizer`` instance.
        builded: ``True`` once ``build_vectorizer`` has been called.
    """

    def __init__(self,clean_pattern=None,max_features=None,stop_words=None):
        """Store the cleaning options and create the (unfitted) TF-IDF model."""
        self.clean_pattern = clean_pattern
        self.max_features = max_features
        self.stopwords = stop_words
        self.tfidf = TfidfVectorizer(stop_words=self.stopwords,max_features=self.max_features)
        self.builded = False

    def _clean_texts(self,texts):
        """Return a list with every text regex-scrubbed, lower-cased and stripped.

        Args:
            texts: Iterable of strings.

        Returns:
            A new list of cleaned strings, in the same order as ``texts``.
        """
        if self.clean_pattern is None:
            return [text.lower().strip() for text in texts]

        # Compile once per call instead of handing the raw pattern to re.sub
        # for every single text.
        scrub = re.compile(self.clean_pattern).sub
        return [scrub(" ",text).lower().strip() for text in texts]

    def _set_tfidf(self,cleaned_texts):
        """Fit the wrapped TF-IDF model on already-cleaned texts."""
        self.tfidf.fit(cleaned_texts)

    def build_vectorizer(self,texts):
        """Clean ``texts``, fit the TF-IDF model on them and mark it as built."""
        cleaned_texts = self._clean_texts(texts)
        self._set_tfidf(cleaned_texts)
        self.builded = True

    def vectorizeTexts(self,texts):
        """Clean ``texts`` and transform them with the fitted TF-IDF model.

        Returns:
            Whatever ``TfidfVectorizer.transform`` returns for the cleaned texts.

        Raises:
            RuntimeError: If ``build_vectorizer`` has not been called yet.
                ``RuntimeError`` derives from ``Exception``, so callers that
                catch ``Exception`` keep working.
        """
        if not self.builded:
            raise RuntimeError("Vectorizer is not builded.")

        cleaned_texts = self._clean_texts(texts)
        return self.tfidf.transform(cleaned_texts)
            
            

In [None]:
# Pull the raw tweet texts and their string labels out as plain Python lists.
x = data["tweets"].tolist()
y = data["class"].tolist()

In [None]:
# Replace everything that is not an ASCII letter or digit with a space, drop
# English stop words, and pass a 7000-term cap to the TF-IDF model.
vectorizer = Vectorizer(
    clean_pattern="[^a-zA-Z0-9]",
    max_features=7000,
    stop_words="english",
)

In [None]:
# Fit the TF-IDF model on every tweet; note this happens before the
# train/validation split, so held-out rows also shape the vocabulary.
vectorizer.build_vectorizer(x)

In [None]:
# Densify the TF-IDF matrix for the Dataset. Casting to float32 *before*
# `.toarray()` halves the memory of the dense array; the model converts its
# input to float32 in `forward` anyway, so no precision used downstream is lost.
vectorized_x = vectorizer.vectorizeTexts(x).astype(np.float32).toarray()

In [None]:
vectorized_x.shape

(89527, 7000)

In [None]:
# String class name -> integer class id used as the training target.
label_map = dict(
    figurative=0,
    sarcasm=1,
    irony=2,
    regular=3,
)

In [None]:
# Translate every string label into its integer id and keep the ids as an array.
y_encoded = np.asarray([label_map[label] for label in y])

In [None]:
y_encoded.shape

(89527,)

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset that pairs each feature row with its encoded label."""

    def __init__(self,x_vectorized,y_encoded):
        """Keep references to the feature container and the label container."""
        self.x_vectorized = x_vectorized
        self.y_encoded = y_encoded

    def __len__(self):
        """Number of samples, taken from the feature container."""
        sample_count = len(self.x_vectorized)
        return sample_count

    def __getitem__(self,index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.x_vectorized[index]
        label = self.y_encoded[index]
        return features, label

In [None]:
# Wrap the dense feature matrix and the integer labels in one Dataset.
dataset = TweetDataset(vectorized_x,y_encoded)
print("Length of our dataset is",len(dataset))

# Peek at a single (features, label) pair as a sanity check.
print(dataset[2])

Length of our dataset is 89527
(array([0., 0., 0., ..., 0., 0., 0.]), 0)


In [None]:
# Split sample positions (not the data itself) 75/25; the fixed random_state
# makes the split repeatable.
all_indices = list(range(len(dataset)))
train_indices,test_indices = train_test_split(all_indices,test_size=0.25,random_state=42)

In [None]:
print(len(train_indices))
print(len(test_indices))

67145
22382


In [None]:
# Each sampler restricts a DataLoader to its own index subset and draws those
# indices in random order.
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

In [None]:
BATCH_SIZE = 128
# Both loaders wrap the same full dataset; the sampler decides which rows
# each loader is allowed to draw.
train_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, 
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE,
                                                sampler=test_sampler)

In [None]:
class DenseNetwork(nn.Module):
    """Feed-forward classifier with two hidden layers and dropout.

    The layer sizes used to be hard-coded; they are now constructor arguments
    whose defaults reproduce the original 7000 -> 1024 -> 256 -> 4 network, so
    ``DenseNetwork()`` builds exactly the same architecture as before.

    Args:
        input_dim: Length of each input feature vector.
        hidden1_dim: Width of the first hidden layer.
        hidden2_dim: Width of the second hidden layer.
        num_classes: Number of output classes.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self,input_dim=7000,hidden1_dim=1024,hidden2_dim=256,num_classes=4,dropout=0.4):
        super(DenseNetwork,self).__init__()
        # Attribute names are unchanged so existing state dicts still load.
        self.fc1 = nn.Linear(input_dim,hidden1_dim)
        self.drop1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden1_dim,hidden2_dim)
        self.drop2 = nn.Dropout(dropout)
        self.prediction = nn.Linear(hidden2_dim,num_classes)

    def forward(self,x):
        """Return per-class log-probabilities for a batch of feature vectors.

        Args:
            x: Tensor whose last dimension is ``input_dim``; it is cast to
                ``torch.float`` before the first layer, so other dtypes such
                as float64 are accepted.

        Returns:
            Tensor of ``log_softmax`` outputs taken over ``dim=1``.
        """
        x = F.relu(self.fc1(x.to(torch.float)))
        x = self.drop1(x)
        x = F.relu(self.fc2(x))
        x = self.drop2(x)
        x = F.log_softmax(self.prediction(x),dim=1)

        return x

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook does not depend on CUDA being present.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
pip install cuda-python

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Build the network with its default layer sizes. It is not moved to `device`
# here, so it stays wherever PyTorch creates parameters by default.
model = DenseNetwork()

In [None]:
# CrossEntropyLoss applies log-softmax itself, while DenseNetwork.forward
# already returns log_softmax output. Log-softmax of values that are already
# log-probabilities leaves them unchanged, so the loss value is not affected;
# the second normalisation is merely redundant.
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters(),lr=1e-3)

In [None]:
EPOCHS = 6
TRAIN_LOSSES = []      # per-epoch SUM of batch losses (not the mean)
TRAIN_ACCURACIES = []  # per-epoch training accuracy in percent

# Batches have to live on the same device as the model's weights; reading it
# from the model keeps this cell correct whether or not the model was moved.
model_device = next(model.parameters()).device

for epoch in range(1,EPOCHS+1):
    # Dropout must be active while training, even if an earlier cell (e.g. an
    # evaluation run) switched the model to eval mode.
    model.train()

    epoch_loss = 0.0
    epoch_true = 0
    epoch_total = 0
    for data_,target_ in train_loader:
        data_ = data_.to(model_device)
        target_ = target_.to(model_device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        outputs = model(data_)
        loss = criterion(outputs,target_)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Predicted class = index of the largest log-probability per row.
        pred = outputs.argmax(dim=1)
        epoch_true += (pred == target_).sum().item()
        epoch_total += target_.size(0)

    TRAIN_LOSSES.append(epoch_loss)
    TRAIN_ACCURACIES.append(100 * epoch_true / epoch_total)

    print(f"Epoch {epoch}/{EPOCHS} finished: train_loss = {epoch_loss}, train_accuracy = {TRAIN_ACCURACIES[epoch-1]}")
    

Epoch 1/6 finished: train_loss = 287.21683526039124, train_accuracy = 72.1602502047807
Epoch 2/6 finished: train_loss = 260.79608631134033, train_accuracy = 73.83572864695807
Epoch 3/6 finished: train_loss = 255.35175228118896, train_accuracy = 74.21401444634746
Epoch 4/6 finished: train_loss = 245.90065678954124, train_accuracy = 74.91697073497654
Epoch 5/6 finished: train_loss = 235.4599128961563, train_accuracy = 76.0890609874153
Epoch 6/6 finished: train_loss = 223.23479411005974, train_accuracy = 77.31029860749125


In [None]:
test_true = 0
test_total = 0
test_loss = 0.0  # SUM of batch losses, matching how train_loss is reported

model_device = next(model.parameters()).device

# Evaluate with dropout disabled, then put the model back into whatever mode
# it was in so re-running the training cell behaves as before.
was_training = model.training
model.eval()
with torch.no_grad():
    # Iterate over the held-out batches explicitly. Without this loop the
    # names data_/target_ would refer to whatever batch the training cell
    # left behind, and only that single batch would be scored.
    for data_,target_ in validation_loader:
        data_ = data_.to(model_device)
        target_ = target_.to(model_device)

        outputs = model(data_)

        test_loss += criterion(outputs,target_).item()

        pred = outputs.argmax(dim=1)
        test_true += (pred == target_).sum().item()
        # Count what was actually evaluated so the accuracy denominator
        # always matches the numerator.
        test_total += target_.size(0)
model.train(was_training)

print(f"Validation finished: Accuracy = {round(100 * (test_true / test_total),2)}%, Loss = {test_loss}")


Validation finished: Accuracy = 0.28%, Loss = 0.34214940667152405


In [None]:
# FIXME: the notebook path passed to nbconvert is an empty string; put the
# .ipynb file name between the quotes before running this export.
!jupyter nbconvert --to html ""