In [None]:
from google.colab import files
files.upload() #upload kaggle.json

!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
kaggle.json


In [None]:
!kaggle datasets download -d kazanova/sentiment140


Downloading sentiment140.zip to /content
 94% 76.0M/80.9M [00:00<00:00, 69.3MB/s]
100% 80.9M/80.9M [00:00<00:00, 98.0MB/s]


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torchtext.vocab import GloVe
import torch
from torchtext import data
from torch.utils.data import DataLoader,Dataset
from nltk.stem import WordNetLemmatizer,PorterStemmer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import nltk
import pickle
from torch import nn
import torch.nn.functional as F
import string

from sklearn.metrics import roc_auc_score


In [None]:
DATASET_ENCODING = "ISO-8859-1"
SEED = 2020
EMBED_SIZE = 200
MAX_FE = 1000
MAX_LEN = 96

torch.manual_seed(SEED)
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.deterministic = True

## Reading the Dataset

In [None]:
def preprocess_text(df):
    '''Function to preprocess and create corpus'''
    new_corpus=[]
    vocab={}

    for text in tqdm(df["tweet"]):
        tweet = text.maketrans(string.punctuation, ' '*len(string.punctuation))
        words=[w.translate(tweet) for w in re.sub(r'^@[a-zA-Z]+(_[a-zA-Z]+)?','',text).split() ]
        for word in words:
          try:  
            vocab[word]+=1
          except:
            vocab[word]=1

        new_corpus.append(words)
    return new_corpus,vocab



In [None]:
def tokenizer(corpus,mode='train'):

  model_save_name = 'tokenizer.pickle'
  path = F"{model_save_name}" 
     

  if mode=="train":
    tokenizer_obj=Tokenizer()
    tokenizer_obj.fit_on_texts(corpus)
    word_index=tokenizer_obj.word_index

    with open(path,'wb') as tok:
      pickle.dump(tokenizer_obj,tok,protocol=pickle.HIGHEST_PROTOCOL)

  else:
 
    word_index=None
    with open('tokenizer.pickle','rb') as tok:
       tokenizer= pickle.load(tok)

  
  sequences=tokenizer_obj.texts_to_sequences(corpus)
  tweet_pad=pad_sequences(sequences,
                            maxlen=MAX_LEN,
                            truncating='post',
                            padding='post')
    
  return tweet_pad,word_index 
    

In [None]:
def load_glove():
    
    embedding_dict={}
    with open('/content/glove.840B.200d.txt') as f:
        for line in tqdm(f):
            values=line.split()
            word=values[0]
            try:
              vectors=np.asarray(values[1:],'float32')
              embedding_dict[word]=vectors
            except:
              continue
    f.close()
    
    return embedding_dict




In [None]:
from collections import defaultdict
def prepare_matrix(word_index):
    embedding_dict = GloVe("twitter.27B",dim=200)
    iiv= defaultdict(int)
    oov= defaultdict(int)
    num_words = len(word_index)
    embedding_matrix = np.zeros((num_words+1, 200))

    for word, i in tqdm(word_index.items()):
        if i > num_words:
            continue

        
        emb_vec = embedding_dict[word]
        if not torch.equal(emb_vec,torch.zeros((200),dtype=torch.float)):
          embedding_matrix[i] = emb_vec
          iiv[word]=vocab[word]

        elif torch.equal(embedding_dict[word.lower()],torch.zeros((200),dtype=torch.float)):
          emb_vec = embedding_dict[word.lower()]
          embedding_matrix[i] = emb_vec
          iiv[word]=vocab[word]

        elif torch.equal(embedding_dict[word.title()],torch.zeros((200),dtype=torch.float)):
          emb_vec = embedding_dict[word.title()]
          embedding_matrix[i] = emb_vec
          iiv[word]=vocab[word]

        else:
          oov[word] = vocab[word]

    return embedding_matrix,iiv,oov

#### Model taken from https://www.kaggle.com/artgor/text-modelling-in-pytorch

In [None]:
class TweetModel(nn.Module):

    def __init__(self, embedding_matrix, lstm_hidden_size=200, gru_hidden_size=128):

        super(TweetModel,self).__init__()
        self.embedding = nn.Embedding(*embedding_matrix.shape)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = True
        self.embedding_dropout = nn.Dropout2d(0.1)

        self.gru = nn.GRU(embedding_matrix.shape[1] , gru_hidden_size,num_layers=1, bidirectional=True, batch_first=True)

        self.dropout2 = nn.Dropout(0.25)
        self.Linear1 = nn.Linear(gru_hidden_size*5,16)
        self.Linear2 = nn.Linear(16,1)


    def forward(self, x):
        batch_size = x.size(0)
        h_embedding = self.embedding(x)

        x, (x_h,x_c) = self.gru(h_embedding)

        avg_pool = torch.mean(x, 1)
        max_pool, _ = torch.max(x, 1)
        concat = torch.cat((avg_pool,x_h,max_pool), 1)
        concat = self.Linear1(concat)
        out = torch.sigmoid(self.Linear2(concat))
        return out

## Datagenerators

In [None]:
class TweetDataset(Dataset):

  def __init__(self,train,targets=None,mode="train"):

    self.train = train
    self.mode=mode
    self.targets = targets
    
  def __len__(self):

    return len(self.train)

  def __getitem__(self,idx):

    x_train_fold =  torch.tensor(self.train[idx],dtype=torch.long).to(DEVICE)
    if self.mode=='train':
        y_train_fold = torch.tensor(self.targets[idx],dtype=torch.float32).to(DEVICE)
        return x_train_fold, y_train_fold
    else:
        return x_train_fold,0



## Train Model

In [None]:
def train_model(train,target,embedding_matrix,nepochs=5,batch_size=64,test_split=0.05):

      model_save_name = 'classifier.pt'
      path = F"{model_save_name}" 
     
      X_train,X_test,y_train,y_test = train_test_split(train,target,test_size = test_split)

      train_data = TweetDataset(X_train,y_train)
      test_data = TweetDataset(X_test,y_test)

      dataloaders= {"train":DataLoader(train_data,batch_size=batch_size,shuffle=True),
                    "valid":DataLoader(test_data,batch_size=batch_size,shuffle=True)}

      model = TweetModel(embedding_matrix).to(DEVICE)
      loss_fn = torch.nn.BCELoss().cuda()
      optimizer = torch.optim.Adam(model.parameters(),lr=1e-3,weight_decay=1e-5)
      scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, eps=1e-4, verbose=True)

      best_loss = {'train':np.inf,
                  "valid":np.inf}

      for epoch in range(nepochs):

        epoch_loss = {"train":0.00,
                      "valid":0.00}

        for phase in ['train','valid']:

              if phase=="train":
                model = model.train()
              else:
                model= model.eval()

              running_loss = 0.00

              for i,(x,y) in enumerate(dataloaders['train']):

                  optimizer.zero_grad()
                  with torch.set_grad_enabled(phase=="train"):

                    predict = model(x).squeeze()
                    loss = loss_fn(predict,y)



                    if phase == "train":

                      loss.backward()
                      optimizer.step()

                  running_loss+= loss.item()/len(dataloaders[phase])

                  epoch_loss[phase]=running_loss

        
        print("Epoch {}/{}   -   loss: {:5.5f}   -   val_loss: {:5.5f}".format(epoch+1, nepochs, epoch_loss['train'], epoch_loss['valid']))
        scheduler.step(epoch_loss['valid'])
        if epoch_loss['valid'] < best_loss['valid']:

          print("saving model...")
          best_loss = epoch_loss
          torch.save(model.state_dict(),path,_use_new_zipfile_serialization=False)


    










## Main

In [None]:
from sklearn.utils import shuffle
import re

In [None]:
df = pd.read_csv("/content/sentiment140.zip",encoding=DATASET_ENCODING)
df= df.iloc[:,[0,-1]]
df.columns = ['sentiment','tweet']
df = pd.concat([df.query("sentiment==0").sample(120000),df.query("sentiment==4").sample(120000)])
df.sentiment = df.sentiment.map({0:1,4:0})
df =  shuffle(df).reset_index(drop=True)

In [None]:
df.sentiment.value_counts()

1    120000
0    120000
Name: sentiment, dtype: int64

In [None]:
corpus,vocab = preprocess_text(df)

100%|██████████| 240000/240000 [00:04<00:00, 59985.97it/s]


In [None]:
len(vocab)

231943

In [None]:
tweet_pad,word_index = tokenizer(corpus)

In [None]:

#embedding_dict = load_glove()


In [None]:
embedding_matrix = prepare_matrix(vocab)

.vector_cache/glove.twitter.27B.zip: 1.52GB [11:40, 2.17MB/s]                            
100%|█████████▉| 1192133/1193514 [01:22<00:00, 15087.15it/s]
  0%|          | 0/231943 [00:00<?, ?it/s][A
  1%|▏         | 3442/231943 [00:00<00:06, 34416.83it/s][A
  3%|▎         | 6816/231943 [00:00<00:06, 34203.19it/s][A
  4%|▍         | 10186/231943 [00:00<00:06, 34048.14it/s][A
  6%|▌         | 13750/231943 [00:00<00:06, 34508.21it/s][A
  8%|▊         | 17500/231943 [00:00<00:06, 35353.23it/s][A
  9%|▉         | 21151/231943 [00:00<00:05, 35690.91it/s][A
 11%|█         | 24433/231943 [00:00<00:05, 34775.33it/s][A
 12%|█▏        | 27636/231943 [00:00<00:06, 29222.55it/s][A
 13%|█▎        | 30517/231943 [00:00<00:07, 28625.58it/s][A
 15%|█▍        | 34335/231943 [00:01<00:06, 30948.09it/s][A
 16%|█▋        | 38062/231943 [00:01<00:05, 32606.83it/s][A
 18%|█▊        | 41577/231943 [00:01<00:05, 33327.15it/s][A
 20%|█▉        | 45252/231943 [00:01<00:05, 34284.25it/s][A
 21%|██     

In [None]:
df.sample(10)

Unnamed: 0,sentiment,tweet
190784,0,@kyteacher for printing/viewing 2/2
79390,0,@aNorthernSoul That's a nice pic. Thx for shar...
152020,0,is reading 7th Period is a Secret. http://plu...
20810,0,twiter twitter guyys....we r goiin crazy in bu...
35023,0,@bethanyvarrone aj holyfield yeahhh hes sexyy
219591,0,@Emsy I'm sure they're ALL still asleep! You'...
58051,0,Jst had an economics exam that went well and n...
187498,1,@manny138 i miss not seeing you at 8
91064,1,@courtcothren we miss you... wish we could hea...
120092,0,@mileycyrus: hey miley!! I just wanted to let ...


## Train

In [None]:
X_train,X_test,y_train,y_test = train_test_split(tweet_pad,df.sentiment.values,test_size=0.1)

In [None]:
train_model(X_train,y_train,embedding_matrix[0],nepochs=40)

100%|█████████▉| 1192133/1193514 [01:40<00:00, 15087.15it/s]

Epoch 1/40   -   loss: 0.46641   -   val_loss: 6.96229
saving model...
Epoch 2/40   -   loss: 0.39392   -   val_loss: 5.88400
saving model...
Epoch 3/40   -   loss: 0.33149   -   val_loss: 4.25448
saving model...
Epoch 4/40   -   loss: 0.24637   -   val_loss: 2.83733
saving model...
Epoch 5/40   -   loss: 0.18314   -   val_loss: 2.15942
saving model...
Epoch 6/40   -   loss: 0.15002   -   val_loss: 1.62253
saving model...
Epoch 7/40   -   loss: 0.12933   -   val_loss: 1.46513
saving model...
Epoch 8/40   -   loss: 0.11456   -   val_loss: 1.38749
saving model...
Epoch 9/40   -   loss: 0.10197   -   val_loss: 1.12508
saving model...
Epoch 10/40   -   loss: 0.09648   -   val_loss: 0.99576
saving model...
Epoch 11/40   -   loss: 0.09031   -   val_loss: 0.93475
saving model...
Epoch 12/40   -   loss: 0.08470   -   val_loss: 1.00877
Epoch 13/40   -   loss: 0.08046   -   val_loss: 0.83449
saving model...
Epoch 14/40   -   loss: 0.07608   -   val_loss: 0.77518
saving model...
Epoch 15/40   -  

## Inference

In [None]:
dataloader = DataLoader(TweetDataset(X_test,mode="test"),batch_size=32,shuffle=False)
model = TweetModel(embedding_matrix=embedding_matrix[0]).to(DEVICE)
model.load_state_dict(torch.load("classifier.pt"))
model.eval()
batch_preds = []
predictions=[]
for x,y in dataloader:
  x= x.to(DEVICE)
  with torch.no_grad():
    batch_preds = model(x)

  predictions.append(batch_preds)

predictions = torch.cat(predictions, dim=0).cpu().numpy()




In [None]:
print(roc_auc_score(np.round(predictions).squeeze(),y_test))

0.7679167383336202


In [None]:
from google.colab import files
files.download("tokenizer.pickle")
files.download("classifier.pt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>