In [1]:
from google.colab import files
files.upload() #upload kaggle.json

!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
kaggle.json


In [2]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [3]:
!kaggle datasets download -d kazanova/sentiment140
#!kaggle datasets download -d takuok/glove840b300dtxt
#!unzip /content/glove840b300dtxt.zip

Downloading sentiment140.zip to /content
 94% 76.0M/80.9M [00:01<00:00, 72.5MB/s]
100% 80.9M/80.9M [00:01<00:00, 64.3MB/s]


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torchtext.vocab import GloVe
import torch
from torchtext import data
from torch.utils.data import DataLoader,Dataset
from nltk.stem import WordNetLemmatizer,PorterStemmer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import nltk
import pickle
from torch import nn
import torch.nn.functional as F
nltk.download('all')
nltk.download('punkt')
stop=stopwords.words("english")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

In [5]:
DATASET_ENCODING = "ISO-8859-1"
SEED = 2020
EMBED_SIZE = 300
MAX_FE = 1000
MAX_LEN = 80
torch.manual_seed(SEED)
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.deterministic = True

## Reading the Dataset

In [6]:
def preprocess_text(df):
    '''Function to preprocess and create corpus'''
    new_corpus=[]
    vocab={}
    for text in tqdm(df["tweet"]):
        words=[w for w in word_tokenize(re.sub(r'^@[a-zA-Z]+','',text)) ]
        for word in words:
          try:  
            vocab[word]+=1
          except:
            vocab[word]=1

        new_corpus.append(words)
    return new_corpus,vocab



In [7]:
def tokenizer(corpus,mode='train'):

  model_save_name = 'tokenizer.pickle'
  path = F"{model_save_name}" 
     

  if mode=="train":
    tokenizer_obj=Tokenizer()
    tokenizer_obj.fit_on_texts(corpus)
    word_index=tokenizer_obj.word_index

    with open(path,'wb') as tok:
      pickle.dump(tokenizer_obj,tok,protocol=pickle.HIGHEST_PROTOCOL)

  else:
 
    word_index=None
    with open('tokenizer.pickle','rb') as tok:
       tokenizer= pickle.load(tok)

  
  sequences=tokenizer_obj.texts_to_sequences(corpus)
  tweet_pad=pad_sequences(sequences,
                            maxlen=MAX_LEN,
                            truncating='post',
                            padding='post')
    
  return tweet_pad,word_index 
    

In [8]:
def load_glove():
    
    embedding_dict={}
    with open('/content/glove.840B.300d.txt') as f:
        for line in tqdm(f):
            values=line.split()
            word=values[0]
            try:
              vectors=np.asarray(values[1:],'float32')
              embedding_dict[word]=vectors
            except:
              continue
    f.close()
    
    return embedding_dict




In [9]:
from collections import defaultdict
def prepare_matrix(word_index):
    embedding_dict = GloVe("twitter.27B",dim=200)
    iiv= defaultdict(int)
    oov= defaultdict(int)
    num_words = len(word_index)
    embedding_matrix = np.zeros((num_words+1, 200))

    for word, i in tqdm(word_index.items()):
        if i > num_words:
            continue

        
        emb_vec = embedding_dict[word]
        if not torch.equal(emb_vec,torch.zeros((200),dtype=torch.float)):
          embedding_matrix[i] = emb_vec
          iiv[word]=vocab[word]

        elif torch.equal(embedding_dict[word.lower()],torch.zeros((200),dtype=torch.float)):
          emb_vec = embedding_dict[word.lower()]
          embedding_matrix[i] = emb_vec
          iiv[word]=vocab[word]

        elif torch.equal(embedding_dict[word.title()],torch.zeros((200),dtype=torch.float)):
          emb_vec = embedding_dict[word.title()]
          embedding_matrix[i] = emb_vec
          iiv[word]=vocab[word]

        else:
          oov[word] = vocab[word]

    return embedding_matrix,iiv,oov

#### Model taken from https://www.kaggle.com/artgor/text-modelling-in-pytorch

In [10]:
class TweetModel(nn.Module):

    def __init__(self, embedding_matrix, lstm_hidden_size=256, gru_hidden_size=64):

        super(TweetModel,self).__init__()
        self.embedding = nn.Embedding(*embedding_matrix.shape)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = nn.Dropout2d(0.1)

        self.lstm = nn.LSTM(embedding_matrix.shape[1], lstm_hidden_size, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_hidden_size*2 , gru_hidden_size, bidirectional=True, batch_first=True)
        self.dropout2 = nn.Dropout(0.25)
        self.Linear1 = nn.Linear(gru_hidden_size*4,64)
        self.Linear2 = nn.Linear(64,1)
        
    def apply_spatial_dropout(self, h_embedding):
        h_embedding = h_embedding.transpose(1, 2).unsqueeze(2)
        h_embedding = self.embedding_dropout(h_embedding).squeeze(2).transpose(1, 2)
        return h_embedding
    
    def flatten_parameters(self):
        self.lstm.flatten_parameters()
        self.lstm2.flatten_parameters()

    def forward(self, x):
        batch_size = x.size(0)
        h_embedding = self.embedding(x)
        h_embedding = self.apply_spatial_dropout(h_embedding)

        h_lstm, _ = self.lstm(h_embedding)
        h_lstm, _ = self.lstm2(h_lstm)

        avg_pool = torch.mean(h_lstm, 1)
        max_pool, _ = torch.max(h_lstm, 1)
        concat = self.Linear1(torch.cat((avg_pool, max_pool), 1))
        #out = self.dropout2(concat)
        out = torch.sigmoid(self.Linear2(concat))
        return out

## Datagenerators

In [11]:
class TweetDataset(Dataset):

  def __init__(self,train,targets=None,mode="train"):

    self.train = train
    self.mode=mode
    self.targets = targets
    
  def __len__(self):

    return len(self.train)

  def __getitem__(self,idx):

    x_train_fold =  torch.tensor(self.train[idx],dtype=torch.long).to(DEVICE)
    if self.mode=='train':
        y_train_fold = torch.tensor(self.targets[idx],dtype=torch.float32).to(DEVICE)
        return x_train_fold, y_train_fold
    else:
        return x_train_fold,0



## Train Model

In [12]:
def train_model(train,target,embedding_matrix,nepochs=5,batch_size=32,test_split=0.15):

      model_save_name = 'classifier.pt'
      path = F"{model_save_name}" 
     
      X_train,X_test,y_train,y_test = train_test_split(train,target,test_size = test_split)

      train_data = TweetDataset(X_train,y_train)
      test_data = TweetDataset(X_test,y_test)

      dataloaders= {"train":DataLoader(train_data,batch_size=batch_size,shuffle=True),
                    "valid":DataLoader(test_data,batch_size=batch_size,shuffle=False)}

      model = TweetModel(embedding_matrix).to(DEVICE)
      loss_fn = torch.nn.BCELoss().cuda()
      optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)
      scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, eps=1e-4, verbose=True)

      best_loss = {'train':np.inf,
                  "valid":np.inf}

      for epoch in range(nepochs):

        epoch_loss = {"train":0.00,
                      "valid":0.00}

        for phase in ['train','valid']:

              if phase=="train":
                model = model.train()
              else:
                model= model.eval()

              running_loss = 0.00

              for i,(x,y) in enumerate(dataloaders['train']):

                  optimizer.zero_grad()
                  with torch.set_grad_enabled(phase=="train"):

                    predict = model(x).squeeze()
                    loss = loss_fn(predict,y)



                    if phase == "train":

                      loss.backward()
                      optimizer.step()

                  running_loss+= loss.item()/len(dataloaders[phase])

                  epoch_loss[phase]=running_loss

        
        print("Epoch {}/{}   -   loss: {:5.5f}   -   val_loss: {:5.5f}".format(epoch+1, nepochs, epoch_loss['train'], epoch_loss['valid']))
        scheduler.step(epoch_loss['valid'])
        if epoch_loss['valid'] < best_loss['valid']:

          print("saving model...")
          best_loss = epoch_loss
          torch.save(model.state_dict(),path,_use_new_zipfile_serialization=False)


    










## Main

In [13]:
from sklearn.utils import shuffle
import re

In [14]:
df = pd.read_csv("/content/sentiment140.zip",encoding=DATASET_ENCODING)
df= df.iloc[:,[0,-1]]
df.columns = ['sentiment','tweet']
df = pd.concat([df.query("sentiment==0").sample(110000),df.query("sentiment==4").sample(110000)])
df.sentiment = df.sentiment.map({0:0,4:1})
df =  shuffle(df).reset_index(drop=True)

In [15]:
corpus,vocab = preprocess_text(df)

100%|██████████| 220000/220000 [00:51<00:00, 4239.29it/s]


In [16]:
len(vocab)

163806

In [17]:
tweet_pad,word_index = tokenizer(corpus)

In [18]:

#embedding_dict = load_glove()


In [19]:
embedding_matrix = prepare_matrix(vocab)

.vector_cache/glove.twitter.27B.zip: 1.52GB [11:44, 2.16MB/s]                           
100%|█████████▉| 1192560/1193514 [01:56<00:00, 10023.62it/s]
  0%|          | 0/163806 [00:00<?, ?it/s][A
  0%|          | 96/163806 [00:00<02:50, 959.92it/s][A
  1%|▏         | 2402/163806 [00:00<01:59, 1347.28it/s][A
  3%|▎         | 4284/163806 [00:00<01:25, 1867.39it/s][A
  4%|▍         | 6536/163806 [00:00<01:01, 2576.11it/s][A
  5%|▌         | 8735/163806 [00:00<00:44, 3504.19it/s][A
  7%|▋         | 11004/163806 [00:00<00:32, 4695.15it/s][A
  8%|▊         | 13002/163806 [00:00<00:24, 6093.60it/s][A
  9%|▉         | 15003/163806 [00:00<00:19, 7699.97it/s][A
 11%|█         | 17270/163806 [00:00<00:15, 9602.09it/s][A
 12%|█▏        | 19511/163806 [00:01<00:12, 11588.28it/s][A
 13%|█▎        | 21743/163806 [00:01<00:10, 13541.53it/s][A
 15%|█▍        | 23875/163806 [00:01<00:09, 15201.02it/s][A
 16%|█▌        | 26006/163806 [00:01<00:08, 16101.54it/s][A
 17%|█▋        | 28214/16380

In [20]:
import operator
oov = sorted(embedding_matrix[2].items(),key=lambda x : x[1], reverse=True)
len(embedding_matrix[1])/len(vocab)

1.0

## Train

In [21]:
X_train,X_test,y_train,y_test = train_test_split(tweet_pad,df.sentiment.values,test_size=0.2)

In [None]:
train_model(X_train,y_train,embedding_matrix[0],nepochs=40,batch_size=32)

Epoch 1/40   -   loss: 0.56591   -   val_loss: 2.98936
saving model...
Epoch 2/40   -   loss: 0.52452   -   val_loss: 2.79747
saving model...
Epoch 3/40   -   loss: 0.50404   -   val_loss: 2.66345
saving model...
Epoch 4/40   -   loss: 0.48239   -   val_loss: 2.52628
saving model...
Epoch 5/40   -   loss: 0.45448   -   val_loss: 2.26931
saving model...
Epoch 6/40   -   loss: 0.42000   -   val_loss: 2.00590
saving model...
Epoch 7/40   -   loss: 0.38325   -   val_loss: 1.72894
saving model...
Epoch 8/40   -   loss: 0.34362   -   val_loss: 1.46533
saving model...
Epoch 9/40   -   loss: 0.30995   -   val_loss: 1.25364
saving model...
Epoch 10/40   -   loss: 0.28222   -   val_loss: 1.12216
saving model...
Epoch 11/40   -   loss: 0.25851   -   val_loss: 1.08625
saving model...
Epoch 12/40   -   loss: 0.24352   -   val_loss: 0.94372
saving model...
Epoch 13/40   -   loss: 0.22760   -   val_loss: 0.87189
saving model...
Epoch 14/40   -   loss: 0.22058   -   val_loss: 0.83972
saving model...
E

## Inference

In [None]:
DEVICE

In [None]:
dataloader = DataLoader(TweetDataset(X_test,mode="test"),batch_size=32,shuffle=False)
model = TweetModel(embedding_matrix=embedding_matrix[0]).to(DEVICE)
model.load_state_dict(torch.load("classifier.pt"))
model.eval()
batch_preds = []
predictions=[]
for x,y in dataloader:
  x= x.to(DEVICE)
  with torch.no_grad():
    batch_preds = model(x)

  predictions.append(batch_preds)

predictions = torch.cat(predictions, dim=0).cpu().numpy()




In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
print(roc_auc_score(np.round(predictions).squeeze(),y_test))

## Best way to understand LSTM

In [None]:
x = torch.tensor([[1,2, 12,34, 56,78, 90,80],
                 [12,45, 99,67, 6,23, 77,82],
                 [3,24, 6,99, 12,56, 21,22]])

In [None]:
 
 emb = nn.Embedding(100,100)
 model = nn.LSTM(100,hidden_size=5,num_layers=1, batch_first=True)

In [None]:
out1 = emb(x)

In [None]:
out1.shape

In [None]:
out2,(h1,c1) = model(out1)

In [None]:
out2.shape

In [None]:
mean = torch.mean(out2,1)
max,_ = torch.max(out2,1)

In [None]:
out4 = torch.cat((mean,max),1)

In [None]:
torch.nn.functional.sigmoid(nn.Linear(10,1)(out4))

In [None]:
mean.shape

In [None]:
out2 = torch.cat((mean,max),1)

In [None]:
out2.shape

In [None]:
nn.Linear(16,1)(out)

In [None]:
h1.shape

In [None]:
c1.shape

In [None]:
torch.max

In [None]:
import logging

In [None]:
logging.error("sdsa")

In [None]:
from torch.utils.model_zoo import load_url

In [None]:
model= load_url(url="https://drive.google.com/file/d/1rAvsmDZo6l1l4fpZPYKAM1D8ur3wFrYR/view?usp=sharing")

In [None]:
torch.hub.load_state_dict_from_url('https://drive.google.com/file/d/1rAvsmDZo6l1l4fpZPYKAM1D8ur3wFrYR/view?usp=sharing')

In [None]:
!kaggle datasets download -d mlg-ulb/creditcardfraud

In [None]:
import pandas as pd
df= pd.read_csv("/content/creditcardfraud.zip")

In [None]:
df = pd.concat([df.query("Class==1"),df.query("Class==0").sample(7000)]).reset_index(drop=True)

In [None]:
df.to_csv("/content/gdrive/My Drive/credit_card.csv",index=False )

In [None]:
df.Class.value_counts()