In [99]:
import pandas as pd
import torch, torchdata, torchtext
from torch import nn
import time
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 2422
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


cpu


In [100]:
torch.__version__

'1.13.1+cpu'

In [101]:
torchtext.__version__

'0.14.1'

## Load the given dataset

1. Create a variable to your dataset PATH *example:  ./data/*
2. Load the csv files using pandas 



In [102]:
data_yelp = pd.read_csv("sample_submission.csv")

train_data_raw = pd.read_csv("train.csv") 
test_data_raw = pd.read_csv("test.csv")

In [103]:
data_yelp.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [104]:
train_data_raw.shape

(7613, 5)

In [105]:
test_data_raw.shape

(3263, 4)

In [106]:
## Lets analyze the data a little

#print and show how many unique classes are in the target

classes = train_data_raw['target'].unique()
num_classes = len(classes)

In [107]:
assert num_classes > 1

In [108]:
##lets see how many columns are there
#print the columns of the train_data_raw

print(len(test_data_raw)) #write your code here

3263


1. Lets remove the keywords and location columns. We only want to focus on the text and the predictions
2. Lets split some training data to validation dataset

In [109]:
SPLIT_PER = 2 #percentage of split for validation set 2 = 2%
split =  int(len(train_data_raw) * (SPLIT_PER/100))

dropped_train = train_data_raw.drop(columns=['id','keyword', 'location'], axis =1) 
#drop the id, keyowrd and location columns from the train_data_raw

train_data = dropped_train[:-split]
valid_data = dropped_train[-split:]

assert train_data.shape == (len(train_data_raw) - split, 2)
assert valid_data.shape == (split, 2)

In [110]:
print(train_data_raw.shape)
print("After dropping columns and spliting!")
print(train_data.shape, valid_data.shape)
 

(7613, 5)
After dropping columns and spliting!
(7461, 2) (152, 2)


## Lets tokenize the data

In [111]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
tokens = tokenizer("We are learning torchtext in AIT!")  #some test
tokens

['We', 'are', 'learning', 'torchtext', 'in', 'AIT', '!']

In [114]:
for i in train_data.values:
    print (i)
    break

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all' 1]


In [115]:
#loop through the data_iter, 
# Mind that the data_iter in this case is pandas Dataframe
#remove this line and code here

from torchtext.vocab import build_vocab_from_iterator
def yield_tokens(data_iter):
    for text, _ in data_iter.values:
        yield tokenizer(text) 

vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=['<unk>', '<pad>', '<bos>', '<eos>'])
vocab.set_default_index(vocab["<unk>"])

#set_default_index of the vocab to unknown tag

In [116]:
assert len(vocab) == 26442

In [117]:
vocab_dict = vocab.get_stoi()
vocab_dict

{'bells': 17043,
 'Escaping': 12472,
 'charge': 2966,
 "'m": 59,
 'putin': 24713,
 'adoption': 6850,
 'STILL': 6529,
 'WOUNDS': 16371,
 'Mafia': 6212,
 'mass': 200,
 'Lubbock': 6184,
 'Gillibrand': 12913,
 'Scarlet': 15416,
 'depreciations': 17800,
 'Bomb': 1013,
 '@WoundedPigeon': 9949,
 'lvl': 23796,
 'advice': 16695,
 '.': 6,
 'restore': 3119,
 'dominant': 17956,
 'thunder': 475,
 '.@NorwayMFA': 8190,
 'pitch': 3866,
 '@9NewsBrisbane': 8744,
 'Odell': 14460,
 'Quarantine': 761,
 'point': 1142,
 '@ByTorrecilla': 8925,
 'fruit': 3025,
 'Some': 319,
 '&': 38,
 '@charstevens97': 10132,
 'http://t.co/JlzK2HdeTG': 7409,
 'Officer': 1265,
 'LLF': 6136,
 'WYOU': 16392,
 'US': 245,
 '...': 18,
 '@SCynic1': 9707,
 'WIN': 16358,
 'appropriation': 6878,
 'Join': 2411,
 '<bos>': 2,
 'illegal': 2239,
 'threat': 1818,
 'Zhejiang': 16615,
 'en\x89Û': 7222,
 'asswipe': 16889,
 'Reactor': 2454,
 '<unk>': 0,
 'suspect': 749,
 'Andrea': 11042,
 'Minority': 4427,
 'illusion': 23303,
 'http://t.co/0wbEcd

In [118]:
from torchtext.vocab import FastText
fast_vectors = FastText(language='simple') #small for easy training

In [119]:
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)
fast_embedding.shape

torch.Size([26442, 300])

In [121]:
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)

#since the fasttext  has 300 embedding
assert fast_embedding.shape == (len(vocab), 300)

In [122]:
text_pipeline  = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1 #turn {1, 2, 3, 4} to {0, 1, 2, 3} for pytorch training 

In [123]:
text_pipeline("I love to play football")

[13, 185, 10, 683, 2229]

In [124]:
label_pipeline('1')

0

## To fit the padnas dataframe to DataLoader first we must wrap it as DataSet

In [125]:

from torch.utils.data import Dataset

class PD_DATASET(Dataset):

    def __init__(self, dataframe):
        self.dataframe = dataframe
    
    def __len__(self):
        return len(self.dataframe)
        
    def __getitem__(self, idx):
        return self.dataframe.iloc[idx]

In [None]:
train = 
valid = 
test = 

In [131]:
from torch.utils.data   import DataLoader
from torch.nn.utils.rnn import pad_sequence

pad_idx = vocab['<pad>'] #++<----making sure our embedding layer ignores pad

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
    #criterion expects float labels
    return torch.tensor(label_list, dtype=torch.int64), pad_sequence(text_list, padding_value=pad_idx, batch_first=True)
## copy the collate_batch function from Professor's code. But it will not work right away
##  mind how the dataset that we use is structured (hint: columns)

In [132]:
batch_size = 64

train_loader = DataLoader(train, batch_size=batch_size,
                              shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(valid, batch_size=batch_size,
                              shuffle=True, collate_fn=collate_batch)

NameError: name 'train' is not defined

In [None]:
next(iter(train_loader))

## First lets try CNN

In [None]:
import torch.nn as nn
import torch.nn.functional as F

## Get the Professor's code from  the lab to build the CNN model

class CNN(nn.Module):
    pass #replace this line with the respected code

In [None]:
import torch.nn as nn

class simpleRNN(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.RNN(emb_dim, hid_dim, batch_first=True)
        self.fc  = nn.Linear(hid_dim, output_dim)
        
    def forward(self, text):
        #text = [batch size, seq len]
        embedded = self.embedding(text)
        
        #embedded = [batch size, seq len, embed dim]
        output, hn = self.rnn(embedded)  #if no h0, all zeroes
        
        #output = [batch size, seq len, hidden dim]
        #hidden = [1, batch size, hidden dim]
        
        assert torch.equal(output[:,-1,:], hn.squeeze(0))
        return self.fc(hn.squeeze(0))

In [127]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, input_dim, emb_dim, output_dim, dropout, n_filters, filter_sizes):
        
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[0], emb_dim))
        self.conv_1 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[1], emb_dim))
        self.conv_2 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[2], emb_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, seq len]
        embedded = self.embedding(text)
                
        #embedded = [batch size, seq len, emb dim]
        embedded = embedded.unsqueeze(1)  #<----make text as 1 dimensional data for Conv2d
        #embedded = [batch size, 1, seq len, emb dim]
        
        #squeeze 3 because maxpool1d expect only three dimen tensor
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3)) #conved_0 = [batch_size, n_filters,  seq len - filter_sizes[n] + 1]
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3)) #conved_1 = [batch_size, n_filters,  seq len - filter_sizes[n] + 1]
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3)) #conved_2 = [batch_size, n_filters,  seq len - filter_sizes[n] + 1]
        #conved_n = [batch size, n_filters, seq len - filter_sizes[n] + 1]
        
        #conv0_embedded_squeezed.shape[2] because we want to take max out from the whole weighted sum array
        #we squeeze 2 for linear layer
        
        #F.max_pool1d(input, kernel_size) => [batch_size, n_filters, 1]
        # After squeeze(2) => [batch_size, n_filters]
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [None]:
input_dim  = len(vocab)
emb_dim    =  #how many embedding does the fasttext have 
output_dim =  #how many classes do we have
dropout    = 0.5
n_filters  = 100 
filter_sizes = [3, 4, 5]

model = CNN(input_dim, emb_dim, output_dim, dropout, n_filters, filter_sizes).to(device)

In [128]:
batch_size = 3
seq_len    = 50

 

In [129]:
#explicitly initialize weights for better learning
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, (nn.Conv2d, nn.Conv2d)):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.kaiming_normal_(param) 

In [130]:
import torch.optim as optim

lr=1e-3

#training hyperparameters
optimizer = optim.SGD(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss() #combine softmax with cross entropy

NameError: name 'model' is not defined

In [None]:
def accuracy(preds, y):
    
    predicted = torch.max(preds.data, 1)[1]
    batch_corr = (predicted == y).sum()
    acc = batch_corr / len(y)
    
    return acc

In [None]:
def train(model, loader, optimizer, criterion, loader_length):
    #write the code to train the model

In [None]:
def evaluate(model, loader, criterion, loader_length):
    #write the code to evaluate 

In [None]:
train_loader_length = len(list(iter(train_loader)))
val_loader_length   = len(list(iter(valid_loader)))

In [None]:
best_valid_loss = float('inf')
num_epochs      = 5

save_path = f'./models/{model.__class__.__name__}.pt'

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(num_epochs):
        #write the code that starts the training, store the training and valid losses and accuracy
    #also print the time it took to train the model

In [None]:
##Plot the training loss and the accuracy

# Lets Try the LSTM model

In [None]:
train = PD_DATASET(train_data)
valid = PD_DATASET(valid_data)

In [None]:
import torch.nn as nn

class LSTM(nn.Module):
    pass #replace this line with the real code

In [None]:
input_dim  = len(vocab)
emb_dim    = #same as above
hidden_dim = #how many hidden dims do you want?
output_dim = #same as above
dropout    = 0.5
num_layers = 2
bidirectional = True 

lstm_model = LSTM(input_dim, emb_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout).to(device)

In [None]:
import torch.optim as optim

lr=1e-3

#training hyperparameters
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss() #

In [None]:
def train(model, loader, optimizer, criterion, loader_length):
    #write the code to train the model 

In [None]:
def evaluate(model, loader, criterion, loader_length):
    #write the code to evaluate the model

In [None]:
train_loader_length = len(list(iter(train_loader)))
val_loader_length   = len(list(iter(valid_loader)))

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
best_valid_loss = float('inf')
num_epochs   = 5

save_path = f'./models/lstm_{model.__class__.__name__}.pt'

train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(num_epochs):
    #write the code that starts the training, store the training and valid losses and accuracy
    #also print the time it took to train the model

In [None]:
##Plot the losses and accuracy over all epochs

## Conclusion
- Compare the two models on their time and accuracy. Which one do you think did well for the disaster classification task.
- How do you think we get better results in this dataset for classification.

#### Write your answer here