<a href="https://colab.research.google.com/github/pythonuzgit/elmurodov/blob/master/Natural_language_processing_with_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from zipfile import ZipFile

file_name = "/content/disaster-tweets.zip"
# Bind the archive to `archive` rather than `zip`: a notebook-level name `zip`
# would shadow the built-in zip() for every cell executed afterwards.
with ZipFile(file_name, 'r') as archive:
  # Unpack every member into the current working directory.
  archive.extractall()
  print('Done')

Done


Import standard numerical packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time


import torch
import torch.nn as nn
from torchtext import data
import torch.nn as nn
from torchtext.legacy import data

Read the training data

In [None]:
# Load the tweets CSV into a DataFrame and preview its first rows.
train = pd.read_csv('/content/tweets.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


In [None]:
# (n_rows, n_columns) of the frame as loaded.
train.shape

(11370, 5)

We are only interested in the text and target columns. So we drop the rest

In [None]:
# Keep only the `text` and `target` columns. Reassigning (instead of
# inplace=True) combined with errors='ignore' makes this cell safe to re-run:
# a second execution finds the columns already gone and is a no-op rather
# than raising KeyError.
train = train.drop(columns = ['id', 'keyword', 'location'], errors = 'ignore')

Next we clean and modify the texts, so that the classification algorithm does not get confused with irrelevant information

In [None]:
def normalise_text(text):
    """Clean a pandas Series of tweet strings before tokenisation.

    Steps, applied in order:
      1. lowercase everything;
      2. drop '#' characters (the hashtag word itself is kept);
      3. replace every ``http...`` run with the placeholder token ``URL``
         (done after lowercasing, so the placeholder stays upper-case);
      4. drop '@' characters;
      5. turn every character outside ``A-Za-z0-9()!?'`"`` into a space;
      6. collapse runs of two or more whitespace characters into one space.

    Args:
        text: pandas Series of strings.

    Returns:
        A new pandas Series with the cleaned strings.
    """
    # `regex=` is passed explicitly on every call: the implicit default of
    # Series.str.replace differs between pandas versions, and with literal
    # matching the patterns below would silently match nothing.
    text = text.str.lower()
    text = text.str.replace("#", "", regex=False)
    text = text.str.replace(r"http\S+", "URL", regex=True)
    text = text.str.replace("@", "", regex=False)
    text = text.str.replace(r"[^A-Za-z0-9()!?\'\`\"]", " ", regex=True)
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    text = text.str.replace(r"\s{2,}", " ", regex=True)
    return text



In [None]:
# Overwrite the raw tweets with their normalised form.
train['text'] = normalise_text(train['text'])

Let us take a look at the cleaned text

In [None]:
# Preview the first few normalised tweets.
train['text'].head()

0    communal violence in bhainsa telangana "stones...
1    telangana section 144 has been imposed in bhai...
2          arsonist sets cars ablaze at dealership URL
3      arsonist sets cars ablaze at dealership URL URL
4    "lord jesus your love brings freedom and pardo...
Name: text, dtype: object

Split the data into training and validation sets

In [None]:
# Fix random_state so the train/validation partition is the same on every
# run. Without it the split changes each time, so the torch seeding done
# further down cannot make the results reproducible. 42 matches the SEED
# value used for torch.
train_df, valid_df = train_test_split(train, random_state = 42)
train_df.head()

Unnamed: 0,text,target
7103,only thing keeping me going is my 14 year old ...,0
7277,whereas once again many innocents have been k...,0
4253,what if boomers got to know that they can use ...,0
3028,the cameraman being a panic gay the moment tae...,0
9670,if there is one place deserving of a suicide b...,0


In [None]:
# Preview the validation split.
valid_df.head()

Unnamed: 0,text,target
8490,my suggestion is continue your oxford and focu...,0
3297,we try our best each week it s a deluge of poe...,0
5910,venice runs dry gondolas stranded in mud just ...,1
72,tickets on sale!!!!!!!!!! aftershock URL nowpl...,0
3716,hi there i'm very sorry to hear this we are ru...,1


The following will help make the results reproducible later

In [None]:
SEED = 42

# Seed PyTorch's random number generator, set cuDNN's `deterministic` flag
# and switch off its `benchmark` mode.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Create Field and LabelField 

In [None]:
# TEXT: tokenised with spaCy. include_lengths=True is why `batch.text` is
# later unpacked as a (text, text_lengths) pair in train()/evaluate().
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
# LABEL: stored as float, matching the BCEWithLogitsLoss criterion used later.
LABEL = data.LabelField(dtype = torch.float)


Next create a DataFrameDataset class

In [None]:
class DataFrameDataset(data.Dataset):
  """torchtext Dataset built from a pandas DataFrame.

  The frame must expose a `text` column and, unless `is_test` is set, a
  `target` column.
  """

  def __init__(self, df, fields, is_test=False, **kwargs):
    """Convert every row of `df` into a torchtext Example.

    Args:
      df: DataFrame with a `text` column (and `target` when not a test set).
      fields: list of (name, Field) pairs, ordered as [text, label].
      is_test: when True the label of every example is None.
      **kwargs: forwarded to `data.Dataset.__init__`.
    """
    examples = []
    for _, row in df.iterrows():
      label = row.target if not is_test else None
      text = row.text
      examples.append(data.Example.fromlist([text, label], fields))

    # Initialise the base class exactly once, after all rows are collected.
    # Previously this call sat inside the loop, so it ran once per row and
    # was never reached for an empty DataFrame.
    super().__init__(examples, fields, **kwargs)

  @staticmethod
  def sort_key(ex):
    """Sort key for examples: the number of tokens in the text."""
    return len(ex.text)

  @classmethod
  def splits(cls, fields, train_df, val_df=None, **kwargs):
    """Build datasets for the given frames.

    Args:
      fields: list of (name, Field) pairs shared by all splits.
      train_df: training DataFrame, or None to skip it.
      val_df: validation DataFrame, or None to skip it.
      **kwargs: forwarded to the constructor.

    Returns:
      Tuple holding the datasets that were built, in (train, val) order;
      frames passed as None are omitted from the tuple.
    """
    train_data, val_data = (None, None)
    data_field = fields

    # Work on copies so the caller's frames are never touched.
    if train_df is not None:
      train_data = cls(train_df.copy(), data_field, **kwargs)
    if val_df is not None:
      val_data = cls(val_df.copy(), data_field, **kwargs)

    return tuple(d for d in (train_data, val_data) if d is not None)

Use the split method of DataFrameDataset

In [None]:
# Field order must match the [text, label] order used when the examples are
# built inside DataFrameDataset.__init__.
fields = [('text', TEXT), ('label', LABEL)]
train_ds, val_ds = DataFrameDataset.splits(fields, train_df = train_df, val_df = valid_df)

Let's look at one example

In [None]:
# Show the attributes (tokenised text and label) of the example at index 15.
print(vars(train_ds[15]))

{'text': ['that', "'s", 'pretty', 'much', 'what', 'my', 'ex', 'said', 'to', 'me', 'too', 'yelled', 'it', 'and', 'screamed', 'it', 'in', 'URL'], 'label': 0}


Check the type

In [None]:
# Show the class of a single dataset item.
print(type(train_ds[15]))

<class 'torchtext.legacy.data.example.Example'>


We will now build the vocabulary using only the training dataset

In [None]:
MAX_VOCAB_SIZE = 25000

# Build the vocabulary from the training split only, capped at MAX_VOCAB_SIZE
# entries, and attach the 'glove.6B.200d' pretrained vectors. Tokens without a
# pretrained vector are initialised with torch.Tensor.zero_.
TEXT.build_vocab(train_ds, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = 'glove.6B.200d',
                 unk_init = torch.Tensor.zero_)

In [None]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_ds)

Build the iterators

In [None]:
BATCH_SIZE = 128

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# sort_within_batch=True: the model packs each batch with
# pack_padded_sequence using its default settings, which presumably needs
# examples ordered by decreasing length -- confirm against the torch docs.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_ds, val_ds),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

LSTM architecture

In [None]:
# Training schedule.
num_epochs = 25
learning_rate = 0.001

# Model dimensions. EMBEDDING_DIM is 200, the same width as the
# 'glove.6B.200d' vectors attached to the vocabulary.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # vocabulary index of the padding token

Setting up the LSTM model

In [None]:
class LSTM_net(nn.Module):
  """Embedding -> (bi)LSTM -> two linear layers, producing raw logits.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: width of each embedding vector.
    hidden_dim: LSTM hidden size, also the width of the first linear layer.
    output_dim: number of logits produced per example.
    n_layers: number of stacked LSTM layers.
    bidirectional: whether the LSTM runs in both directions.
    dropout: dropout probability (LSTM inter-layer dropout and classifier).
    pad_idx: embedding index of the padding token.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
               n_layers, bidirectional, dropout, pad_idx):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

    self.rnn = nn.LSTM(embedding_dim,
                       hidden_dim,
                       num_layers = n_layers,
                       bidirectional = bidirectional,
                       # Inter-layer dropout only exists when layers are
                       # stacked; passing it for one layer just warns.
                       dropout = dropout if n_layers > 1 else 0.0)
    # The classifier input is the final hidden state of each direction.
    num_directions = 2 if bidirectional else 1
    self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim)
    # Honour output_dim (it used to be ignored in favour of a literal 1).
    self.fc2 = nn.Linear(hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text, text_lengths):
    """Return logits for a padded batch.

    Args:
      text: token-index tensor laid out as (seq_len, batch); the LSTM is
        built without batch_first.
      text_lengths: true length of every sequence in the batch, ordered as
        pack_padded_sequence requires with its default settings.

    Returns:
      Tensor of shape (batch, output_dim) holding un-normalised logits.
    """
    embedded = self.embedding(text)

    # pack_padded_sequence wants the lengths on the CPU even when the batch
    # itself lives on the GPU.
    lengths_cpu = torch.as_tensor(text_lengths).cpu()
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths_cpu)

    _, (hidden, _) = self.rnn(packed_embedded)

    if self.rnn.bidirectional:
      # Last layer: forward direction is hidden[-2], backward is hidden[-1].
      hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim = 1)
    else:
      hidden = hidden[-1, :, :]
    hidden = self.dropout(hidden)

    # Dropout regularises the hidden features, never the logits themselves:
    # zeroing/rescaling a logit corrupts the prediction and the loss.
    output = self.dropout(self.fc1(hidden))
    return self.fc2(output)

Creating instance of our LSTM_net class



In [None]:
# Instantiate the network from the hyperparameters defined earlier.
model = LSTM_net(INPUT_DIM,
                 EMBEDDING_DIM,
                 HIDDEN_DIM,
                 OUTPUT_DIM,
                 N_LAYERS,
                 BIDIRECTIONAL,
                 DROPOUT,
                 PAD_IDX)

Loading the pretrained vectors into the embedding matrix

In [None]:
# Vectors attached to the vocabulary by TEXT.build_vocab(..., vectors=...).
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)
# Overwrite the embedding weights in place with the pretrained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

torch.Size([18338, 200])


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0715,  0.0935,  0.0237,  ...,  0.3362,  0.0306,  0.2558],
        ...,
        [ 0.0934, -0.0589, -0.1489,  ..., -0.3619, -0.1673,  0.1019],
        [ 0.0380,  0.2330, -0.0145,  ..., -0.0146,  0.3024, -0.1939],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [None]:
#  to initiaise padded to zeros
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0715,  0.0935,  0.0237,  ...,  0.3362,  0.0306,  0.2558],
        ...,
        [ 0.0934, -0.0589, -0.1489,  ..., -0.3619, -0.1673,  0.1019],
        [ 0.0380,  0.2330, -0.0145,  ..., -0.0146,  0.3024, -0.1939],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])


In [None]:
model.to(device) # move the LSTM model to the selected device (GPU if available, else CPU)


# Loss and optimizer
# BCEWithLogitsLoss takes raw logits, so the model applies no sigmoid itself;
# binary_accuracy applies the sigmoid separately before rounding.
criterion = nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
def binary_accuracy(preds, y):
  """Fraction of predictions that match the targets.

  Logits are passed through a sigmoid and rounded to {0, 1} before being
  compared element-wise with `y`. The divisor is `len()` of the comparison
  result, i.e. the size of its first dimension.
  """
  probabilities = torch.sigmoid(preds)
  hits = (torch.round(probabilities) == y).float()
  return hits.sum() / len(hits)

Training the Model

In [None]:
def train(model, iterator):
  """Run one optimisation pass over `iterator`.

  Uses the notebook-level `optimizer`, `criterion` and `binary_accuracy`.

  Returns:
    (mean loss per batch, mean accuracy per batch).
  """
  model.train()
  total_loss, total_acc = 0, 0

  for batch in iterator:
    # batch.text is a (tokens, lengths) pair.
    tokens, lengths = batch.text
    targets = batch.label

    optimizer.zero_grad()
    logits = model(tokens, lengths).squeeze(1)
    batch_loss = criterion(logits, targets)
    batch_acc = binary_accuracy(logits, targets)

    batch_loss.backward()
    optimizer.step()

    total_loss += batch_loss.item()
    total_acc += batch_acc.item()

  n_batches = len(iterator)
  return total_loss / n_batches, total_acc / n_batches



In [None]:
def evaluate(model, iterator):
  """Return the mean per-batch accuracy of `model` over `iterator`.

  Runs in eval mode with gradient tracking disabled; relies on the
  notebook-level `binary_accuracy`.
  """
  model.eval()
  total_acc = 0

  with torch.no_grad():
    for batch in iterator:
      # batch.text is a (tokens, lengths) pair.
      tokens, lengths = batch.text
      logits = model(tokens, lengths).squeeze(1)
      total_acc += binary_accuracy(logits, batch.label).item()

  n_batches = len(iterator)
  return total_acc / n_batches



In [None]:
# Wall-clock start, used for the total-time report at the end.
t = time.time()

# Per-epoch history: training loss, training accuracy, validation accuracy.
loss = []
acc = []
val_acc = []

for epoch in range(num_epochs):
  # One optimisation pass over the training data, then a validation pass.
  train_loss, train_acc = train(model, train_iterator)
  valid_acc = evaluate(model, valid_iterator)

  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Acc: {valid_acc*100:.2f}%')

  loss.append(train_loss)
  acc.append(train_acc)
  val_acc.append(valid_acc)





# Total elapsed training time in seconds.
print(f'time:{time.time()-t:.3f}')  

	Train Loss: 0.299 | Train Acc: 90.60%
	 Val. Acc: 89.71%
	Train Loss: 0.261 | Train Acc: 91.58%
	 Val. Acc: 89.81%
	Train Loss: 0.209 | Train Acc: 93.96%
	 Val. Acc: 89.00%
	Train Loss: 0.174 | Train Acc: 95.04%
	 Val. Acc: 89.71%
	Train Loss: 0.158 | Train Acc: 95.62%
	 Val. Acc: 89.06%
	Train Loss: 0.158 | Train Acc: 95.63%
	 Val. Acc: 89.09%
	Train Loss: 0.147 | Train Acc: 96.30%
	 Val. Acc: 89.06%
	Train Loss: 0.154 | Train Acc: 95.84%
	 Val. Acc: 88.59%
	Train Loss: 0.143 | Train Acc: 96.32%
	 Val. Acc: 88.45%
	Train Loss: 0.140 | Train Acc: 96.23%
	 Val. Acc: 88.86%
	Train Loss: 0.139 | Train Acc: 96.19%
	 Val. Acc: 89.06%
	Train Loss: 0.138 | Train Acc: 96.61%
	 Val. Acc: 89.17%
	Train Loss: 0.143 | Train Acc: 95.86%
	 Val. Acc: 88.21%
	Train Loss: 0.145 | Train Acc: 96.11%
	 Val. Acc: 88.76%
	Train Loss: 0.139 | Train Acc: 96.07%
	 Val. Acc: 88.46%
	Train Loss: 0.136 | Train Acc: 96.60%
	 Val. Acc: 88.72%
	Train Loss: 0.136 | Train Acc: 96.30%
	 Val. Acc: 88.38%
	Train Loss: 0