In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

!pip install autoviml

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# Configure cufflinks, which provides the `.iplot(...)` calls used on pandas objects later in this notebook.
# NOTE(review): go_offline() is immediately followed by offline=False in the config file;
# the two settings look contradictory — confirm which mode is intended.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Show the full content of text columns when a frame is displayed.
# `None` means "no limit"; the old `-1` sentinel has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from autoviml.Auto_NLP import Auto_NLP

In [None]:
# Folder that receives the training artefacts (model.pt / metrics.pt); './' is the current working directory.
destination_folder = './'

In [None]:
import re
import string
class TextProcessing:
    """Cleaning helpers for a pandas Series of strings.

    The per-string helpers (``remove_*``) are pure functions of their ``text``
    argument. ``clean`` applies the selected steps to the whole Series stored
    on the instance and returns the result.

    Parameters
    ----------
    text : pandas.Series
        Series of ``str`` values to clean.
    """

    # Patterns are compiled once at class-definition time instead of on every call.
    # The original code ran two URL passes (`https?://` and `http?://`); together they
    # matched "https://", "http://", the malformed "htt://", and "www." prefixes.
    _URL_PATTERN = re.compile(r'htt(?:ps?)?://\S+|www\.\S+')
    _HTML_TAG_PATTERN = re.compile(r'<.*?>')
    _WHITESPACE_PATTERN = re.compile(r'\s+')
    # The original character class ended with the single range U+24C2..U+1F251, which
    # also covers CJK ideographs, Hangul, kana and most symbol blocks, so ordinary
    # non-Latin text was deleted as "emoji". It is split into the intended pieces here.
    _EMOJI_PATTERN = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002702-\U000027B0"  # dingbats
                                u"\U00002600-\U000026FF"  # miscellaneous symbols
                                u"\U000024C2"             # circled M
                                u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
                                "]+", flags=re.UNICODE)
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, text):
        self.text = text

    def remove_punctuation(self, text):
        """Return ``text`` without any character listed in ``string.punctuation``."""
        return text.translate(self._PUNCTUATION_TABLE)

    def remove_url(self, text):
        """Return ``text`` with http(s):// and www. style links removed."""
        return self._URL_PATTERN.sub('', text)

    def remove_emoji(self, text):
        """Return ``text`` with emoji / pictograph code points removed."""
        return self._EMOJI_PATTERN.sub('', text)

    def remove_html(self, text):
        """Return ``text`` with ``<...>`` tags removed (non-greedy match)."""
        return self._HTML_TAG_PATTERN.sub('', text)

    def clean(self, strip_html=False, strip_urls=False, strip_emoji=False,
              strip_punctuation=False, collapse_whitespace=False):
        """Clean the stored Series, store the result on ``self.text`` and return it.

        With the default arguments only lower-casing is applied, which is what
        this method has always done. The optional steps replace the previously
        commented-out lines and run in this order after lower-casing:
        HTML tags, URLs, emoji, punctuation, whitespace collapsing. Punctuation
        is stripped after URLs/HTML so those patterns can still be recognised.

        Returns
        -------
        pandas.Series
            The cleaned Series (also assigned to ``self.text``).
        """
        cleaned = self.text.apply(lambda x: x.lower())
        if strip_html:
            cleaned = cleaned.apply(self.remove_html)
        if strip_urls:
            cleaned = cleaned.apply(self.remove_url)
        if strip_emoji:
            cleaned = cleaned.apply(self.remove_emoji)
        if strip_punctuation:
            cleaned = cleaned.apply(self.remove_punctuation)
        if collapse_whitespace:
            cleaned = cleaned.apply(lambda x: self._WHITESPACE_PATTERN.sub(' ', x))
        self.text = cleaned
        return self.text
        

In [None]:
# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch

# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns


In [None]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

In [None]:
# Load the training split; missing values are replaced by empty strings.
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv").fillna('')
# Model input is "<keyword> <text>". Vectorised string concatenation gives the same
# result as the former row-wise apply(axis=1) without a Python call per row.
train['text_keyword'] = train['keyword'].astype(str) + ' ' + train['text'].astype(str)
# Normalise the combined text (with default arguments clean() lower-cases it).
ttp = TextProcessing(train['text_keyword'])
train['text_keyword'] = ttp.clean()

In [None]:
# Preview the first rows, including the derived text_keyword column.
train.head()

In [None]:
# Number of rows per keyword. Rows whose keyword was missing are counted under the
# empty string '' because the frame was loaded with fillna('').
train.keyword.value_counts()

In [None]:
# Fraction of missing values per column. The frame was loaded with fillna(''), so
# missing entries are empty strings and isna() would report 0 for every column.
train.eq('').mean()

In [None]:
# Keyword frequencies among the rows labelled target == 1 (the "disaster" class).
# train[train.target==1].groupby(['keyword'])['keyword'].count().plot.bar()
train[train.target==1]['keyword'].value_counts()

In [None]:
# Keyword frequencies among the rows labelled target == 0 (the "non disaster" class).
train[train.target==0]['keyword'].value_counts()

In [None]:
def get_top_n_bigram(corpus, n=None, ngram_range=(4, 4)):
    """Return the ``n`` most frequent word n-grams in ``corpus``.

    NOTE: despite the function name, the default counts 4-grams
    (``ngram_range=(4, 4)``), which is what this function has always computed.
    Pass ``ngram_range=(2, 2)`` for actual bigrams.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    n : int or None
        Number of entries to return; ``None`` returns all of them.
    ngram_range : tuple(int, int)
        Forwarded to ``CountVectorizer``.

    Returns
    -------
    list of (str, count)
        N-grams with their total count over the corpus, most frequent first.
        English stop words are removed before n-grams are formed.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    # fit_transform walks the corpus once; the former fit(...) + transform(...) walked
    # it twice, which also silently produced nothing for one-shot iterables.
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
# Most frequent n-grams among rows labelled target == 1.
# get_top_n_bigram counts 4-grams by default, so the chart title says so.
common_words = get_top_n_bigram(train[train.target == 1]['text'], 20)
df1 = pd.DataFrame(common_words, columns = ['text' , 'count'])
df1.groupby('text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 4-grams for disaster text')

In [None]:
# Most frequent n-grams among rows labelled target == 0.
# get_top_n_bigram counts 4-grams by default, so the chart title says so.
common_words = get_top_n_bigram(train[train.target == 0]['text'], 20)
df1 = pd.DataFrame(common_words, columns = ['text' , 'count'])
df1.groupby('text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 4-grams for non disaster text')

In [None]:
# Load the test split; missing values are replaced by empty strings.
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv").fillna('')
# Same "<keyword> <text>" input as for training, built with vectorised string
# concatenation instead of a row-wise apply(axis=1).
test['text_keyword'] = test['keyword'].astype(str) + ' ' + test['text'].astype(str)
tetp = TextProcessing(test['text_keyword'])
test['text_keyword'] = tetp.clean()
# Dummy target variable, so the test CSV has the same two columns as the train/valid CSVs.
test['target'] = 0
test[['text_keyword','target']].to_csv('test_bert.csv',index=False)

In [None]:
# Empty frames that create_dataset() fills in place.
train_bert, valid_bert = pd.DataFrame(), pd.DataFrame()


def create_dataset(X, Y, df):
    """Store ``X`` as column 'text_keyword' and ``Y`` as column 'target' of ``df``.

    ``df`` is modified in place; nothing is returned.
    """
    for column_name, column_values in (('text_keyword', X), ('target', Y)):
        df[column_name] = column_values

In [None]:
# Stratified hold-out split: 33% of the rows go to validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train['text_keyword'],
    train['target'],
    test_size=0.33,
    random_state=42,
    stratify=train['target'],
)
# Fill the two frames and write each one to the CSV that is read back as a TabularDataset later.
for features, labels, frame, csv_name in (
    (X_train, y_train, train_bert, "train_bert.csv"),
    (X_valid, y_valid, valid_bert, "valid_bert.csv"),
):
    create_dataset(features, labels, frame)
    frame.to_csv(csv_name, index=False)

**Building a benchmark model using AutoNLP**

In [None]:
# Benchmark model: run Auto_NLP on the 'text_keyword' column of train_bert with 'target'
# as the label, passing the test frame alongside. `predicted` is written out as
# submission1.csv at the end of the notebook.
auto_nlp_settings = dict(
    score_type="balanced_accuracy",
    top_num_features=200,
    modeltype="Classification",
    verbose=2,
    build_model=True,
)
train_x, test_x, predictor, predicted = Auto_NLP(
    'text_keyword', train_bert, test, 'target', **auto_nlp_settings)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Model parameter
MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields
# The tokenizer already yields vocabulary ids, so no torchtext vocab is built (use_vocab=False);
# every example is padded/truncated to MAX_SEQ_LEN ids.
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
train_fields = [ ('text_keyword', text_field),('target', label_field)]
test_fields = [ ('text_keyword', text_field)]

# TabularDataset
# All three CSVs carry the columns (text_keyword, target) — the test file has a dummy target —
# so the same field list is used for each. The datasets get their own names so the
# `train` / `test` DataFrames defined earlier are not overwritten by this cell.
train_ds, valid_ds, test_ds = TabularDataset.splits(path='./', train='train_bert.csv', validation='valid_bert.csv',
                                                    test='test_bert.csv', format='CSV', fields=train_fields,
                                                    skip_header=True)

# Iterators
# NOTE(review): with sort=True the training batches are presumably served in the same
# length-sorted order every epoch (no shuffling) — confirm this is intended.
train_iter = BucketIterator(train_ds, batch_size=16, sort_key=lambda x: len(x.text_keyword),
                            device=device, train=True, sort=True, sort_within_batch=True)
# Validation and test data are iterated in file order, without sorting or shuffling, so the
# collected predictions line up row-for-row with valid_bert / the test CSV. (The validation
# iterator used to be a length-sorted BucketIterator with train=True, which reordered rows.)
valid_iter = Iterator(valid_ds, batch_size=16, device=device, train=False, shuffle=False, sort=False)
test_iter = Iterator(test_ds, batch_size=16, device=device, train=False, shuffle=False, sort=False)


**Model**

In [None]:
class BERT(nn.Module):
    """Thin ``nn.Module`` wrapper around ``BertForSequenceClassification`` ('bert-base-uncased')."""

    def __init__(self):
        super().__init__()

        options_name = "bert-base-uncased"
        # Kept under the attribute name `encoder`: saved checkpoints key their weights on it.
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label=None):
        """Run the classifier on a batch of token ids.

        Parameters
        ----------
        text : token-id tensor passed straight to the wrapped model.
        label : optional label tensor. When given, the wrapped model also computes the loss.

        Returns
        -------
        (loss, logits)
            ``loss`` is ``None`` when no ``label`` is supplied, so inference does not
            need dummy labels.
        """
        outputs = self.encoder(text, labels=label)
        if label is None:
            # Without labels the first output element is the logits.
            return None, outputs[0]

        loss, text_fea = outputs[:2]
        return loss, text_fea

**Training**

In [None]:
# Save and Load Functions

def save_checkpoint(save_path, model, valid_loss):
    """Write ``model``'s weights and ``valid_loss`` to ``save_path`` with ``torch.save``.

    The file holds a dict with the keys 'model_state_dict' and 'valid_loss'.
    Does nothing when ``save_path`` is ``None``.
    """
    # Identity check: `== None` would defer to a custom __eq__ on path-like objects.
    if save_path is None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    """Load weights saved by ``save_checkpoint`` into ``model`` (in place).

    Tensors are mapped onto the module-level ``device`` while loading.

    Returns
    -------
    The 'valid_loss' value stored in the checkpoint, or ``None`` when
    ``load_path`` is ``None`` (in which case nothing is loaded).
    """
    # Identity check: `== None` would defer to a custom __eq__ on path-like objects.
    if load_path is None:
        return

    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Write the three loss-history lists to ``save_path`` with ``torch.save``.

    The file holds a dict with the keys 'train_loss_list', 'valid_loss_list'
    and 'global_steps_list'. Does nothing when ``save_path`` is ``None``.
    """
    # Identity check: `== None` would defer to a custom __eq__ on path-like objects.
    if save_path is None:
        return

    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}

    torch.save(state_dict, save_path)
    # The message used to read "Model saved", copied from save_checkpoint.
    print(f'Metrics saved to ==> {save_path}')


def load_metrics(load_path):
    """Read back the loss history written by ``save_metrics``.

    Returns
    -------
    (train_loss_list, valid_loss_list, global_steps_list), or ``None`` when
    ``load_path`` is ``None`` — callers that unpack the result must pass a real path.
    """
    # Identity check: `== None` would defer to a custom __eq__ on path-like objects.
    if load_path is None:
        return

    state_dict = torch.load(load_path, map_location=device)
    # The message used to read "Model loaded", copied from load_checkpoint.
    print(f'Metrics loaded from <== {load_path}')

    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [None]:
# Training Function

def _mean_validation_loss(model, valid_loader):
    """Return the mean per-batch loss of ``model`` over ``valid_loader``; leaves the model in train mode."""
    total_loss = 0.0
    model.eval()
    with torch.no_grad():
        for (text_keyword, target), _ in valid_loader:
            target = target.to(device=device, dtype=torch.long)
            text_keyword = text_keyword.to(device=device, dtype=torch.long)
            loss, _ = model(text_keyword, target)
            total_loss += loss.item()
    model.train()
    return total_loss / len(valid_loader)


def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 5,
          eval_every = len(train_iter) // 2,
          file_path = destination_folder,
          best_valid_loss = float("Inf")):
    """Fine-tune ``model`` and checkpoint it whenever the validation loss improves.

    Parameters
    ----------
    model : module called as ``model(text, label)`` and returning ``(loss, logits)``.
    optimizer : optimizer over ``model``'s parameters.
    criterion : unused — the loss comes from the model itself; kept for call compatibility.
    train_loader, valid_loader : iterators yielding ``((text_keyword, target), _)`` batches.
        The defaults are bound when this function is defined, so re-run this cell after
        rebuilding the iterators.
    num_epochs : number of passes over ``train_loader``.
    eval_every : run validation every this many training steps (values below 1 are treated as 1).
    file_path : directory receiving 'model.pt' and 'metrics.pt'.
    best_valid_loss : validation loss a checkpoint has to beat before it is saved.

    Side effects: writes 'model.pt' (best checkpoint) and 'metrics.pt' (loss history)
    under ``file_path`` and prints progress. Returns ``None``.
    """
    # A data set with fewer than two batches makes the default eval_every 0, which would
    # raise ZeroDivisionError in the modulo below.
    eval_every = max(1, eval_every)
    model_path = os.path.join(file_path, 'model.pt')
    metrics_path = os.path.join(file_path, 'metrics.pt')

    # initialize running values
    running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (text_keyword, target), _ in train_loader:
            # Cast and move in one step; `.type(torch.LongTensor)` first copied the batch to the CPU.
            target = target.to(device=device, dtype=torch.long)
            text_keyword = text_keyword.to(device=device, dtype=torch.long)
            loss, _ = model(text_keyword, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every != 0:
                continue

            average_train_loss = running_loss / eval_every
            average_valid_loss = _mean_validation_loss(model, valid_loader)
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # resetting running values
            running_loss = 0.0

            # print progress
            print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                  .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                          average_train_loss, average_valid_loss))

            # checkpoint
            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss
                save_checkpoint(model_path, model, best_valid_loss)
                save_metrics(metrics_path, train_loss_list, valid_loss_list, global_steps_list)

    save_metrics(metrics_path, train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

In [None]:
# Instantiate the BERT wrapper on the selected device and fine-tune it with Adam (lr=2e-5),
# using train()'s default loaders, epoch count and output folder.
model = BERT().to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

train(model=model, optimizer=optimizer)

In [None]:
# Reload the loss history written during training and draw both curves against the global step.
train_loss_list, valid_loss_list, global_steps_list = load_metrics(destination_folder + '/metrics.pt')
for curve_label, curve_values in (('Train', train_loss_list), ('Valid', valid_loss_list)):
    plt.plot(global_steps_list, curve_values, label=curve_label)
plt.xlabel('Global Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

**Selecting the best model after training step**

In [None]:
# Build a fresh BERT wrapper and load into it the weights that train() saved to model.pt
# (train() writes that file only when the validation loss improves).
best_model = BERT().to(device)

# load_checkpoint returns the stored validation loss; as the cell's last expression it is displayed.
load_checkpoint(destination_folder + '/model.pt', best_model)

**Evaluate**

In [None]:
# Module-level accumulators filled by evaluate(); later cells read y_eval.
y_eval = []
y_true_eval = []
def evaluate(model, test_loader):
    """Evaluate ``model`` on a labelled loader and plot the confusion matrix.

    Fills the module-level lists ``y_eval`` (predicted class per example) and
    ``y_true_eval`` (true label per example), prints a classification report and
    draws a confusion-matrix heatmap. Returns ``None``.

    ``test_loader`` must yield ``((text_keyword, target), _)`` batches.
    """
    # Reset the accumulators so calling evaluate() again does not append to the
    # results of a previous call.
    y_eval.clear()
    y_true_eval.clear()

    model.eval()
    with torch.no_grad():
        for (text_keyword, target), _ in test_loader:
            # Cast and move in one step instead of round-tripping through a CPU LongTensor.
            target = target.to(device=device, dtype=torch.long)
            text_keyword = text_keyword.to(device=device, dtype=torch.long)
            _, logits = model(text_keyword, target)

            # Predicted class = index of the largest logit.
            y_eval.extend(torch.argmax(logits, 1).tolist())
            y_true_eval.extend(target.tolist())

    print('Classification Report:')
    print(classification_report(y_true_eval, y_eval, labels=[0,1], digits=4))

    cm = confusion_matrix(y_true_eval, y_eval, labels=[0,1])
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax = ax, cmap='Blues', fmt="d")

    ax.set_title('Confusion Matrix')

    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    ax.xaxis.set_ticklabels(['Not Disaster', 'REAL'])
    ax.yaxis.set_ticklabels(['Not Disaster', 'REAL'])

In [None]:
# Report metrics for the reloaded best checkpoint on the validation iterator.
evaluate(best_model, valid_iter)

In [None]:
# Attach the predictions collected by evaluate() to the validation frame.
# NOTE(review): y_eval is a plain list, so this assignment is positional — it is only correct
# if the iterator passed to evaluate() yields rows in the same order as valid_bert; confirm.
valid_bert['y_eval'] = y_eval

# Inspect the rows whose true label is 1.
valid_bert[valid_bert.target ==1]

In [None]:
# Module-level accumulator filled by predict(); the submission cell reads it.
y_pred = []
def predict(model, test_loader):
    """Collect ``model``'s predicted class for every example of ``test_loader``.

    Results go into the module-level list ``y_pred`` in iteration order.
    ``test_loader`` must yield ``((text_keyword, target), _)`` batches; the target
    is only passed through to the model and is not used for the prediction.
    Returns ``None``.
    """
    # Reset the accumulator so a second call does not double the number of
    # predictions (which would no longer match the number of test rows).
    y_pred.clear()

    model.eval()
    with torch.no_grad():
        for (text_keyword, target), _ in test_loader:
            # Cast and move in one step instead of round-tripping through a CPU LongTensor.
            target = target.to(device=device, dtype=torch.long)
            text_keyword = text_keyword.to(device=device, dtype=torch.long)
            _, logits = model(text_keyword, target)

            # Predicted class = index of the largest logit.
            y_pred.extend(torch.argmax(logits, 1).tolist())
               

In [None]:
# Run the best checkpoint over the test iterator (this fills the module-level y_pred list),
# then pair the predictions with the ids of the original test file and write the submission.
predict(best_model, test_iter)
test_final = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv").assign(target=y_pred)
test_final.loc[:, ['id', 'target']].to_csv('submission.csv', index=False)

In [None]:
# test_final['autoNlpPrediction'] = predicted
# test_final[(test_final.target==1) & (test_final.autoNlpPrediction==0)]
# Write the Auto_NLP benchmark predictions as a second submission. assign() works on a copy,
# so the BERT predictions already stored in test_final['target'] are not overwritten.
test_final.assign(target=predicted)[['id','target']].to_csv('submission1.csv',index=False)