## uHack Sentiments 2.0: Decode Code Words

The challenge is to analyze the natural-language text of customer reviews in depth and bucket each review by the topics it discusses. In addition, analyzing the overall sentiment of each review will help the business make tangible decisions.

The data set provided contains a mix of customer reviews for products across categories and retailers. The task is to build a model on this data that assigns future reviews to their respective topics (note: a review can discuss multiple topics) and predicts their overall polarity (positive/negative sentiment).

Data:

Topics (Components, Delivery and Customer Support, Design and Aesthetics, Dimensions, Features, Functionality, Installation, Material, Price, Quality and Usability)

Polarity (Positive/Negative)

Note: The target variables are already encoded in the train dataset for convenience. Please submit the test results in the same encoded format so that they can be evaluated.

### import libraries

In [None]:
import numpy as np
import pandas as pd
import os,gc,re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### data

In [None]:
# Load the training data, using the 'Id' column as the DataFrame index.
train  = pd.read_csv("../input/ugam-hack-data/train.csv",index_col='Id')
# Do not truncate long cell contents when frames are displayed.
pd.set_option('display.max_colwidth', None)
# Preview the first two rows.
train.head(2)

In [None]:
# Load the test data, using the 'Id' column as the DataFrame index.
test  = pd.read_csv("../input/ugam-hack-data/test.csv",index_col='Id')
# Preview the first row.
test.head(1)

In [None]:
# Column-wise sums over every column after the first
# (presumably the number of reviews tagged with each encoded label — confirm).
train.iloc[:,1:].sum()

In [None]:
# Bar chart of the column sums for every column after the first, one bar per column.
categories = list(train.columns[1:].values)
# Computed once and reused for both the bar heights and the bar annotations.
labels = train.iloc[:, 1:].sum().values
sns.set(font_scale=1.2)
plt.figure(figsize=(25, 8))
ax = sns.barplot(x=categories, y=labels)
plt.title("Reviews in each category", fontsize=24)
plt.ylabel('Number of Reviews', fontsize=18)
plt.xlabel('Comment Type ', fontsize=18)
# Annotate each bar with its value, centred just above the bar.
rects = ax.patches
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 10, label, ha='center', va='bottom', fontsize=11)
# Render once, after every annotation has been added; calling show() inside the
# loop displayed the figure after only the first label had been drawn.
plt.show()

In [None]:
# Force a garbage-collection pass.
gc.collect()

### Counting the number of comments having multiple labels

In [None]:
# Sum the columns after the first for each review, then tally reviews per sum.
rowSums = train.iloc[:, 1:].sum(axis=1)
# NOTE(review): value_counts() orders by frequency, so .iloc[1:] discards the most
# frequent label count rather than a specific one — confirm this is intended.
multiLabel_counts = rowSums.value_counts().iloc[1:]

sns.set(font_scale=1.2)
plt.figure(figsize=(15, 4))
ax = sns.barplot(x=multiLabel_counts.index, y=multiLabel_counts.values)
ax.set_title("Comments having multiple labels ")
ax.set_ylabel('Number of comments', fontsize=14)
ax.set_xlabel('Number of labels', fontsize=15)

# Write each bar's count just above it.
rects = ax.patches
labels = multiLabel_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    x_centre = rect.get_x() + rect.get_width() / 2
    ax.text(x_centre, height + 5, label, ha='center', va='bottom')
plt.show()

In [None]:
# Drop the plotting objects that are no longer needed, then collect garbage.
del ax,rects, labels
gc.collect()

### Data processing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings

In [None]:
# Silence all warnings unless warning options were supplied to the interpreter.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
def cleanHtml(sentence):
    """Replace every ``<...>`` tag in *sentence* with a single space.

    The input is passed through ``str()`` first, so non-string values are
    accepted and processed as their string form.
    """
    return re.sub(r'<.*?>', ' ', str(sentence))

def cleanPunc(sentence):
    """Remove or blank out punctuation and special characters in *sentence*.

    Question marks, exclamation marks, single and double quotes, hashes and
    pipes are deleted outright. Full stops, commas, parentheses, backslashes
    and forward slashes are each replaced by a space. The result is stripped
    of surrounding whitespace and remaining newlines become spaces.
    """
    # Inside a character class '|' is a literal, not a separator, so each class
    # lists its members directly. '|' stays in the first class because the
    # original pattern removed it as a side effect.
    cleaned = re.sub(r'[?!\'"#|]', '', sentence)
    # '\\' matches a literal backslash; the previous '\|' only matched a pipe,
    # so backslashes were never replaced.
    cleaned = re.sub(r'[.,()\\/]', ' ', cleaned)
    cleaned = cleaned.strip()
    return cleaned.replace("\n", " ")

def keepAlpha(sentence):
    """Keep only ASCII letters in each whitespace-separated word of *sentence*.

    Every run of characters other than ``a-z``, ``A-Z`` or space inside a word
    is replaced by one space; the words are then re-joined with single spaces
    and the result is stripped of surrounding whitespace.
    """
    pieces = (re.sub('[^a-z A-Z]+', ' ', token) for token in sentence.split())
    return " ".join(pieces).strip()

In [None]:
# Normalise the training reviews: lower-case, then strip tags, punctuation and non-letters.
train['Review'] = (
    train['Review']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
)

In [None]:
# Apply the same normalisation steps to the test reviews.
test['Review'] = (
    test['Review']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
)

In [None]:
def clean_whitespace(text):
    """Return *text* with leading and trailing whitespace removed."""
    return text.strip()

# Trim surrounding whitespace from every review in both frames.
train['Review'] = train['Review'].apply(clean_whitespace)
test['Review'] = test['Review'].apply(clean_whitespace)

In [None]:
# Pack every column after the first into one per-row list stored in a new 'Ratings' column.
# NOTE(review): re-running this cell would fold the existing 'Ratings' column into the new lists.
train['Ratings']= train[train.columns[1:]].values.tolist()
train.head(1)

In [None]:
# Independent copy holding only the review text and its packed label list.
data = train[['Review','Ratings']].copy()
data.head()

In [None]:
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [None]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 96  # passed to the dataset classes as the tokenizer's max_length
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the hold-out DataLoader
EPOCHS = 9  # number of calls to train()
LEARNING_RATE = 1e-05  # learning rate handed to the Adam optimizer
# Tokenizer for the same 'bert-base-uncased' checkpoint the model loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class CustomDataset(Dataset):
    """Torch dataset yielding tokenized reviews together with their label vectors.

    Args:
        dataframe: frame with a ``Review`` text column and a ``Ratings`` column
            holding one list of label values per row.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: fixed token length every review is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.Review = dataframe.Review
        self.targets = self.data.Ratings
        self.max_len = max_len

    def __len__(self):
        return len(self.Review)

    def __getitem__(self, index):
        """Return the tensors for the row at position *index*.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor built from the row's ``Ratings``).
        """
        # Positional lookup: DataLoader samplers hand out 0..len-1, which only
        # matches the frame's index labels when the index has been reset.
        review = " ".join(str(self.Review.iloc[index]).split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length flag
            # and the implicit truncation it relied on.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }

In [None]:
# Creating the dataset and dataloader for the neural network

# Fraction of rows sampled into the training split; the rest form the hold-out split.
train_size = 0.8
# Fixed random_state keeps the split reproducible.
train_dataset= data.sample(frac=train_size,random_state=14)
# Hold-out rows are those not sampled above; both frames get a fresh 0..n-1 index.
test_dataset=  data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both splits so they yield tokenized tensors of length MAX_LEN.
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

In [None]:
# Inspect the first encoded training example.
training_set[0]

In [None]:
# DataLoader settings for the training split.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

# DataLoader settings for the hold-out split.
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model by adding a dropout and a dense layer on top of BERT (bert-base-uncased) to get the final output for the model.

class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output head.

    Args:
        num_labels: number of output logits. Defaults to 12 (presumably the
            11 topics plus polarity described in the introduction — confirm).
        dropout: dropout probability applied before the linear head.
        pretrained_name: checkpoint name passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, num_labels=12, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        # Attribute names l1/l2/l3 are kept so saved state_dict keys stay valid.
        self.l1 = transformers.BertModel.from_pretrained(pretrained_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Size the head from the encoder's config rather than hard-coding 768,
        # so other BERT checkpoints work unchanged.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw logits for a batch of encoded reviews."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used here.
        _, pooled = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.l3(self.l2(pooled))

# Instantiate the classifier (loads the pretrained encoder weights).
model = BERTClass()

In [None]:
# Ask PyTorch to release its unused cached GPU memory.
torch.cuda.empty_cache()

In [None]:
# Setting up the device for GPU usage

from torch import cuda
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

In [None]:
# Move the model's parameters to the selected device.
model.to(device)

In [None]:
# loss function
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between logits *outputs* and *targets*.

    The sigmoid is applied inside the loss, so *outputs* must be raw logits.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [None]:
# optimizer: Adam over all model parameters, using the configured learning rate
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# fine tuning

In [None]:
def train(epoch):
    """Run one pass over ``training_loader``, updating the global ``model``.

    Relies on the module-level ``model``, ``training_loader``, ``optimizer``,
    ``loss_fn`` and ``device``.

    Args:
        epoch: epoch number, used only in the progress message.
    """
    # NOTE(review): defining this function rebinds the name `train`, which
    # earlier in the notebook held the training DataFrame.
    model.train()
    for step, training_data in enumerate(training_loader):
        ids = training_data['ids'].to(device, dtype=torch.long)
        mask = training_data['mask'].to(device, dtype=torch.long)
        token_type_ids = training_data['token_type_ids'].to(device, dtype=torch.long)
        targets = training_data['targets'].to(device, dtype=torch.float)

        # Clear gradients once per step; the original called this twice.
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        # Report the loss on the first batch and every 5000th batch after it.
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        loss.backward()
        optimizer.step()

In [None]:
# Fine-tune for EPOCHS full passes over the training loader.
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
# Persist the fine-tuned weights.
# NOTE(review): '../output' has no file name or extension; if it is an existing
# directory this call will fail — confirm the intended checkpoint path.
torch.save(model.state_dict(),'../output')

In [None]:
class SubmissionDataset(Dataset):
    """Torch dataset yielding tokenized reviews without targets, for inference.

    Args:
        dataframe: frame with a ``Review`` text column.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: fixed token length every review is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.Review = dataframe.Review
        self.max_len = max_len

    def __len__(self):
        return len(self.Review)

    def __getitem__(self, index):
        """Return the ``ids``, ``mask`` and ``token_type_ids`` long tensors for row *index*."""
        # Positional lookup: the frame may be indexed by an arbitrary key
        # (the test data is loaded with index_col='Id'), whereas DataLoader
        # samplers always hand out positions 0..len-1.
        review = " ".join(str(self.Review.iloc[index]).split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length flag
            # and the implicit truncation it relied on.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        }

In [None]:
# Wrap the test frame for inference and inspect its first encoded example.
sub_data = SubmissionDataset(test,tokenizer, MAX_LEN)
sub_data[0]

In [None]:
# Ask PyTorch to release its unused cached GPU memory before inference.
torch.cuda.empty_cache()

In [None]:
# Ensure the model is on the selected device (it was already moved there before training).
model.to(device)

In [None]:
# DataLoader settings for inference on the submission set.
sub_params = {'batch_size': 8,
              # Keep the original row order so each prediction can be matched
              # back to its test row; shuffling would scramble that mapping.
              'shuffle': False,
              'num_workers': 0
              }

sub_loader = DataLoader(sub_data, **sub_params)

In [None]:
# Run inference over the whole submission set, keeping every batch's logits.
# The original called the model once after the loop, so only the last batch
# was ever predicted, and its loop variable overwrote the `data` DataFrame.
model.eval()  # switch off dropout for prediction
batch_outputs = []
with torch.no_grad():  # gradients are not needed for inference
    for batch in sub_loader:
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        # Move each result to the CPU so GPU memory is not held across batches.
        batch_outputs.append(model(ids, mask, token_type_ids).cpu())
# One row of logits per review, in the order the loader yielded them.
outputs = torch.cat(batch_outputs, dim=0)

In [None]:
# Force a garbage-collection pass.
gc.collect()

In [None]:
# Inspect the shape of the model output tensor.
outputs.shape