## Import and installation section

In [None]:
import re
import os
import numpy as np 
import pandas as pd 
import tensorflow as tf

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
# list every file found under the Kaggle input directory
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
# install transformers (the package the DistilBERT tokenizer and model are imported from below)
!pip install transformers

In [None]:
# import the model and tokenizer
from transformers import (DistilBertTokenizerFast, 
                         TFDistilBertForSequenceClassification)    

## Detect and initialize TPU

In [None]:
# Use a TPU when the runtime exposes one; otherwise fall back to whatever
# strategy TensorFlow currently has active (tf.distribute.get_strategy()).
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # `except Exception` instead of a bare `except` so that kernel interrupts
    # (KeyboardInterrupt / SystemExit) are not swallowed; the reason for the
    # fallback is reported instead of being silently discarded.
    print('TPU initialisation failed, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()

print('Number of replicas in sync: ', strategy.num_replicas_in_sync)

In [None]:
# dataframe display settings: never truncate the text shown in a column
pd.options.display.max_colwidth = None

## Read train and test data into pandas DataFrames

In [None]:
# locations of the zipped, tab-separated train and test files
train_path = '/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip'
test_path = '/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip'

train_data = pd.read_csv(train_path, sep='\t')
test_data = pd.read_csv(test_path, sep='\t')

# display the head of train data
train_data.head()

In [None]:
# check the shape of the train data, shown as a (rows, columns) tuple
train_data.shape

In [None]:
# display the head (first rows) of test data
test_data.head()

In [None]:
# check the shape of the test data, shown as a (rows, columns) tuple
test_data.shape

In [None]:
# check the share of examples in each class (relative frequencies, not raw counts)
class_share = train_data['Sentiment'].value_counts(normalize=True)
# the trailing semicolon keeps the notebook from echoing the Axes object
class_share.plot(kind='bar', figsize=(10, 6), xlabel='Sentiments');

- The dataset is highly imbalanced across the sentiment classes.

## Cleaning of text

In [None]:
# negation words that are kept out of the stop-word set
# (presumably because dropping them would hide a change in sentiment)
neg = ["aren't", "didn't", "doesn't", "hadn't",  "haven't", "isn't", 'no', 'not', "shouldn't", "wasn't", "weren't", "wouldn't"]

# set of stop words in english, minus the negations listed above
stop_words = set(stopwords.words('english')).difference(neg)
# NOTE(review): text_cleaning replaces every non-letter (apostrophes included)
# with a space before it filters tokens, so the entries containing an
# apostrophe cannot match any token it produces; only 'no' and 'not' have an
# effect there. Confirm whether the apostrophe-free stems should be kept too.

In [None]:
# this function will clean the text
def text_cleaning(text, exclude=None):
    """Turn a raw phrase into a list of lowercase alphabetic tokens.

    URLs are removed first, while their '.' and '/' separators are still
    intact. The remaining text is lowercased and split into runs of the
    letters a-z, and tokens found in the exclusion set are dropped.

    Args:
        text: phrase to clean. Anything that is not a non-empty string
            (for example None or a float NaN) yields an empty list.
        exclude: optional collection of tokens to drop. When omitted, the
            module-level ``stop_words`` set is used.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    if not isinstance(text, str) or not text:
        return []
    if exclude is None:
        exclude = stop_words

    # strip URLs before any punctuation is touched; once '.' and '/' have been
    # replaced by spaces only the bare "http:" prefix would still match
    without_urls = re.sub(r'http\S+', '', text)

    # lowercase first, then keep only runs of a-z: digits, punctuation,
    # slashes, backslashes and whitespace all act as separators
    tokens = re.findall(r'[a-z]+', without_urls.lower())
    return [token for token in tokens if token not in exclude]

In [None]:
# clean train and test dataframes: each Phrase is replaced by its cleaned
# tokens joined back together with single spaces
for frame in (train_data, test_data):
    frame['Phrase'] = frame['Phrase'].apply(lambda raw: ' '.join(text_cleaning(raw)))

In [None]:
# drop duplicates from train data
# rows whose Phrase repeats an earlier row are removed; no `keep` argument is
# given, so pandas' default of keeping the first occurrence applies.
# inplace = True mutates train_data itself rather than returning a new frame.
train_data.drop_duplicates(subset = ['Phrase'], inplace = True)
# preview the first 8 remaining rows
train_data.head(8)

## Calculate length of the phrase

In [None]:
# add a length column (number of whitespace-separated words in Phrase)
# to both the train and the test data
for frame in (train_data, test_data):
    frame['length'] = frame['Phrase'].apply(lambda phrase: len(phrase.split()))

# filter the phrases from the test data with zero length
has_no_words = test_data['length'] == 0
len_zero_data = test_data[has_no_words]
len_zero_data.shape

## Train and validation split

In [None]:
# select phrases with length > 1 once, then split them into train and
# validation sets (80/20, stratified on Sentiment, fixed seed)
multi_word = train_data[train_data['length'] > 1]

x_train, x_val, y_train, y_val = train_test_split(
    multi_word['Phrase'],
    multi_word['Sentiment'],
    test_size=0.2,
    stratify=multi_word['Sentiment'],
    random_state=42,
)

print(f'Shape of x_train: {x_train.shape}\nShape of y_train: {y_train.shape}')
print(f'Shape of x_val: {x_val.shape}\nShape of y_val: {y_val.shape}')

## Create tokenizer

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# encode the training and validation data with the same options
encode_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(x_train.tolist(), **encode_options)
val_encodings = tokenizer(x_val.tolist(), **encode_options)

## Create training and validation datasets for training

In [None]:
# training pipeline: (encodings, labels) pairs, shuffled with a 10000-element
# buffer, grouped into batches of 32 and repeated indefinitely
train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train.values))
train_dataset = train_dataset.shuffle(10000).batch(32).repeat()

# validation pipeline: same shuffling and batching, but a single pass
val_features = dict(val_encodings)
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, y_val.values))
val_dataset = val_dataset.shuffle(10000).batch(32)

## Create model

In [None]:
# build and compile the classifier inside the distribution strategy's scope;
# the model and optimizer are both created within the `with` block
with strategy.scope():
    # pretrained DistilBERT with a classification head for 5 labels
    # (presumably the 5 values of the Sentiment column; verify against the data)
    model = TFDistilBertForSequenceClassification.from_pretrained(
      'distilbert-base-uncased', num_labels = 5)

    # Adam with a learning rate of 5e-5
    optimizer = tf.keras.optimizers.Adam(learning_rate = 5e-5)
    # the loss is the model's own compute_loss method; accuracy is tracked as the metric
    model.compile(optimizer = optimizer, loss = model.compute_loss, metrics = 
                ['accuracy'])

In [None]:
# fit the model
# `batch_size` is intentionally not passed: both datasets are already batched
# (32 per batch), and Keras expects batch_size to be left unset for tf.data
# inputs because the dataset itself produces the batches.
# The step counts use the same batch size of 32; an explicit steps_per_epoch
# is required because the training dataset repeats indefinitely.
train_steps = len(x_train) // 32
val_steps = len(x_val) // 32

model.fit(train_dataset, epochs=3, steps_per_epoch=train_steps,
          validation_data=val_dataset, validation_steps=val_steps)