###### Import the Data

Our first step is to import the data. The Kaggle link offers three datasets to choose from: IMDb, Amazon, or Yelp. We are going to use IMDb.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# The file is tab-separated and has no header row, so the two column names
# are supplied here: the review text and its sentiment label.
data = pd.read_csv(
    '/content/imdb_labelled.txt',
    sep='\t',
    names=['Reviews', 'Sentiment'],
)
data.info()

This shows that we have 748 rows in our dataset.
To view the data, we can evaluate the `data` object on its own as the last line of a cell, which displays the DataFrame (pandas abbreviates long tables by default).

In [None]:
# A bare expression on the last line of a cell is rendered as the cell's output.
data

To avoid any bias from the original ordering of the rows, we shuffle the data before splitting it.

In [None]:
# sample(frac=1) draws every row exactly once in random order, i.e. a full
# shuffle; the fixed random_state makes that order the same on every run.
# The shuffle is taken from a deep copy, so `data` itself is left untouched.
shuffled_rows = data.copy(deep=True).sample(frac=1, random_state=1)
# reset_index() renumbers the rows from 0 and keeps each row's previous
# label as a new `index` column.
shuffle_data = shuffled_rows.reset_index()
shuffle_data.head(10)

We can now split the shuffled data into training, development, and test sets.

In [None]:
# Row positions at which the shuffled frame is cut: rows 0-638 go to
# training, rows 639-713 to development, and rows 714 onward to testing.
train_end, dev_end = 639, 714

train_data = shuffle_data.iloc[:train_end]
dev_data = shuffle_data.iloc[train_end:dev_end]
test_data = shuffle_data.iloc[dev_end:]

# Summarise each split in turn to confirm its size.
for split in (train_data, dev_data, test_data):
  split.info()

###### Making a word frequency list and dictionary

We are going to build a list of all the words and a dictionary that maps each word to its frequency.

To make our job easier, we first remove all the special characters from the data.

In [None]:
def _strip_special_chars(frame):
  """Return a new frame in which every character of the text values that is
  not an ASCII letter or digit has been replaced by a single space."""
  return frame.replace(to_replace="[^a-zA-Z0-9]", value=" ", regex=True)


# Apply the same clean-up to the full data set, the shuffled copy and each split.
data = _strip_special_chars(data)
shuffle_data = _strip_special_chars(shuffle_data)
train_data = _strip_special_chars(train_data)
dev_data = _strip_special_chars(dev_data)
test_data = _strip_special_chars(test_data)

Next, we build the list and the dictionary by splitting each review into words and counting them, both overall and separately for positive and negative reviews.

In [None]:
# Frequency tables: lower-cased word -> number of occurrences, counted over
# all reviews, over positive reviews (Sentiment == 1) and over the remaining
# (negative) reviews.
words_freq = {}
words_freq_pos = {}
words_freq_neg = {}

for sentence, sentence_sentiment in zip(shuffle_data['Reviews'], shuffle_data['Sentiment']):
  # Pick the per-class table once per review instead of once per word.
  class_freq = words_freq_pos if sentence_sentiment == 1 else words_freq_neg

  # split() with no argument collapses runs of whitespace. Splitting on a
  # single ' ' would emit an empty-string "word" for every pair of adjacent
  # spaces left behind where special characters were replaced.
  for word in sentence.split():
    word_lower = word.lower()
    # Dict lookups keep each update O(1); scanning a growing list for
    # membership would make the whole pass quadratic.
    words_freq[word_lower] = words_freq.get(word_lower, 0) + 1
    class_freq[word_lower] = class_freq.get(word_lower, 0) + 1

# Dicts preserve insertion order, so each list holds its words in the order
# they were first encountered.
word_list = list(words_freq)
word_list_pos = list(words_freq_pos)
word_list_neg = list(words_freq_neg)

print('word list:',word_list)
print('word freq:',words_freq)

print('word list pos:',word_list_pos)
print('word freq pos:',words_freq_pos)

print('word list neg:',word_list_neg)
print('word freq neg:',words_freq_neg)

We can improve the list by removing rare words, i.e. words with frequency < 5.

In [None]:
# Words seen fewer than this many times across all reviews count as rare
# and are dropped from every table.
MIN_WORD_FREQ = 5

# Overall vocabulary: keep only the sufficiently frequent words.
words_freq = {k: v for k, v in words_freq.items() if v >= MIN_WORD_FREQ}
word_list = sorted(words_freq)

# Positive-review tables, restricted to the surviving vocabulary. The filter
# uses the overall count, so a kept word may still have a per-class count
# below the threshold.
word_list_pos = sorted(item for item in word_list_pos if item in words_freq)
words_freq_pos = {k: v for k, v in words_freq_pos.items() if k in words_freq}

# Negative-review tables, built from the negative inputs. Deriving them from
# the positive list and dict would make them copies of the positive tables.
word_list_neg = sorted(item for item in word_list_neg if item in words_freq)
words_freq_neg = {k: v for k, v in words_freq_neg.items() if k in words_freq}

###### Calculate the probability of the word "the"

As part of our exercise, we have to calculate the probability of the word "the".