# EDA Movie Review Dataset Rotten Tomatoes

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Imports

In [None]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_colwidth', 0)

### Loading Data

In [None]:
# Read the tab-separated train and test files into DataFrames.
train_df, test_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '../input/movie-dataset-rotten-tomatoes/train.tsv',
        '../input/movie-dataset-rotten-tomatoes/test.tsv',
    )
)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# (number of rows, number of columns) of the training data.
train_df.shape

In [None]:
# Non-null entries per column; a count below the row count would indicate missing values.
train_df.count()

### No Missing Values

In [None]:
# Number of distinct sentence ids in the training data.
train_df['SentenceId'].nunique(dropna=False)

In [None]:
# Distinct phrase texts versus distinct phrase ids.
train_df['Phrase'].nunique(dropna=False), train_df['PhraseId'].nunique(dropna=False)

In [None]:
# Distinct sentiment codes present in the training data.
train_df.Sentiment.unique()

```
The sentiment labels are:

0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive
```

## Let's check the sentiment distribution in the train set

In [None]:
# Sentiment id -> label lookup; built once here instead of on every call.
_SENTIMENT_LABELS = {
    0: 'negative',
    1: 'somewhat negative',
    2: 'neutral',
    3: 'somewhat positive',
    4: 'positive',
}


def id_to_label(id):
    """Return the human-readable label for a numeric sentiment id.

    Args:
        id: Sentiment code, one of 0-4. The parameter name shadows the
            builtin ``id``; it is kept so existing callers keep working.

    Returns:
        The label string for the given code (e.g. ``'neutral'`` for 2).

    Raises:
        KeyError: If ``id`` is not one of the known sentiment codes.
    """
    return _SENTIMENT_LABELS[id]

In [None]:
# Translate each numeric sentiment code into its text label.
train_df['sentiment_label'] = train_df['Sentiment'].map(id_to_label)

In [None]:
# Preview the data with the new sentiment_label column.
train_df.head()

In [None]:
# Number of rows per sentiment label.
label_column = train_df['sentiment_label']
sentiment_distribution_counts = label_column.groupby(label_column).count()

In [None]:
# Bar chart of how many phrases carry each sentiment label.
# The column is passed through the `x=` keyword: seaborn 0.11 deprecated
# passing the data vector positionally, and later releases treat the first
# positional argument as `data`.
plt.figure(figsize=(16,9))
sns.countplot(x=train_df['sentiment_label'])

In [None]:
# Keep only the first row seen for each SentenceId.
temp_df = train_df.loc[~train_df.duplicated(subset=['SentenceId'])]

In [None]:
# Sentiment distribution restricted to one row per sentence.
# The column is passed through the `x=` keyword: seaborn 0.11 deprecated
# passing the data vector positionally, and later releases treat the first
# positional argument as `data`.
plt.figure(figsize=(16,9))
plt.title('Review Sentiments on Unique Sentences')
sns.countplot(x=temp_df['sentiment_label'])

### Let's remove punctuation marks as well
### There are multiple ways to do it 
### I am using RegexpTokenizer from nltk
### (Refer) https://www.kite.com/python/answers/how-to-remove-all-punctuation-marks-with-nltk-in-python

In [None]:
import nltk

# Tokenizer that keeps runs of word characters; built once and reused for
# every phrase instead of being re-created on each call.
_WORD_TOKENIZER = nltk.RegexpTokenizer(r"\w+")


def remove_punctuation(phrase):
    """Return `phrase` with punctuation removed.

    The phrase is split into runs of word characters and the pieces are
    re-joined with single spaces. Anything that is not a word character
    (punctuation, apostrophes, extra whitespace) acts as a separator and is
    dropped.

    Args:
        phrase: Text to clean.

    Returns:
        The cleaned text; an empty string if `phrase` has no word characters.
    """
    return ' '.join(_WORD_TOKENIZER.tokenize(phrase))

In [None]:
# Punctuation-free version of every phrase.
train_df['clean_phrase'] = train_df['Phrase'].map(remove_punctuation)

In [None]:
# Preview the data with the new clean_phrase column.
train_df.head()

### Words like `a`, `am`, `and`, `any`, etc. carry little meaning on their own
### Such words are called `Stop Words` and are generally omitted from the analysis

### Let's find and remove such stop words from our dataset

```
If you have not downloaded already, use this

import nltk
nltk.download('stopwords')
```

In [None]:
from nltk.corpus import stopwords
# English stop words from nltk, stored as a set for fast membership tests.
english_stops = {*stopwords.words('english')}

In [None]:
# Number of distinct stop words in the set.
print(len(english_stops))

### nltk provides us with 179 unique stopwords; we will use this set to filter our `clean_phrase` column

In [None]:
def remove_stop_words(phrase):
    """Drop English stop words from a space-separated phrase.

    Words are compared against `english_stops` in lowercase, but the words
    that are kept retain their original casing.

    Args:
        phrase: Text whose words are separated by single spaces.

    Returns:
        The remaining words joined by single spaces.
    """
    kept_words = [
        token
        for token in phrase.split(' ')
        if token.lower() not in english_stops
    ]
    return ' '.join(kept_words)

In [None]:
# Stop-word-free version of every cleaned phrase.
train_df['phrase_without_stopwords'] = train_df['clean_phrase'].map(remove_stop_words)

In [None]:
# Preview the data with the new phrase_without_stopwords column.
train_df.head()

### Adding Phrase Length as another feature

In [None]:
# Number of words left in each phrase after stop-word removal.
# `split()` without an argument is used so an empty phrase counts as 0 words;
# `''.split(' ')` returns [''] and would report it as a one-word phrase.
train_df['phrase_length'] = train_df['phrase_without_stopwords'].apply(lambda x: len(x.split()))

In [None]:
# Preview the data with the new phrase_length column.
train_df.head()

### Let's analyze 1 word phrases

In [None]:
# Rows whose stop-word-free phrase has a phrase_length of exactly 1.
one_word_review = train_df.loc[train_df['phrase_length'] == 1]

In [None]:
# Preview the one-word phrases.
one_word_review.head()

### Some data cleaning (Removing rows with only whitespace in `phrase_without_stopwords`)

In [None]:
# Drop rows whose `phrase_without_stopwords` is missing, empty, or made up of
# whitespace only. Only that column is inspected, so values in the other
# columns are left untouched; a frame-wide replace of "" / " " with NaN
# would also blank out matching values in unrelated columns.
nan_value = float("NaN")  # kept in case other cells still refer to it
phrases_to_check = train_df['phrase_without_stopwords']
blank_phrase_mask = phrases_to_check.isna() | (phrases_to_check.str.strip() == '')
train_df.drop(index=train_df.index[blank_phrase_mask], inplace=True)

In [None]:
# Recompute the one-word phrases now that blank rows have been removed.
one_word_review = train_df.loc[train_df['phrase_length'] == 1]

In [None]:
# Preview the one-word phrases after cleaning.
one_word_review.head()

### At this point we can visualize some of the most common words using a word cloud

In [None]:
from wordcloud import WordCloud

In [None]:
def show_wordcloud(data, stopwords, title = None):
    """Render a word cloud for `data` with matplotlib.

    Adapted from
    `https://www.kaggle.com/parth05rohilla/sentiment-analysis-using-7-different-techniques`

    Args:
        data: Either a single string or an iterable of strings (for example
            a pandas Series of phrases). Iterables are joined with spaces so
            that every item contributes to the cloud. Calling ``str()`` on a
            Series would only give its truncated printed representation.
        stopwords: Words handed to ``WordCloud`` through its ``stopwords``
            option.
        title: Optional figure title.
    """
    if isinstance(data, str):
        text = data
    else:
        try:
            text = ' '.join(str(item) for item in data)
        except TypeError:
            # Not iterable: fall back to the object's string form.
            text = str(data)

    wordcloud = WordCloud(
        background_color='black',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1  # fixed seed so the layout is reproducible
    ).generate(text)
    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
# Word cloud over every phrase in the training data.
show_wordcloud(
    data=train_df['phrase_without_stopwords'],
    stopwords=english_stops,
    title='Most Common Words from the whole corpus',
)

In [None]:
# Word cloud for phrases with sentiment code 0.
negative_phrases = train_df.loc[train_df['Sentiment'] == 0, 'phrase_without_stopwords']
show_wordcloud(negative_phrases, english_stops, 'Negative Reviews')

In [None]:
# Word cloud for phrases with sentiment code 1.
somewhat_negative_phrases = train_df.loc[train_df['Sentiment'] == 1, 'phrase_without_stopwords']
show_wordcloud(somewhat_negative_phrases, english_stops, 'Somewhat Negative Reviews')

In [None]:
# Word cloud for phrases with sentiment code 2.
neutral_phrases = train_df.loc[train_df['Sentiment'] == 2, 'phrase_without_stopwords']
show_wordcloud(neutral_phrases, english_stops, 'Neutral Reviews')

In [None]:
# Word cloud for phrases with sentiment code 3.
somewhat_positive_phrases = train_df.loc[train_df['Sentiment'] == 3, 'phrase_without_stopwords']
show_wordcloud(somewhat_positive_phrases, english_stops, 'Somewhat Positive Reviews')

In [None]:
# Word cloud for phrases with sentiment code 4.
positive_phrases = train_df.loc[train_df['Sentiment'] == 4, 'phrase_without_stopwords']
show_wordcloud(positive_phrases, english_stops, 'Positive Reviews')

### A closer look at one_word_review (one-word phrases)

### Let's see which words are used most often for each sentiment

In [None]:
# Split the one-word phrases into one group per sentiment label.
grouped_by_sentiment = one_word_review.groupby(by='sentiment_label')

In [None]:
from collections import Counter, defaultdict

# For every sentiment label, count how often each one-word phrase occurs.
# The phrase column is selected by name rather than by its position in the
# row, so adding or reordering columns cannot silently change what is counted.
# Counter is a dict subclass, so the result supports the same dict operations.
sentiment_word_count_dict = {
    sentiment: Counter(group['phrase_without_stopwords'])
    for sentiment, group in grouped_by_sentiment
}

In [None]:
# Distinct sentiment labels; these are the keys looked up in sentiment_word_count_dict below.
train_df['sentiment_label'].unique()

In [None]:
# Pull out the word-count mapping of each sentiment label into its own name.
(
    negative_sentiment_dict,
    somewhat_negative_sentiment_dict,
    neutral_sentiment_dict,
    somewhat_positive_sentiment_dict,
    positive_sentiment_dict,
) = (
    sentiment_word_count_dict[label]
    for label in (
        'negative',
        'somewhat negative',
        'neutral',
        'somewhat positive',
        'positive',
    )
)

In [None]:
import operator


def _top_ten_by_count(word_counts):
    """Return the ten (word, count) pairs with the highest counts, largest first."""
    ranked = sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:10]


sorted_negative_sentiment_words = _top_ten_by_count(negative_sentiment_dict)
sorted_somewhat_negative_sentiment_words = _top_ten_by_count(somewhat_negative_sentiment_dict)
sorted_neutral_sentiment_words = _top_ten_by_count(neutral_sentiment_dict)
sorted_somewhat_positive_sentiment_words = _top_ten_by_count(somewhat_positive_sentiment_dict)
sorted_positive_sentiment_words = _top_ten_by_count(positive_sentiment_dict)

### Plotting Top 10 negative sentiment words

In [None]:
# Bar chart of the most frequent one-word phrases labelled negative.
negative_words = [word for word, _count in sorted_negative_sentiment_words]
negative_counts = [count for _word, count in sorted_negative_sentiment_words]
plt.figure(figsize=(32,9))
plt.xticks(rotation=90)
plt.title("Top 10 one word Negative Reviews")
sns.barplot(x=negative_words, y=negative_counts)


In [None]:
# Bar chart of the most frequent one-word phrases labelled somewhat negative.
somewhat_negative_words = [word for word, _count in sorted_somewhat_negative_sentiment_words]
somewhat_negative_counts = [count for _word, count in sorted_somewhat_negative_sentiment_words]
plt.figure(figsize=(32,9))
plt.xticks(rotation=90)
plt.title("Top 10 one word Somewhat Negative Reviews")
sns.barplot(x=somewhat_negative_words, y=somewhat_negative_counts)

In [None]:
# Bar chart of the most frequent one-word phrases labelled neutral.
neutral_words = [word for word, _count in sorted_neutral_sentiment_words]
neutral_counts = [count for _word, count in sorted_neutral_sentiment_words]
plt.figure(figsize=(32,9))
plt.xticks(rotation=90)
plt.title("Top 10 one word Neutral Reviews")
sns.barplot(x=neutral_words, y=neutral_counts)

In [None]:
# Bar chart of the most frequent one-word phrases labelled somewhat positive.
somewhat_positive_words = [word for word, _count in sorted_somewhat_positive_sentiment_words]
somewhat_positive_counts = [count for _word, count in sorted_somewhat_positive_sentiment_words]
plt.figure(figsize=(32,9))
plt.xticks(rotation=90)
plt.title("Top 10 one word Somewhat Positive Reviews")
sns.barplot(x=somewhat_positive_words, y=somewhat_positive_counts)

In [None]:
# Bar chart of the most frequent one-word phrases labelled positive.
positive_words = [word for word, _count in sorted_positive_sentiment_words]
positive_counts = [count for _word, count in sorted_positive_sentiment_words]
plt.figure(figsize=(32,9))
plt.xticks(rotation=90)
plt.title("Top 10 one word Positive Reviews")
sns.barplot(x=positive_words, y=positive_counts)

### Similarly, we can analyze the data further to understand it even better