In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" style as the global plotting theme.
sns.set(style="whitegrid")

In [None]:
# Read only the first 2,000 rows of the IMDB review dataset.
data = pd.read_csv(
    "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv",
    nrows=2000,
)
print(data.shape)
data.head(10)

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# print(data['sentiment'].value_counts())

# Class balance: one bar per sentiment label.
fig = plt.figure(figsize=(10, 6))
sns.countplot(x='sentiment', data=data, palette=["green", "red"])

In [None]:
# Encode the label as 1 (positive) / 0 (anything else).
# An already-encoded 1 is accepted too, so re-running this cell keeps the
# labels intact instead of collapsing every row to 0.
data['sentiment'] = data['sentiment'].apply(lambda x: 1 if x in (1, 'positive') else 0)

# Data Cleaning

In [None]:
# data[data['review'].str.contains(r'<.*?', regex=True)]

# data[data['review'].str.contains('http')]
# a=data[data['review'].str.contains(r'\\W', regex=True)].reset_index()
# print(a['review'][0])

# print("<: ", data['review_clean'].str.contains('<').sum())
# print(">: ", data['review_clean'].str.contains('>').sum())
# print("/: ", data['review_clean'].str.contains('/').sum())
# print("http: ", data['review_clean'].str.contains('http').sum())
# print("<br />: ", data['review_clean'].str.contains('<br />').sum())

In [None]:
!pip install contractions

In [None]:
import contractions
import re
import string
from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
# English stop-word set; remove_punct() drops any token found in it.
# NOTE(review): needs the NLTK 'stopwords' corpus to be available locally — confirm it is pre-installed or downloaded.
stop_words=set(stopwords.words("english"))

from nltk.stem.wordnet import WordNetLemmatizer
# Shared lemmatizer instance used by preprocessing().
wnl = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``pos_tag``; the upper-cased first
    letter of that tag selects adjective (J), verb (V) or adverb (R).
    Nouns and every other tag map to ``wordnet.NOUN``.
    """
    first_letter = pos_tag([word])[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    return wordnet.NOUN

def remove_punct(text):
    """Drop stop words and strip punctuation from a list of tokens.

    Args:
        text: iterable of word strings.

    Returns:
        A list with every token that is not in ``stop_words``, with all
        ``string.punctuation`` characters removed from it. Tokens that end
        up empty after stripping are left out.
    """
    punct_table = str.maketrans("", "", string.punctuation)
    stripped_tokens = (
        token.translate(punct_table)
        for token in text
        if token not in stop_words
    )
    return [token for token in stripped_tokens if token != ""]

def preprocessing(text):
    """Normalise a raw review into a space-separated string of lemmas.

    Steps, in order: lowercase and trim; drop URLs; turn newlines, HTML tags
    and ``[bracketed]`` spans into spaces; expand contractions; replace every
    non-word character with a space; tokenise; lemmatise each token using the
    part of speech from ``get_wordnet_pos``; finally remove stop words and
    leftover punctuation with ``remove_punct``.

    Args:
        text: the raw review string.

    Returns:
        The cleaned review as a single string of tokens joined by spaces.
    """
    text = text.lower().strip()
    # Raw strings keep the regex escapes (\S, \., \[) from being read as
    # invalid Python string escapes.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # A malformed link (with embedded spaces) that the URL pattern above cannot match.
    text = text.replace("http: //video.google.com/ videoplay?docid=-3001837218936089620&q =innerviews+ jamie+foxx&hl=en ","")
    # Substitute a space rather than nothing, so the words on either side of a
    # newline, tag or bracketed span are not glued into one token.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'\[.*?\]', ' ', text)
    text = contractions.fix(text)
    text = re.sub(r"\W", " ", text) # remove special chars
    tokens = word_tokenize(text)

    lemmas = [wnl.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

    return " ".join(remove_punct(lemmas))


# Clean every review and store the result next to the raw text.
data['review_clean'] = data['review'].apply(preprocessing)

data.head(20)

# Create features

## Length

In [None]:
# Review length in characters, not counting spaces.
data['length'] = data['review'].apply(lambda review: len(review.replace(" ", "")))

sns.displot(
    data=data,
    x="length",
    hue='sentiment',
    col="sentiment",
    bins=20,
    multiple="dodge",
    aspect=1.5,
)

## Count of punctuation marks

In [None]:
def count_punct(text):
    """Return how many characters of ``text`` are in ``string.punctuation``."""
    return sum(1 for char in text if char in string.punctuation)

# Punctuation count of each raw review.
data['count_punct'] = data['review'].apply(count_punct)

sns.displot(
    data=data,
    x="count_punct",
    hue='sentiment',
    col="sentiment",
    bins=20,
    multiple="dodge",
    aspect=1.5,
)

## Count uppercase letters

In [None]:
def count_uppercase(text):
    """Return the number of upper-case characters in ``text``."""
    return sum(1 for char in text if char.isupper())

# Upper-case letter count of each raw review.
data['count_uppercase'] = data['review'].apply(count_uppercase)

sns.displot(
    data=data,
    x="count_uppercase",
    hue='sentiment',
    col="sentiment",
    bins=20,
    multiple="dodge",
    aspect=1.5,
)

## Count exclamation marks

In [None]:
# Number of "!" characters in each raw review.
data['exclamation_marks'] = data['review'].apply(lambda review: review.count("!"))

## Count words

In [None]:
# Number of whitespace-separated words in each raw review.
data['count_words'] = data['review'].apply(lambda review: len(review.split()))

sns.displot(
    data=data,
    x="count_words",
    hue='sentiment',
    col="sentiment",
    bins=20,
    multiple="dodge",
    aspect=1.5,
)

## Average word length

In [None]:
def avg_word_len(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Args:
        text: the string to measure.

    Returns:
        The average word length rounded to 3 decimals, or ``0.0`` when the
        text contains no words (empty or whitespace-only input).
    """
    word_lengths = [len(word) for word in text.split()]

    # np.mean([]) returns NaN and emits a RuntimeWarning; report 0.0 instead.
    if not word_lengths:
        return 0.0

    return round(np.mean(word_lengths), 3)

# Average word length of each raw review.
data['avg_word_len'] = data['review'].apply(avg_word_len)

sns.displot(
    data=data,
    x="avg_word_len",
    hue='sentiment',
    col="sentiment",
    bins=20,
    multiple="dodge",
    aspect=1.5,
)

## VADER

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# Score each raw review once, then unpack the four VADER values into their own columns.
data['scores'] = data['review'].apply(sid.polarity_scores)
for column, key in [
    ('VADER_negative', 'neg'),
    ('VADER_neutral', 'neu'),
    ('VADER_positive', 'pos'),
    ('VADER_compound', 'compound'),
]:
    data[column] = data['scores'].apply(lambda score_dict: score_dict[key])

In [None]:
# One figure per VADER score, each split into a panel per sentiment class.
# A single loop replaces four copy-pasted calls that differed only in `x`.
for vader_column in ["VADER_negative", "VADER_neutral", "VADER_positive", "VADER_compound"]:
    sns.displot(data=data, x=vader_column, hue='sentiment', col="sentiment", bins=20, multiple="dodge", aspect=1.5)

In [None]:
def get_corpus(text):
    """Flatten an iterable of strings into one list of whitespace-separated tokens."""
    return [token.strip() for document in text for token in document.split()]

from collections import Counter

# Flatten the cleaned reviews into one token list and keep the 30 most frequent words.
corpus = get_corpus(data["review_clean"])
counter = Counter(corpus)
most_common = dict(counter.most_common(30))

df = (
    pd.DataFrame.from_dict(most_common, orient='index')
    .reset_index()
    .rename(columns={'index':'Word', 0:'Count'})
)

fig = plt.figure(figsize=(10, 6))
sns.barplot(x="Count", y="Word", data=df, palette="Blues_r_d", orient='h')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_text_ngrams(corpus, n, g):
    """Return the ``n`` most frequent ``g``-grams found in ``corpus``.

    Args:
        corpus: iterable of documents (strings).
        n: how many n-grams to return.
        g: n-gram size; only n-grams of exactly this many words are counted.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(g, g))
    vectorizer.fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)

    ngram_counts = []
    for ngram, column in vectorizer.vocabulary_.items():
        ngram_counts.append((ngram, totals[0, column]))
    ngram_counts.sort(key=lambda pair: pair[1], reverse=True)

    return ngram_counts[:n]

# The 20 most frequent bigrams of the cleaned reviews, as a {bigram: count} mapping.
most_common_bi = dict(get_top_text_ngrams(data["review_clean"], 20, 2))

df = (
    pd.DataFrame.from_dict(most_common_bi, orient='index')
    .reset_index()
    .rename(columns={'index':'Word', 0:'Count'})
)

fig = plt.figure(figsize=(10, 6))
sns.barplot(x="Count", y="Word", data=df, palette="Blues_r_d", orient='h')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: fit the vocabulary on the cleaned reviews and count term
# occurrences per review in a single pass.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(data["review_clean"])
# Printed shape is that of the document-term count matrix.
print(X_counts.shape)

In [None]:
# Dense document-term table with the vocabulary terms as column labels.
X_counts_df = pd.DataFrame(X_counts.toarray())
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when it exists and fall back only on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    X_counts_df.columns = count_vect.get_feature_names_out()
else:
    X_counts_df.columns = count_vect.get_feature_names()
X_counts_df

In [None]:
# Bag-of-words columns get string names ("word_<term>") so the final frame has
# only string column labels: scikit-learn >= 1.2 refuses to fit on a frame that
# mixes string and integer names. The prefix also keeps vocabulary terms from
# clashing with the hand-crafted feature names (e.g. a token "length").
vocab_terms = sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)
bow_features = pd.DataFrame(
    X_counts.toarray(),
    columns=[f"word_{term}" for term in vocab_terms],
    index=data.index,  # align rows with `data` for the concat below
)

handcrafted_columns = [
    'VADER_negative', 'VADER_neutral', 'VADER_positive', 'VADER_compound',
    'count_words', 'avg_word_len', 'length', 'count_punct',
    'count_uppercase', 'exclamation_marks',
]

X_features = pd.concat([data[handcrafted_columns], bow_features], axis=1)
X_features.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

# Fixed seeds make the cross-validated accuracy reproducible across runs;
# shuffling keeps the folds from depending on the row order of the file.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_score(rf, X_features, data['sentiment'], cv=k_fold, scoring='accuracy', n_jobs=-1).mean()

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews. The split is seeded so the reported metrics are
# reproducible, and stratified so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    data['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=data['sentiment'],
)

rf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_model = rf.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
# average='binary' reports the metrics for the positive class (label 1).
precision, recall, fscore, support = score(y_test, y_pred, average='binary')
print(f'Precision: {round(precision, 3)} \nRecall: {round(recall, 3)} \nAccuracy: {round((y_pred==y_test).sum() / len(y_pred),3)}')

In [None]:
# Plot the 15 most important features. Slicing `feature_importances_[:15]`
# would only look at the first 15 columns of the training frame, whatever
# their importance.
all_importances = rf_model.feature_importances_
sorted_idx = np.argsort(all_importances)[-15:]  # ascending, so the largest bar ends up on top
feature_importance = all_importances[sorted_idx]
pos = np.arange(sorted_idx.shape[0]) + .5

fig = plt.figure(figsize=(17, 6))
plt.barh(pos, feature_importance, align='center')
plt.yticks(pos, np.array(X_train.columns)[sorted_idx])
plt.title('Feature Importance')