In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found,
# one path per line, directory by directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in files]
    if full_paths:
        print('\n'.join(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# `dfori` keeps the CSV exactly as loaded; `df` is a separate working frame
# without the three columns that are not needed for the analysis.
dfori = pd.read_csv('/kaggle/input/imdb-review-dataset/imdb_master.csv')
df = dfori.drop(columns=['Unnamed: 0', 'type', 'file'])
df

We just want to focus on the review and its original label. What are the labels?

In [None]:
# Tally how many rows carry each distinct label value.
label_column = df['label']
label_column.value_counts()

For simplicity, let's remove the rows labelled `unsup` and keep only the `pos` and `neg` reviews.

In [None]:
# Keep only the rows whose label is not 'unsup' (i.e. the pos/neg reviews).
# .copy() turns the filtered result into an independent frame, so adding new
# columns to `df` later on does not raise pandas' SettingWithCopyWarning.
df = df[df['label'] != 'unsup'].copy()
df['label'].value_counts() # label counts after filtering

# Preprocessing using CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder configured with the built-in English stop-word list.
cv = CountVectorizer(stop_words='english')

# Learn the vocabulary from every review and encode the reviews as a
# document-term matrix in a single call.
dtm = cv.fit_transform(df['review'])

## Define X and Y variables

Because this is supervised learning, we need an independent variable (X, the document-term matrix) and a dependent variable (y, the label).

In [None]:
# Features: the document-term matrix. Target: the label column.
x, y = dtm, df['label']

# Train and Predict
Let's try using Random Forest Classifier.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split into train and test portions (train_test_split's default proportions);
# the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 1)

# Seed the forest as well: it is a randomised model, and without a fixed
# random_state the fitted trees (and therefore the reported scores) change
# from one run to the next.
rfc = RandomForestClassifier(random_state = 1)

# Train
rfc.fit(x_train, y_train)

# Predict
y_pred = rfc.predict(x_test)

# Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report

# Confusion matrix, overall accuracy and the per-class report, in that order.
evaluation_outputs = (
    confusion_matrix(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
)

# Consecutive results are separated by a line that holds a single space.
print(*evaluation_outputs, sep="\n \n")

# Conclusion

Using Random Forest Classification, we can infer that the sentiment analysis using text classification results in about 85% accuracy.

# Sentiment Analysis using NLTK SIA Compound Score

The method above is supervised learning. Now we'll try an approach that needs no labelled training data: NLTK's VADER lexicon-based Sentiment Intensity Analyzer (SIA).

In [None]:
# Set up NLTK's VADER analyser; its lexicon is downloaded before the analyser is built.
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# 'score' holds the full polarity-score dict returned for each review.
df['score'] = df['review'].apply(sia.polarity_scores)

# 'compound' holds just the 'compound' entry of that dict.
df['compound'] = df['score'].map(lambda scores: scores['compound'])

# Show the frame with the two new columns.
df

* The compound score indicates whether a review tends to be positive (> 0) or negative (< 0).
* We can see from the above dataframe that in some rows, the compound score does not reflect the 'true' label.
* However, SIA did at least guess the last five rows correctly.