In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Sentiment analysis using Text Classification

In [None]:
# Read the IMDB reviews CSV into a DataFrame, print its (rows, columns)
# shape, and show the first rows as the cell output.
imdb = pd.read_csv(
    '/kaggle/input/trainings/IMDB Dataset.csv'
)
print(imdb.shape)
imdb.head(5)

In [None]:
# Count how many reviews carry each sentiment label (class balance check).
imdb.loc[:, 'sentiment'].value_counts()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk

# NLTK's English stop-word list, extended with leftover line-break tokens
# and a few words added by hand for this review corpus.
common_stopwords = nltk.corpus.stopwords.words('english')
custom_stopwords = ['<br />', 'br', 'film', 'movie', 'one', 'would']
stopwords = common_stopwords + custom_stopwords

# Join every lower-cased negative review into one string for the cloud.
docs_string_negative = ' '.join(
    imdb.loc[imdb['sentiment'] == 'negative', 'review'].str.lower()
)

# Build the cloud object first, then feed it the text and draw it.
wc = WordCloud(background_color='white', stopwords=stopwords)
wc = wc.generate(docs_string_negative)
plt.imshow(wc)

In [None]:
# Word cloud for the POSITIVE reviews.
# Join every lower-cased positive review into one string ...
docs_string_positive = ' '.join(imdb[imdb['sentiment'] == 'positive']['review'].str.lower())
# ... and generate the cloud from that positive text. Generating from
# `docs_string_negative` here would simply redraw the negative cloud.
wc = WordCloud(background_color='white', stopwords=stopwords).generate(docs_string_positive)
plt.imshow(wc)

### Text preprocessing

In [None]:
# Number of missing values per column.
imdb.isnull().sum()

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer

stemmer = PorterStemmer()

# Text clean-up pipeline: lower-case -> drop HTML line breaks -> keep only
# letters and whitespace -> remove stop words -> Porter-stem each review.
docs = (
    imdb['review']
    .str.lower()
    # Replace the tag with a space (not ''), otherwise the words on either
    # side of a line break are fused into a single bogus token.
    .str.replace('<br />', ' ', regex=False)
    # regex=True must be explicit: with regex=False (the pandas >= 2.0
    # default) the pattern is matched literally and nothing is stripped.
    # The raw string keeps `\s` from being an invalid escape sequence.
    .str.replace(r'[^a-z\s]', '', regex=True)
)
docs = docs.apply(remove_stopwords)
# stem_documents returns a plain list; rebuild the Series on imdb's index so
# the documents stay aligned with the labels in imdb['sentiment'].
docs = pd.Series(stemmer.stem_documents(docs), index=imdb.index)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Hold out 20% of the reviews for evaluation; the fixed random_state makes
# the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    docs,
    imdb['sentiment'],
    test_size=0.2,
    random_state=1,
)

# Learn the bag-of-words vocabulary from the training reviews only
# (min_df=10 is passed to drop rare terms).
vectorizer = CountVectorizer(min_df=10)
vectorizer.fit(train_x)

In [None]:
# Vocabulary learned by the fitted vectorizer. get_feature_names() was
# removed in scikit-learn 1.2; get_feature_names_out() is its replacement.
# It returns an array, so wrap it in list() to keep `vocab` a list as before.
vocab = list(vectorizer.get_feature_names_out())
# Document-term matrices for both splits, built with the train-fitted vocabulary.
train_dtm = vectorizer.transform(train_x)
test_dtm = vectorizer.transform(test_x)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fit multinomial Naive Bayes on the training matrix, predict the held-out
# reviews, and report the share of correct predictions.
model = MultinomialNB()
model.fit(train_dtm, train_y)
test_y_pred = model.predict(test_dtm)
print('Accuracy: ', accuracy_score(test_y, test_y_pred))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Repeat the vectorization with TF-IDF weights instead of raw counts; the
# vocabulary is again learned from the training reviews only.
vectorizer = TfidfVectorizer().fit(train_x)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. list() keeps `vocab` a list as before.
vocab = list(vectorizer.get_feature_names_out())
train_dtm = vectorizer.transform(train_x)
test_dtm = vectorizer.transform(test_x)

In [None]:
# Refit Naive Bayes on the current train_dtm / test_dtm (rebuilt by the
# TF-IDF cell) and report held-out accuracy.
model = MultinomialNB()
model.fit(train_dtm, train_y)
test_y_pred = model.predict(test_dtm)
print('Accuracy: ', accuracy_score(test_y, test_y_pred))

## Sentiment analysis using rule-based methods

In [None]:
# Raw review text for the rule-based approach: case and punctuation are left
# intact, only the HTML line breaks are handled.
# The tag is replaced with a space (not ''), otherwise the sentences on either
# side of a break are glued together ("great.<br /><br />But" -> "great.But").
# regex=False states that the tag is matched literally.
# NOTE: this rebinds `docs`, which earlier held the stemmed documents.
docs = imdb['review'].str.replace('<br />', ' ', regex=False)
docs.head()


In [None]:
# Download NLTK's 'vader_lexicon' resource (presumably required by the VADER
# SentimentIntensityAnalyzer used in the following cells).
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create the VADER analyzer once (later cells reuse `analyzer`), then score
# a short example sentence; the scores are shown as the cell output.
analyzer = SentimentIntensityAnalyzer()
review = 'i like tea'
analyzer.polarity_scores(review)

In [None]:
# Hand-computed normalisation of a raw score:
#     compound = score / sqrt(score^2 + 15)
# which always lies strictly between -1 and 1.
# (15 is presumably VADER's normalisation constant - confirm against the library.)
total_score = 0.5
compound_score = total_score / np.sqrt(total_score * total_score + 15)
compound_score

In [None]:
# Score a sentence that contains both a negative ("hate") and a positive
# ("love") word; the result is shown as the cell output.
analyzer.polarity_scores('i hate coffee i love tea')

In [None]:
# Print the scores for several variants of the same sentence (capitalised
# word, exclamation marks, emoticon, intensifier) so they can be compared
# line by line.
for variant in (
    'i like tea',
    'i LIKE tea',
    'i like tea!!!',
    'i like tea :)',
    'i very much like tea :)',
):
    print(analyzer.polarity_scores(variant))