# Bag of Words Meets Bags of Popcorn - Sentiment Analysis

## 1. Importing Necessary Libraries

In [None]:
# utilities
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 
import re
import os
import pickle
import random

# plotting and visualizing
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
import seaborn as sns


# sklearn

 # classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

 # metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix

 # pipeline
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

## 2. Preprocessing

### Extracting zip files

In [None]:
from zipfile import ZipFile

# Unpack both competition archives into the current directory.
# NOTE(review): the listing below looks at /kaggle/working/ — presumably that
# is the notebook's working directory; confirm when running elsewhere.
for archive_path in (
    "../input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip",
    "../input/word2vec-nlp-tutorial/testData.tsv.zip",
):
    with ZipFile(archive_path, "r") as archive:
        archive.extractall(".")

print(os.listdir("/kaggle/working/"))

### Loading the train and test data

In [None]:
# Locations of the extracted competition files.
path_train = "/kaggle/working/labeledTrainData.tsv"
path_test = "/kaggle/working/testData.tsv"

# Both files are tab-separated, so they are parsed with a tab separator.
df_train, df_test = (
    pd.read_csv(tsv_path, sep = "\t") for tsv_path in (path_train, path_test)
)

## 3. Exploratory Data Analysis

In [None]:
# Preview the first rows of the labeled training data.
df_train.head()

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# Shapes of the training and test frames, shown as a pair.
df_train.shape, df_test.shape

In [None]:
# Print pandas' summary of the training frame.
df_train.info()

## 4. Data Visualization

### Let's look at the number of reviews for each sentiment

In [None]:
# Plot how many reviews fall into each sentiment class. The column is passed
# explicitly as ``x``: in recent seaborn releases the first positional
# parameter of countplot is ``data``, so a positional Series is no longer
# interpreted as the variable to count.
sns.countplot(x = df_train["sentiment"])

### Both sentiment classes have the same number of reviews, so the dataset is balanced.

## 5. NLP Preprocessing

### Here we create a list of the row numbers, from 0 to 24999.

In [None]:
# Candidate row positions for sampling reviews. Derived from the frame itself
# instead of a hard-coded 25000, so the positions stay valid if the number of
# rows changes.
num_list = list(range(len(df_train)))

### Here we choose a random number from the list and look at the corresponding review.

In [None]:
# Pick a random row position and show the review stored there.
i = random.choice(num_list)
df_train["review"].iloc[i]

In [None]:
# Draw another random row position and show a second review.
i = random.choice(num_list)
df_train["review"].iloc[i]

### I noticed some HTML tags in the text, so I remove them.

In [None]:
# Replace each HTML line-break tag with a space rather than deleting it, so the
# words on either side of the tag are not glued together into one token.
# regex = False: the tag is a literal string, not a pattern.
df_train["review"] = df_train["review"].str.replace("<br />", " ", regex = False)

### Here we clean the text. First we define the stop-word list, the stemmer, and the cleaning regular expression. Then, for each review, we convert the text to lowercase, replace everything matched by the cleaning regex (user mentions, links, and special characters) with a space, and use strip() to remove leading and trailing whitespace. Next we split the text into tokens and keep only the tokens that are not in the stop-word list; if stem = True is passed to the function, each kept token is stemmed. Finally we join the tokens back into a single string.

In [None]:
# English stop words, kept as a set so the per-token membership test in
# preprocess() is a hash lookup instead of a scan over a list.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

# Text to blank out: @mentions, links, and any run of characters that are not
# ASCII letters or digits. Written as a raw string so that "\S" reaches the
# regex engine unchanged instead of being an invalid string escape.
# NOTE(review): the "http?:\S" alternative looks redundant next to
# "https?:\S+"; it is left untouched here — confirm before removing it.
cleaning = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

def _fold_accents(text):
    """Return ``text`` with accented letters reduced to their base letters."""
    # Local import: this helper only needs the standard library.
    from unicodedata import combining, normalize

    try:
        # Fast path: pure-ASCII text has nothing to fold.
        text.encode("ascii")
    except UnicodeEncodeError:
        # NFKD splits an accented letter into its base letter plus combining
        # marks; dropping only the marks keeps the base letter. Other
        # non-ASCII characters are left in place for the cleaning regex.
        return "".join(ch for ch in normalize("NFKD", text) if not combining(ch))
    return text


def preprocess(text, stem = False):
    """Turn a raw review into a space-separated string of cleaned tokens.

    The input is converted with ``str()``, accents are folded to base
    letters, the text is lowercased, everything matched by the notebook-level
    ``cleaning`` regex is replaced by a space, and tokens present in
    ``stop_words`` are dropped.

    Parameters
    ----------
    text : object
        The review; anything accepted by ``str()``.
    stem : bool, default False
        When true, every kept token is stemmed with the notebook-level
        ``stemmer``.

    Returns
    -------
    str
        The kept tokens joined by single spaces (empty if none remain).
    """
    # Fold accents before cleaning: otherwise an accented letter matches the
    # "not a letter or digit" part of the regex and is replaced by a space,
    # which cuts the word in two.
    folded = _fold_accents(str(text))
    # Remove links, user mentions and special characters.
    cleaned = re.sub(cleaning, " ", folded.lower()).strip()
    kept = [token for token in cleaned.split() if token not in stop_words]
    if stem:
        kept = [stemmer.stem(token) for token in kept]
    return " ".join(kept)

In [None]:
# Clean and stem every training review, replacing the column in place.
df_train["review"] = df_train["review"].apply(preprocess, stem = True)

### Eliminating accents

In [None]:
from unicodedata import normalize
# Decompose each review with NFKD, then drop every character that cannot be
# encoded as ASCII and decode the remaining bytes back to str.
# NOTE(review): if preprocess() has already been applied to this column, its
# cleaning regex has presumably left only ASCII letters, digits and spaces, so
# this step would be a no-op — confirm the intended order of the two steps.
df_train["review"] = df_train["review"].apply(lambda text: normalize("NFKD", str(text)).encode("ascii", "ignore").decode("utf-8", "ignore"))

### We define x (the reviews) and y (the sentiment labels) and split them into training and test sets

In [None]:
# Features are the cleaned review texts; the target is the sentiment label.
x, y = df_train["review"], df_train["sentiment"]

print(x.shape, y.shape)

# Hold out part of the data for evaluation; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)

## 6. Predicting and Modeling

### Evaluating Function

In [None]:
def evaluate(model, vect):
    """Fit ``model`` on the training split and print its test-split metrics.

    Relies on the notebook-level ``x_train``, ``x_test``, ``y_train`` and
    ``y_test``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    vect : fitted vectorizer
        Object exposing ``transform``; it converts the raw training and test
        texts into the feature matrices the model is trained and scored on.

    Returns
    -------
    None
        Accuracy, F1 score and the confusion matrix are printed.
    """
    # Build the features from the vectorizer that was passed in, so the
    # reported metrics always belong to that vectorizer rather than to
    # whatever matrices happen to exist as globals.
    train_features = vect.transform(x_train)
    test_features = vect.transform(x_test)

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("F1 Score: ", metrics.f1_score(y_test, y_pred))
    print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))

### Vectorizing and transforming x_train and x_test

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range = (1,2)); min_df = 2
# is passed so that terms seen in fewer than two documents are left out.
tfidf_vect = TfidfVectorizer(ngram_range = (1,2), min_df = 2)

# Fit the vectorizer on the training texts only, then reuse it unchanged for
# the held-out texts.
x_train_dtm = tfidf_vect.fit_transform(x_train)
x_test_dtm = tfidf_vect.transform(x_test)

### Defining the Models and Evaluating

In [None]:
MNB = MultinomialNB()
LSVC = LinearSVC()
LR = LogisticRegression(C = 2, max_iter = 1000)
# NOTE(review): GNB is created but not evaluated below — presumably because
# GaussianNB does not accept the sparse TF-IDF matrix; confirm.
GNB = GaussianNB()
BNB = BernoulliNB()
KNC = KNeighborsClassifier()
# Bound to SVM rather than SVC so the imported SVC class is not overwritten by
# its own instance; with the old name, re-running this cell failed because
# an SVC instance is not callable.
SVM = SVC()
DTC = DecisionTreeClassifier()
RFC = RandomForestClassifier()
GBC = GradientBoostingClassifier()

# Evaluate every classifier on the same TF-IDF features, in a fixed order.
# The headings are printed exactly as before.
for heading, classifier in (
    ("Multinomial Classfier: \n", MNB),
    ("Linear SVC Classfier: \n", LSVC),
    ("Logistic Regression Classfier: \n", LR),
    ("Bernoulli Classfier: \n", BNB),
    ("K Neighbors Classfier: \n", KNC),
    ("SVC Classfier: \n", SVM),
    ("Decision Tree Classfier: \n", DTC),
    ("Random Forest Classfier: \n", RFC),
    ("Gradient Boosting Classfier: \n", GBC),
):
    print(heading)
    evaluate(classifier, tfidf_vect)
    print("\n")

### We achieved an accuracy as high as 90% with the Linear SVC classifier.

### Creating Wordcloud

In [None]:
def create_wordcloud(text):
    """Build a word cloud from ``text``, save it as wordcloud.png and show it.

    Parameters
    ----------
    text : str or iterable of str
        Either one string or a collection of documents (for example a pandas
        Series of reviews); a collection is joined into one corpus.

    Returns
    -------
    None
        The image is written to "wordcloud.png" in the current directory and
        displayed with the notebook's ``display``.
    """
    # str() of a pandas Series is its truncated printed form (index labels,
    # "..." and the dtype footer), not the documents themselves — so join the
    # individual documents instead of stringifying the container.
    if isinstance(text, str):
        corpus = text
    else:
        corpus = " ".join(str(document) for document in text)

    path = "wordcloud.png"
    wc = WordCloud(max_words = 25000, stopwords = set(STOPWORDS))
    wc.generate(corpus)
    wc.to_file(path)
    print("Word Cloud saved successfully")
    display(Image.open(path))
In [None]:
# Word cloud of the reviews labeled 1 (positive).
create_wordcloud(df_train[df_train["sentiment"] == 1].review) # positive

In [None]:
# Word cloud of the reviews labeled 0 (negative). create_wordcloud always
# writes to "wordcloud.png", so this overwrites the previously saved image.
create_wordcloud(df_train[df_train["sentiment"] == 0].review) # negative

## 7. Creating Pipeline

In [None]:
# Chain the vectorizer and the classifier so raw text can be classified in a
# single call. The pipeline holds the same tfidf_vect and LSVC objects that
# were created earlier, and fitting it fits those objects again.
model = Pipeline([("vectorizer", tfidf_vect), ("classifier", LSVC)])
model.fit(x_train, y_train)
pred = model.predict(x_test)
# confusion_matrix takes (y_true, y_pred). The arguments used to be swapped,
# which transposed the matrix relative to the one printed by evaluate().
confusion_matrix(y_test, pred)

In [None]:
def predict(sentence):
    """Print the predicted sentiment ("negative" or "positive") of the input.

    Parameters
    ----------
    sentence : str or iterable of str
        One sentence, or a collection of sentences; one label is printed per
        sentence, in order.

    Returns
    -------
    None
    """
    sentences = [sentence] if isinstance(sentence, str) else list(sentence)
    if not sentences:
        return

    # The model was trained on text cleaned by preprocess(..., stem = True),
    # so new text has to go through the same cleaning before it is vectorized.
    cleaned = [preprocess(raw, stem = True) for raw in sentences]

    # Iterate over the predictions instead of comparing the whole array to 0,
    # which raised an error as soon as more than one sentence was passed.
    for label in model.predict(cleaned):
        if label == 0:
            print("negative")
        else:
            print("positive")

In [None]:
# Try the model on a short sentence; predict() receives it as a one-item list.
ex = ["It is so nice"]
predict(ex)

In [None]:
# Try the model on a second short sentence, again as a one-item list.
ex = ["This is so bad"]
predict(ex)

### Saving the best Model

In [None]:
# Serialize the fitted Linear SVC to the file "model_pickle".
# Only the classifier object is written here, not the vectorizer.
with open("model_pickle", "wb") as model_file:
    pickle.dump(LSVC, model_file)

### Loading the Model


In [None]:
# Restore the classifier from "model_pickle".
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (here: the file written by this notebook).
with open("model_pickle", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Check that the restored classifier works: predict on the already-vectorized
# test split.
loaded_model.predict(x_test_dtm)

#### As you can see, we can save the model, load it back, and use it for prediction.