In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk
from nltk.stem import PorterStemmer

In [None]:
# Read the review table; the file is decoded as ISO-8859-1 rather than UTF-8.
dataset = pd.read_csv(
    '../input/imdb-review-dataset/imdb_master.csv',
    encoding="ISO-8859-1",
)

In [None]:
# Keep only the rows whose label is something other than 'unsup'.
dataset = dataset.loc[dataset['label'] != 'unsup']

In [None]:
# Rows marked type == 'test' form the evaluation split.
test = dataset.loc[dataset['type'] == 'test']

In [None]:
# Rows marked type == 'train' form the training split.
train = dataset.loc[dataset['type'] == 'train']

In [None]:
# (rows, columns) of the training split.
train.shape

In [None]:
# (rows, columns) of the test split.
test.shape

In [None]:
import string
import re
import os
import nltk
# Download every NLTK resource this notebook relies on, not just the stop words:
# word_tokenize() loads the Punkt tokenizer data at call time and raises
# LookupError when it is missing ('punkt' on older NLTK releases, 'punkt_tab'
# on newer ones). nltk.download() skips resources that are already up to date.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stopwords_english = stopwords.words('english')


In [None]:
def process_review(review):
    """Turn a raw review string into a list of cleaned, stemmed tokens.

    Steps: strip `$`-prefixed tokens and `#` characters, lower-case the text,
    tokenize it with NLTK's word_tokenize, drop English stop words and
    punctuation-only tokens, then Porter-stem whatever is left.

    Input:
        review: a string containing one review
    Output:
        clean: a list of stemmed tokens, in their original order
    """
    stemmer = PorterStemmer()
    # Sets give constant-time membership tests inside the token loop.
    stop_set = set(stopwords_english)
    punct_chars = set(string.punctuation)

    # remove `$` followed by word characters (e.g. stock tickers like $GE)
    review = re.sub(r'\$\w*', '', review)
    # remove only the hash sign, keeping the word that follows it
    review = re.sub(r'#', '', review)
    review = review.lower()

    clean = []
    for word in word_tokenize(review):
        if word in stop_set:
            continue
        # Drop tokens made up solely of punctuation characters. Checking each
        # character (instead of `word in string.punctuation`, which is a
        # substring test) also removes multi-character tokens such as '...',
        # '--', '``' and "''" that the tokenizer emits.
        if all(ch in punct_chars for ch in word):
            continue
        clean.append(stemmer.stem(word))
    return clean

In [None]:
# Pull the review text and label columns of each split out as plain Python lists.
train_list = train['review'].tolist()
test_list = test['review'].tolist()
train_label = train['label'].tolist()
test_label = test['label'].tolist()

In [None]:
# Use the first training review as a worked example and display its raw text.
example = train_list[0]
example

In [None]:
# Show the token list process_review returns for the example review.
process_review(example)

In [None]:
from tqdm import tqdm

# First Approach : Freqs dictionary + LogReg

In [None]:
def build_freqs(reviews, ys):
    """Count how often each processed word occurs under each sentiment label.

    Input:
        reviews: a list of review strings
        ys: the sentiment label of each review (either 0 or 1), given as a
            list or as an array of shape (m,) or (m, 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Flatten the labels to a plain 1-D list so they can be zipped with the
    # reviews. np.ravel (unlike np.squeeze) keeps a single-label input as a
    # one-element list instead of collapsing it to a bare scalar, which zip()
    # cannot iterate. For a list input this is effectively a no-op.
    yslist = np.ravel(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all
    # reviews and over all processed words in each review.
    freqs = {}
    # zip() has no length, so pass the total explicitly to get a real progress bar.
    for y, review in tqdm(zip(yslist, reviews), total=len(yslist)):
        for word in process_review(review):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs


In [None]:
# Distinct label values present in the training labels.
set(train_label)

In [None]:
# Encode the string labels as integers: 'pos' -> 1, anything else -> 0.
train_y = [int(label == 'pos') for label in train_label]
test_y = [int(label == 'pos') for label in test_label]

In [None]:
# Build the (word, label) -> count dictionary from the training reviews.
freqs = build_freqs(train_list, train_y)

# Report the container type and the number of distinct (word, label) pairs.
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

In [None]:
def extract_features(review, freqs):
    """Summarise one review as two frequency totals.

    Input:
        review: a string containing one review
        freqs: a dictionary mapping (word, label) pairs to counts
    Output:
        a float array of shape (1, 2): element [0, 0] is the sum of
        freqs[(token, 1)] over the review's processed tokens and element
        [0, 1] is the same sum for label 0; pairs missing from freqs
        contribute 0
    """
    tokens = process_review(review)

    # One row, two columns: column 0 collects label-1 counts, column 1 label-0 counts.
    features = np.zeros((1, 2))
    for column, label in ((0, 1), (1, 0)):
        for token in tokens:
            features[0, column] += freqs.get((token, label), 0)

    assert features.shape == (1, 2)
    return features

In [None]:
# Sanity check of extract_features

# Build the feature row for the first training review using the training
# frequency dictionary, then print it (a 1 x 2 array, per the function's assert).
tmp1 = extract_features(train_list[0], freqs)
print(tmp1)

In [None]:
# Feature matrix for the training reviews: one two-column row per review.
train_X = np.zeros((len(train_list), 2))
for row, review_text in enumerate(tqdm(train_list)):
    train_X[row] = extract_features(review_text, freqs)
    

In [None]:
# Feature matrix for the test reviews, built with the training frequency dictionary.
test_X = np.zeros((len(test_list), 2))
for row, review_text in enumerate(tqdm(test_list)):
    test_X[row] = extract_features(review_text, freqs)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The two features are raw frequency totals on a large, uneven scale, and SGD is
# sensitive to feature scaling, so standardise them inside the pipeline (the
# scaler is fitted on the training data only and reused at predict time).
# random_state pins the data shuffling so the score is reproducible across runs.
# NOTE: SGDClassifier's default loss is 'hinge' (a linear SVM), not the logistic
# regression the section heading mentions; pass a logistic loss if that is intended.
clf = make_pipeline(StandardScaler(), SGDClassifier(random_state=0))
clf.fit(train_X,train_y)


In [None]:
# Predict a label for every row of the test feature matrix.
pred = clf.predict(test_X)

In [None]:
# Accuracy: fraction of test reviews whose predicted label equals the true label.
np.mean(pred == np.asarray(test_y))

# Second Approach : TF-IDF + LogReg

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer  
# TF-IDF vectoriser: at most 2000 terms, ignoring terms that appear in fewer
# than 5 documents or in more than 70% of them, and dropping NLTK's English
# stop words.
tfidfconverter = TfidfVectorizer(
    max_features=2000,
    min_df=5,
    max_df=0.7,
    stop_words=stopwords.words('english'),
)
# Learn the vocabulary and IDF weights from the training reviews and return
# their TF-IDF matrix as a dense array.
train_tfidf = tfidfconverter.fit_transform(train_list)
train_X = train_tfidf.toarray()

In [None]:
# Transform the test reviews with the already-fitted vectoriser (no refit) and
# convert the result to a dense array.
test_X = tfidfconverter.transform(test_list).toarray()

In [None]:
from sklearn.linear_model import SGDClassifier
# random_state pins the data shuffling done during SGD training, so the fitted
# model and the reported score are reproducible across runs.
# NOTE: the default loss is 'hinge' (a linear SVM), not the logistic regression
# the section heading mentions; pass a logistic loss if that is intended.
clf = SGDClassifier(random_state=0)
clf.fit(train_X,train_y)

In [None]:
# Predict a label for every row of the test TF-IDF matrix.
pred = clf.predict(test_X)

In [None]:
# Accuracy: fraction of test reviews whose predicted label equals the true label.
np.mean(pred == np.asarray(test_y))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels (first argument) against predictions (second);
# in scikit-learn's convention rows are true classes and columns predicted classes.
confusion_matrix(test_y,pred)