# Sentiment Analysis - IMDB Movie Reviews

In [9]:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 11 2019

@author: Daniel Mayo
"""

import numpy as np    # increases efficiency of matrix operations
import pandas as pd   # reads in data files of mixed data types
import re             # regular expressions to find/replace strings
import nltk           # natural language toolkit
from nltk.corpus import stopwords   # get list of stopwords to filter
                                    # out non-sentiment filler words
from sklearn.model_selection import train_test_split

# English stopwords are kept in a set so the membership test made for every
# token during cleaning is a fast hash lookup.
stop_words = set(stopwords.words("english"))

# Read the training data stored in "trainingDataXXXX.txt". The data files are
# tab delimited with the column names on the first row; quoting=3 leaves quote
# characters inside the reviews as ordinary text.
# note: a separate "testData.txt" in the same format is not read here.
df = pd.read_csv(
    "C:/Users/mrahman1s/Documents/MML 5320/Sentiment Analysis with Random Split/trainingData5000.txt",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [10]:
df

Unnamed: 0,id,sentiment,review
0,5814_8,1,"""With all this stuff going down at the moment ..."
1,2381_9,1,"""\The Classic War of the Worlds\"""" by Timothy ..."
2,7759_3,0,"""The film starts with a manager (Nicholas Bell..."
3,3630_4,0,"""It must be assumed that those who praised thi..."
4,9495_8,1,"""Superbly trashy and wondrously unpretentious ..."
5,8196_8,1,"""I dont know why people think this is such a b..."
6,7166_2,0,"""This movie could have been very good, but com..."
7,10633_1,0,"""I watched this video at a friend's house. I'm..."
8,319_1,0,"""A friend of mine bought this film for £1, and..."
9,8713_10,1,"""<br /><br />This movie is full of references...."


In [11]:
# Hold out 20% of the rows for evaluation: the review text is the input (X)
# and the sentiment label is the target (y). No random_state is passed, so
# the split is different on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df["review"],
    df["sentiment"],
    test_size=0.2,
)

   
""" clean_my_text(): cleans the data with several replacements/deletions,
    tokenizes the text, and removes stopwords
    input: string data
    output: cleaned string data ready for sentiment analysis
"""
def clean_my_text(text):
    """Clean one raw review and return it as a space-joined string of lemmas.

    HTML tags and all non-letter characters are replaced by spaces, the text
    is lower-cased, isolated "s" characters are dropped, the text is
    tokenized, stopwords are removed and the remaining tokens are lemmatized.

    input: string data
    output: cleaned string data ready for sentiment analysis
    """
    # Replace tags with a space (not with nothing) so the words on either
    # side of a tag such as "<br />" are not fused into a single token.
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)  # strip out all non-alpha chars
    text = text.strip().lower()             # convert all text to lowercase
    text = re.sub(" s ", " ", text)         # remove isolated s chars that
                                            # result from cleaning possessives

    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)

    # WordNetLemmatizer consolidates different forms of a word.
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Drop stopwords, lemmatize what is left, and join into one string.
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )



""" clean_my_data() calls clean_my_text for each line of text in a dataset
    category  
    input: data file containing raw text  
    output: data file containing cleaned text entries
"""
def clean_my_data(dataList):
    """Clean every text entry of dataList in place using clean_my_text().

    A progress line is printed after every 50th entry.

    input: indexable collection of raw text entries
    output: the same dataList object, each entry replaced by its cleaned text
    """
    print("Cleaning all of the data")
    for position, rawEntry in enumerate(dataList):
        # Overwrite the raw entry with its cleaned version.
        dataList[position] = clean_my_text(rawEntry)
        processed = position + 1
        if processed % 50 == 0:
            print("Cleaning review number", processed, "out of", len(dataList))
    print("Finished cleaning all of the data\n")
    return dataList


# Training phase: convert the training reviews to a plain list and clean them.
print("Operating on training data...\n")
reviews = X_train.tolist()
# clean_my_data() overwrites the entries of the list it is given and returns
# that same list, so cleanReviewData and reviews are the same cleaned list.
cleanReviewData = clean_my_data(reviews)            # cleans the training data
""" create_bag_of_words() generates the bag of words used to evaluate sentiment
    input: cleaned dataset
    output: the fitted vectorizer, the tf-idf weighted sparse matrix, and
            the fitted tf-idf transformer
"""
def create_bag_of_words(X):
    """Build tf-idf weighted unigram/bigram features from cleaned text.

    input: cleaned dataset (iterable of strings)
    output: tuple (vectorizer, tfidf_features, tfidf) holding the fitted
        CountVectorizer, the tf-idf weighted sparse matrix for X, and the
        fitted TfidfTransformer. The two fitted objects are returned so the
        same vocabulary and idf weights can be applied to other data.
    """
    # use scikit-learn for vectorization
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    print ('Generating bag of words...')

    # Generates vectorization for ngrams of up to 2 words in length. This
    # greatly increases feature size, but gives more accurate sentiment
    # analysis since some word combinations have a large impact on
    # sentiment, ie: ("not good", "very fast"). max_features caps the
    # vocabulary at 10000 ngrams.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 ngram_range=(1, 2),
                                 max_features=10000)

    # Keep the counts sparse: TfidfTransformer accepts the sparse matrix
    # directly, so expanding it to a dense rows x 10000 array first would
    # only cost memory and produce the same result.
    term_counts = vectorizer.fit_transform(X)

    # Use tf-idf to weight features: ngrams that occur in many reviews are
    # weighted down relative to ngrams that occur in few.
    tfidf = TfidfTransformer()
    tfidf_features = tfidf.fit_transform(term_counts)
    return vectorizer, tfidf_features, tfidf



# Fit the vectorizer and the tf-idf weighting on the cleaned training reviews.
# tfidf_features stores the sparse matrix of the tf-idf weighted features;
# vectorizer and tfidf are the fitted objects that produced it.
vectorizer, tfidf_features, tfidf = create_bag_of_words(cleanReviewData)


""" train_logistic_regression() uses logistic regression model to
    evaluate sentiment
    options: C sets how strong regularization will be: large C = small amount
             of regularization
    input: tf-idf matrix and the sentiment attached to the training example
    output: the trained logistic regression model
"""
def train_logistic_regression(features, label):
    """Fit a logistic regression classifier used to evaluate sentiment.

    The model uses the liblinear solver with C = 100 (C is the inverse of the
    regularization strength, so a large C means little regularization) and a
    fixed random_state of 0.

    input: feature matrix and the sentiment attached to each training example
    output: the trained logistic regression model
    """
    print ("Training the logistic regression model...")
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(solver='liblinear', random_state=0, C=100)
    classifier.fit(features, label)
    print ('Finished training the model\n')
    return classifier


# Train on the tf-idf features of the training reviews and their sentiment
# labels; ml_model holds the trained model.
ml_model = train_logistic_regression(tfidf_features, y_train)
    
print("Operating on test data...\n")
# NOTE: despite its name, sentiments holds the test review texts; the test
# labels are in y_test.
sentiments = X_test.tolist()
# cleans the test data for accuracy evaluation
cleanTestData = clean_my_data(sentiments)

# vectorizes the test data using the vocabulary fitted on the training data
test_data_features = vectorizer.transform(cleanTestData)
test_data_features = test_data_features.toarray()

# tf-idf of test data ngrams. transform() is used instead of fit_transform()
# so the idf weights learned from the training data are applied; refitting on
# the test set would weight the features differently from the ones the model
# was trained on.
test_data_tfidf_features = tfidf.transform(test_data_features)
test_data_tfidf_features = test_data_tfidf_features.toarray()

# uses the trained logistic regression model to assign sentiment to each
# test data example
predicted_y = ml_model.predict(test_data_tfidf_features)

# compares the predicted sentiment (predicted_y) vs the actual value stored
# in "sentiment" and reports the percentage that match
correctly_identified_y = predicted_y == y_test
accuracy = np.mean(correctly_identified_y) * 100
print ('The accuracy of the model in predicting movie review sentiment is %.0f%%' %accuracy)

Operating on training data...

Cleaning all of the data
Cleaning review number 50 out of 4000
Cleaning review number 100 out of 4000
Cleaning review number 150 out of 4000
Cleaning review number 200 out of 4000
Cleaning review number 250 out of 4000
Cleaning review number 300 out of 4000
Cleaning review number 350 out of 4000
Cleaning review number 400 out of 4000
Cleaning review number 450 out of 4000
Cleaning review number 500 out of 4000
Cleaning review number 550 out of 4000
Cleaning review number 600 out of 4000
Cleaning review number 650 out of 4000
Cleaning review number 700 out of 4000
Cleaning review number 750 out of 4000
Cleaning review number 800 out of 4000
Cleaning review number 850 out of 4000
Cleaning review number 900 out of 4000
Cleaning review number 950 out of 4000
Cleaning review number 1000 out of 4000
Cleaning review number 1050 out of 4000
Cleaning review number 1100 out of 4000
Cleaning review number 1150 out of 4000
Cleaning review number 1200 out of 4000
Clea