# IMDB movie reviews sentimental analysis
## Import data

In [1]:
import pandas as pd
train = pd.read_csv("/Users/jaesuk/Documents/Springboard/IMDB/labeledTrainData.tsv", sep="\t")
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [2]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


## Exploratory data analysis


In [3]:
# A graph to see how big each sentiment group is
import seaborn as sns
dist = train.groupby(["sentiment"]).size()
sns.barplot(dist.keys(), dist.values)
import matplotlib.pyplot as plt
plt.show()

<Figure size 640x480 with 1 Axes>

In [4]:
## Preprocessing

In [5]:
#Remove non-ASCII characters from list of tokenized words
def remove_non_ascii(words):
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

# Convert all characters to lowercase from list of tokenized words
def to_lowercase(words):
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

# Remove punctuation from list of tokenized words
def remove_punctuation(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

# Remove all interger occurrences in list of tokenized words with textual representation
def remove_numbers(words):
    new_words = []
    for word in words:
        new_word = re.sub("\d+", "", word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

# Remove stop words from list of tokenized words
from nltk.corpus import stopwords
english_stop_words = stopwords.words('english')
def remove_stopwords(words):
    new_words = []
    for word in words:
        if word not in english_stop_words:
            new_words.append(word)
    return new_words

# Stem words in list of tokenized words
def stem_words(words):
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

# Lemmatize verbs in list of tokenized words
def lemmatize_verbs(words):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = remove_numbers(words)
    words = remove_stopwords(words)
    return words

In [6]:
# tokenizing phrases. then, remove ascii, change to lower case, and remove punctuation and numbers
import nltk
import unicodedata
import re
#nltk.download('punkt')
train['Words'] = train['review'].apply(nltk.word_tokenize)
train['Words'] = train['Words'].apply(normalize)

In [7]:
train.head(5)

Unnamed: 0,id,sentiment,review,Words
0,5814_8,1,With all this stuff going down at the moment w...,"[stuff, going, moment, mj, started, listening,..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","[classic, war, worlds, timothy, hines, enterta..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,"[film, starts, manager, nicholas, bell, giving..."
3,3630_4,0,It must be assumed that those who praised this...,"[must, assumed, praised, film, greatest, filme..."
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, wondrously, unpretentious, ..."


In [8]:
train['new_review'] = 'a'
train.head()

Unnamed: 0,id,sentiment,review,Words,new_review
0,5814_8,1,With all this stuff going down at the moment w...,"[stuff, going, moment, mj, started, listening,...",a
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","[classic, war, worlds, timothy, hines, enterta...",a
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,"[film, starts, manager, nicholas, bell, giving...",a
3,3630_4,0,It must be assumed that those who praised this...,"[must, assumed, praised, film, greatest, filme...",a
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, wondrously, unpretentious, ...",a


In [9]:
for i in range(0,len(train)):
    train.loc[i,'new_review'] =' '.join(train.loc[i,'Words'])

In [10]:
train.head()

Unnamed: 0,id,sentiment,review,Words,new_review
0,5814_8,1,With all this stuff going down at the moment w...,"[stuff, going, moment, mj, started, listening,...",stuff going moment mj started listening music ...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","[classic, war, worlds, timothy, hines, enterta...",classic war worlds timothy hines entertaining ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,"[film, starts, manager, nicholas, bell, giving...",film starts manager nicholas bell giving welco...
3,3630_4,0,It must be assumed that those who praised this...,"[must, assumed, praised, film, greatest, filme...",must assumed praised film greatest filmed oper...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, wondrously, unpretentious, ...",superbly trashy wondrously unpretentious explo...


## Feature engineering

In [11]:
# Build bag of words features
from sklearn.feature_extraction.text import CountVectorizer
#import nltk
vectorizer = CountVectorizer(min_df=100, max_df=0.98, tokenizer=nltk.word_tokenize)
vector = vectorizer.fit_transform(train['new_review'])

In [12]:
# Build TFIDF features 
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(vector)

In [13]:
vectorizer.get_feature_names()

['abandoned',
 'abilities',
 'ability',
 'able',
 'absence',
 'absolute',
 'absolutely',
 'absurd',
 'abuse',
 'academy',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'accepted',
 'accident',
 'accidentally',
 'accomplished',
 'according',
 'account',
 'accurate',
 'accused',
 'achieve',
 'achieved',
 'achievement',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'actions',
 'actor',
 'actors',
 'actress',
 'actresses',
 'acts',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'adapted',
 'add',
 'added',
 'adding',
 'addition',
 'adds',
 'adequate',
 'admire',
 'admit',
 'admittedly',
 'adult',
 'adults',
 'advance',
 'advantage',
 'adventure',
 'adventures',
 'advice',
 'affair',
 'affected',
 'aforementioned',
 'afraid',
 'africa',
 'african',
 'afternoon',
 'afterwards',
 'age',
 'aged',
 'agent',
 'ages',
 'ago',
 'agree',
 'ah',
 'ahead',
 'ai',
 'aimed',
 'air',
 'aired',
 'aka',
 'al',
 'alan',
 'alas',
 'albeit',
 'albert',
 'alex',
 'alice',
 'alien',
 'aliens

In [14]:
tfidf.shape

(25000, 3603)

## Machine learning models

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [17]:
# Logistic regression
X_train1, X_test1, y_train1, y_test1 = train_test_split(tfidf,train['sentiment'], test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train1, y_train1)

y_pred1 = logreg.predict(X_test1)
print(confusion_matrix(y_test1,y_pred1)) 
print(classification_report(y_test1,y_pred1))

[[3311  478]
 [ 409 3302]]
             precision    recall  f1-score   support

          0       0.89      0.87      0.88      3789
          1       0.87      0.89      0.88      3711

avg / total       0.88      0.88      0.88      7500



In [19]:
#SVC
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
X_train2, X_test2, y_train2, y_test2 = train_test_split(tfidf,train['sentiment'], test_size=0.3, random_state=0)
svclassifier = SVC(kernel='linear')  
svclassifier.fit(X_train2, y_train2)  
y_pred2 = svclassifier.predict(X_test2)  
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test2,y_pred2))  
print(classification_report(y_test2,y_pred2))

[[3299  490]
 [ 435 3276]]
             precision    recall  f1-score   support

          0       0.88      0.87      0.88      3789
          1       0.87      0.88      0.88      3711

avg / total       0.88      0.88      0.88      7500



In [20]:
# Naive bayes
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
X_train3, X_test3, y_train3, y_test3 = train_test_split(tfidf,train['sentiment'], test_size=0.3, random_state=0)
clf = MultinomialNB().fit(X_train3, y_train3)
y_pred3 = clf.predict(X_test3)
sklearn.metrics.accuracy_score(y_test3, y_pred3)
print(confusion_matrix(y_test3,y_pred3))  
print(classification_report(y_test3,y_pred3))

[[3216  573]
 [ 506 3205]]
             precision    recall  f1-score   support

          0       0.86      0.85      0.86      3789
          1       0.85      0.86      0.86      3711

avg / total       0.86      0.86      0.86      7500



### I will choose logistic regression model to predict movie review. It has the highest change of predicting true positives. 