Import necessary Libraries

In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import html
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/Rohit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Rohit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.metrics.scorer import make_scorer
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

%matplotlib inline

Read the Training/Test Data File

In [31]:
file_path = "/Users/Rohit/Desktop/imdb"
train_file_name = "train.csv"
test_file_name  = "test.csv"
 
train_data = pd.read_csv(file_path+'/'+train_file_name, encoding='latin-1')
print(train_data.shape)
print(train_data.head())
print("")

test_data = pd.read_csv(file_path+'/'+test_file_name, encoding='latin-1')
print(test_data.shape)
print(test_data.head())

(25000, 3)
   row_Number                                               text  polarity
0        2148  first think another Disney movie, might good, ...         1
1       23577  Put aside Dr. House repeat missed, Desperate H...         0
2        1319  big fan Stephen King's work, film made even gr...         1
3       13358  watched horrid thing TV. Needless say one movi...         0
4        9495  truly enjoyed film. acting terrific plot. Jeff...         1

(25000, 2)
   row_number                                               text
0           0  Oh gosh!! I love movie sooooooooooooooooooooo ...
1           1  I saw Borderline several years ago AMC. I've l...
2           2  Let say GRANNY extremely well made horror viol...
3           3  I like Full Moon Pictures I ordered movie USA,...
4           4  Worst horror film ever funniest film ever roll...


Analyze the text data and clean the data as required

In [32]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))
 
train_data['word_count'] = train_data['text'].apply(lambda x: len(str(x).split(" ")))   ## Word count
train_data['char_count'] = train_data['text'].str.len()                                ## characters count, includes spaces
train_data['avg_word']   = train_data['text'].apply(lambda x: avg_word(x)) ## Average Words

#Count the number of Stop Words
train_data['stopwords']  = train_data['text'].apply(lambda x: len([x for x in x.split() if x in stop]))

#numeric data
train_data['numerics'] = train_data['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train_data.head()

Unnamed: 0,row_Number,text,polarity,word_count,char_count,avg_word,stopwords,numerics
0,2148,"first think another Disney movie, might good, ...",1,52,314,5.057692,1,2
1,23577,"Put aside Dr. House repeat missed, Desperate H...",0,86,565,5.581395,2,4
2,1319,"big fan Stephen King's work, film made even gr...",1,193,1268,5.57513,3,1
3,13358,watched horrid thing TV. Needless say one movi...,0,63,414,5.587302,1,0
4,9495,truly enjoyed film. acting terrific plot. Jeff...,1,65,477,6.353846,2,0


Start Pre-processing of the text data

In [33]:
#Define a Functio to clean the data, remove junk characters, and stem data
def cleanse_data(x, freq_com, freq_rare):
    re1 = re.compile(r'  +')
    # Stemming Data
    '''Removal of suffices, like “ing”, “ly”, “s”, etc. by a simple rule-based approach. For this purpose,
       we will use SnowballStemmer from the NLTK library.
    '''

    # stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    x = ' '.join([lemmatizer.lemmatize(word) for word in str(x).split(' ')])
    
    # Remove the junk characters
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    x = re1.sub(' ', html.unescape(x))

    #Lower Case the Word
    ''' Transform our reviews into lower case. This avoids having multiple copies of the same words.
        For example, while calculating the word count, ‘Analytics’ and ‘analytics’ will be taken as different words.
    '''
    
    x = " ".join(x.lower() for x in x.split())   

    # Removing Punctuation
    ''' Remove punctuation, as it doesn’t add any extra information while treating text data. Therefore
        removing all instances of it will help us reduce the size of the training data.
    '''
    x = x.replace('[^\w\s]','')

    # Removal of Stop Words
    ''' Stop words (or commonly occurring words) should be removed from the text data. For this purpose,
        we can either create a list of stopwords ourselves or we can use predefined libraries.
    '''
    x = " ".join(x for x in x.split() if x not in stop)

    #Common word removal
    ''' Remove commonly occurring words from our text data First, let’s check the 10 most frequently occurring
        words in our text data then take call to remove or retain.
    '''
    x = " ".join(x for x in x.split() if x not in freq_com)

    #Rare words removal
    ''' Remove rarely occurring words from the text. Because they’re so rare, the association between them and
        other words is dominated by noise.
    '''
    x = " ".join(x for x in x.split() if x not in freq_rare)   

    return x

Common Word Removal

In [34]:
#Common word removal
''' Remove commonly occurring words from our text data First, let’s check the 10 most frequently occurring
    words in our text data then take call to remove or retain.
'''
freq_com = pd.Series(' '.join(train_data['text']).split()).value_counts()[:10]
freq_com = list(freq_com.index)

#Rare words removal
''' Remove rarely occurring words from the text. Because they’re so rare, the association between them and
    other words is dominated by noise.
'''
freq_rare = pd.Series(' '.join(train_data['text']).split()).value_counts()[-10:]
freq_rare = list(freq_rare.index)
    
#Clean the data remove junk characters, stem data
train_data['text'] = train_data['text'].apply(lambda x: cleanse_data(x, freq_com, freq_rare))
train_data.head()

Unnamed: 0,row_Number,text,polarity,word_count,char_count,avg_word,stopwords,numerics
0,2148,"first think another disney movie, might good, ...",1,52,314,5.057692,1,2
1,23577,"put aside dr. house repeat missed, desperate h...",0,86,565,5.581395,2,4
2,1319,"big fan stephen king's work, made greater fan ...",1,193,1268,5.57513,3,1
3,13358,watched horrid thing tv. needless say watch se...,0,63,414,5.587302,1,0
4,9495,truly enjoyed film. acting terrific plot. jeff...,1,65,477,6.353846,2,0


Start Data Modelling and Prediction

In [35]:
X_train, X_test, y_train, y_test = train_test_split(train_data['text'], train_data['polarity'], test_size=0.40, random_state=42 )
log_loss_scorer = make_scorer(log_loss)

In [48]:
from sklearn.feature_selection import chi2, SelectKBest 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

In [57]:
%%time

# set a reasonable number of features before adding interactions
chi_k = 300

# create the pipeline object
pipeline = Pipeline([('vect', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                non_negative=True,
                                                norm=None,
                                                binary=False,
                                                ngram_range=(1, 2)
                                               )),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

pipeline.fit(train_data['text'], train_data['polarity'])

CPU times: user 6.15 s, sys: 263 ms, total: 6.41 s
Wall time: 6.53 s


In [58]:
pipeline.score(X_test, y_test)

0.9857

Predict on the Test Data

In [59]:
test_data['text'] = test_data['text'].apply(lambda x: cleanse_data(x, freq_com, freq_rare))
test_data['labels'] = pipeline.predict_proba(test_data['text'])[::,1]
test_data['labels_predict'] = pipeline.predict(test_data['text'])
test_data.head(30)

Unnamed: 0,row_number,text,labels,labels_predict
0,0,oh gosh!! love sooooooooooooooooooooo much!!!!...,0.703538,1
1,1,saw borderline several year ago amc. i've look...,0.620915,1
2,2,let say granny extremely well made horror viol...,0.542915,1
3,3,"full moon picture ordered usa, germany can't g...",0.353424,0
4,4,worst horror ever funniest ever rolled got see...,0.395889,0
5,5,first saw teen last year junior high. riveted ...,0.677679,1
6,6,old jess franco! always-reliable choice direct...,0.352146,0
7,7,dogtown z-boyssummary: dogtown z-boys document...,0.682829,1
8,8,"rigoletto verdi's masterpiece, full drama, emo...",0.440873,0
9,9,high expectation dawn. know keep buying slashe...,0.31352,0


Generate the Output CSV File

In [60]:
test_data[['row_number', 'labels', 'labels_predict']].to_csv(file_path+"/sample_sub.csv", index=False)