## Importing the Dataset

In [275]:
import numpy as np
import re
import pickle 
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')


# Importing the dataset: load_files reads the text files under the given
# folder and returns an object exposing .data and .target.
# NOTE(review): hard-coded absolute path — adjust it to your machine.
reviews = load_files('/Users/praga/Downloads/txt_sentoken/')
# X: raw review documents, y: integer labels (the cells below treat
# 0 as a negative review and 1 as a positive review).
X,y = reviews.data,reviews.target

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\praga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [276]:
# Peek at the first raw review document
X[0]

"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so called dark thriller , the devil ( gabriel byrne ) has come upon earth , to impregnate a woman ( robin tunney ) which happens every 1000 years , and basically destroy the world , but apparently god has chosen one man , and that one man is jericho cane ( arnold himself ) . \nwith the help of a trusty sidekick ( kevin pollack ) , they will stop at nothing to let the devil take over the world ! \nparts of this are actually so absurd , that they would fit right in with dogma . \nyes , the film is t

In [277]:
y[0] # label 0 indicates a negative review

0

In [278]:
# Peek at the second raw review document
X[1]

"good films are hard to find these days . \ngreat films are beyond rare . \nproof of life , russell crowe's one-two punch of a deft kidnap and rescue thriller , is one of those rare gems . \na taut drama laced with strong and subtle acting , an intelligent script , and masterful directing , together it delivers something virtually unheard of in the film industry these days , genuine motivation in a story that rings true . \nconsider the strange coincidence of russell crowe's character in proof of life making the moves on a distraught wife played by meg ryan's character in the film -- all while the real russell crowe was hitching up with married woman meg ryan in the outside world . \ni haven't seen this much chemistry between actors since mcqueen and mcgraw teamed up in peckinpah's masterpiece , the getaway . \nbut enough with the gossip , let's get to the review . \nthe film revolves around the kidnapping of peter bowman ( david morse ) , an american engineer working in south america 

In [279]:
y[1] # label 1 indicates a positive review

1

In [280]:
len(X) # number of reviews; expected 2000 (1000 positive + 1000 negative)

2000

In [281]:
# One label per review, so this should equal len(X)
len(y)

2000

## Pickling and Unpickling

In [282]:
# Persist the raw documents and their labels as pickle files, one file each.
for pickle_path, payload in (('/Users/praga/Downloads/X.pickle', X),
                             ('/Users/praga/Downloads/y.pickle', y)):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(payload, out_file)



In [283]:
# Unpickling dataset
# Read from the same location the pickles were written to: a bare
# 'X.pickle' only resolves when the working directory happens to be the
# Downloads folder. Context managers make sure both files are closed.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
with open('/Users/praga/Downloads/X.pickle','rb') as X_in:
    X = pickle.load(X_in)

with open('/Users/praga/Downloads/y.pickle','rb') as y_in:
    y = pickle.load(y_in)

## Preprocessing the Dataset

In [284]:
def _clean_review(raw_text):
    """Return a lower-cased, whitespace-normalised copy of one review."""
    text = re.sub(r'\W', ' ', str(raw_text))   # every non-word character (punctuation, quotes, newlines) -> space
    text = text.lower()                        # lower case
    text = re.sub(r'\s+[a-z]\s+', ' ', text)   # drop single letters surrounded by whitespace
    text = re.sub(r'^[a-z]\s+', ' ', text)     # drop a single letter at the very start of the text
    return re.sub(r'\s+', ' ', text)           # collapse runs of whitespace into one space


# One cleaned string per raw document, in the same order as X.
corpus = [_clean_review(document) for document in X]

In [285]:
# First review after cleaning: punctuation and single letters are gone
corpus[0]

'arnold schwarzenegger has been an icon for action enthusiasts since the late 80 but lately his films have been very sloppy and the one liners are getting worse it hard seeing arnold as mr freeze in batman and robin especially when he says tons of ice jokes but hey he got 15 million what it matter to him once again arnold has signed to do another expensive blockbuster that can compare with the likes of the terminator series true lies and even eraser in this so called dark thriller the devil gabriel byrne has come upon earth to impregnate woman robin tunney which happens every 1000 years and basically destroy the world but apparently god has chosen one man and that one man is jericho cane arnold himself with the help of trusty sidekick kevin pollack they will stop at nothing to let the devil take over the world parts of this are actually so absurd that they would fit right in with dogma yes the film is that weak but it better than the other blockbuster right now sleepy hollow but it mak

## Bag of Words Model

In [286]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vocabulary settings:
#   max_features = 3000 -> keep only the 3000 most frequent terms as features
#   min_df = 3          -> ignore terms that occur in fewer than 3 documents
#   max_df = 0.6        -> ignore terms that occur in more than 60% of the documents
#   stop_words          -> drop NLTK's English stop words
english_stop_words = stopwords.words('english')
vectorizer = CountVectorizer(stop_words = english_stop_words,
                             max_df = 0.6,
                             min_df = 3,
                             max_features = 3000)


In [287]:
# Learn the vocabulary from the cleaned corpus and build the dense
# document-term count matrix.
# NOTE: this rebinds X from the raw documents to the count matrix, so the
# preprocessing cell above cannot be re-run afterwards without reloading X.
X = vectorizer.fit_transform(corpus).toarray()

In [288]:
# Show the count vectors of the first ten reviews
print(X[0:10])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]


In [289]:
# Number of features per review; matches max_features (3000)
len(X[0])

3000

## TFIDF model

In [290]:
from sklearn.feature_extraction.text import TfidfTransformer
# Transformer that re-weights raw term counts into TF-IDF scores (default settings)
transformer = TfidfTransformer()

In [291]:
# Convert the bag-of-words counts into TF-IDF features. fit_transform learns
# the IDF weights from these training counts and applies them in one step.
# NOTE: X is rebound again (counts -> TF-IDF), so this cell is not
# idempotent: running it twice would apply TF-IDF to TF-IDF values.
X = transformer.fit_transform(X).toarray()

In [292]:
# Show the TF-IDF vectors of the first ten reviews
X[0:10]

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.06443538, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [293]:
# Distinct TF-IDF weights occurring in the first review's feature vector
set(X[0][:])

{0.0,
 0.0221003171712028,
 0.022541925402130597,
 0.022800619827195213,
 0.022918597728650427,
 0.02323811445090185,
 0.023647960848482634,
 0.02552831385152568,
 0.025897364868670673,
 0.027902966540636388,
 0.028515768852630626,
 0.028792471555060976,
 0.029446082364716,
 0.029487955943464546,
 0.029870516935955247,
 0.030241603372033386,
 0.030330354517940042,
 0.030851798374917812,
 0.030874912852950395,
 0.03113170280495772,
 0.031226251806465904,
 0.031249988373347325,
 0.031417271788367285,
 0.0316842191169229,
 0.03210697065569742,
 0.03238742296241644,
 0.03416910723275282,
 0.034826960427759596,
 0.03498084838339925,
 0.035230613942288345,
 0.036447015641086836,
 0.037259660354127276,
 0.037590842363297225,
 0.03831662160197629,
 0.038395236767330754,
 0.03920645927910752,
 0.0395017755867273,
 0.039803404474263684,
 0.039890781754574466,
 0.041762579094563566,
 0.04211728250793498,
 0.04216869016290847,
 0.0422720707401824,
 0.042963161807521265,
 0.04340590826295008,
 0.04

In [294]:
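# Alternative (disabled): build the TF-IDF matrix in one step with TfidfVectorizer
# instead of CountVectorizer + TfidfTransformer. Note that max_features is 2000 here but 3000 above.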
#from sklearn.feature_extraction.text import TfidfVectorizer
#vectorizer = TfidfVectorizer(max_features = 2000, min_df = 3, max_df = 0.6, stop_words = stopwords.words('english'))
#X = vectorizer.fit_transform(corpus).toarray()

## Creating Training and Testing Set

In [295]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing; random_state fixes the shuffle so
# the split is reproducible. text_* are TF-IDF feature rows, sent_* are the
# matching sentiment labels.
text_train, text_test, sent_train, sent_test = train_test_split(X, y, test_size = 0.20 , random_state = 1)

## Training the Classifier

### Logistic Regression

In [296]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default hyper-parameters, trained on the
# TF-IDF features of the training split.
classifier = LogisticRegression()
classifier.fit(text_train,sent_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Prediction

In [297]:
# Predicted labels (0 = negative, 1 = positive) for the held-out reviews
sent_pred = classifier.predict(text_test)

### Model Performance

In [298]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels vs. predictions: rows are the true
# classes, columns the predicted classes, so the diagonal holds the
# correctly classified reviews.
cm = confusion_matrix(sent_test, sent_pred)
cm

array([[169,  22],
       [ 30, 179]], dtype=int64)

In [299]:
# Accuracy = correctly classified reviews (diagonal of the confusion matrix)
# divided by all test reviews. Dividing by cm.sum() instead of a hard-coded 4
# keeps the percentage right for any test-set size, and the print() call with
# a single formatted string works on both Python 2 and Python 3.
accuracy_pct = 100.0 * float(cm[0][0] + cm[1][1]) / float(cm.sum())
print("The accuracy is {} %.".format(accuracy_pct))

The accuracy is 87.0 %.


### Testing the model

In [300]:
def analyse_sentiment(sentence):
    """Print whether ``sentence`` is classified as a good or a bad review.

    The sentence is converted to features with the already-fitted
    ``vectorizer`` (bag of words) and ``transformer`` (TF-IDF), then passed
    to the trained ``classifier``.

    Parameters
    ----------
    sentence : str
        Free text to classify.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    sample = [sentence]
    sample = vectorizer.transform(sample).toarray()
    # Use transform(), not fit_transform(): re-fitting here would replace the
    # IDF weights learned from the training corpus with weights computed from
    # this single sentence, so the features would no longer match what the
    # classifier was trained on, and the shared transformer would stay
    # overwritten for every later use.
    sample = transformer.transform(sample).toarray()
    sentiment = classifier.predict(sample)
    if sentiment[0]==0:
        print("It's a BAD review")
    elif sentiment[0]==1 :
        print("It's a GOOD review")

In [301]:
# Sanity check with a clearly positive sentence
analyse_sentiment("Christian Bale acted really well in batman movie")

It's a GOOD review


In [302]:
# Positive sentence that is not about a movie
analyse_sentiment("The drummer played well")

It's a GOOD review


In [303]:
# Clearly negative sentence
analyse_sentiment("your song was bad")

It's a BAD review


In [304]:
# Negative sentence that also contains the positive word "nice"
analyse_sentiment("You are not a nice man, have a bad life")

It's a BAD review


#### Failed Cases

In [305]:
# Negated positive ("not like"): the recorded run labels this GOOD, i.e. a
# misclassification.
analyse_sentiment("I did not like your song")

It's a GOOD review


In [306]:
# Negative sentence that the recorded run labels GOOD
analyse_sentiment("The singer spoiled the song")

It's a GOOD review


In [307]:
# Positive outcome phrased indirectly; the recorded run labels it BAD
analyse_sentiment("Little did he know that he had scored the highest marks")

It's a BAD review


### Saving our Model

In [308]:
# Saving our classifier
with open('/Users/praga/Downloads/classifier.pickle','wb') as f:
    pickle.dump(classifier,f)

# Saving the fitted CountVectorizer (vocabulary and stop-word settings).
# The file name is kept for compatibility, but note that it holds the
# bag-of-words vectorizer, not the TF-IDF weights.
with open('/Users/praga/Downloads/tfidfmodel.pickle','wb') as f:
    pickle.dump(vectorizer,f)

# Saving the fitted TfidfTransformer as well. The classifier was trained on
# TF-IDF features, so a reloaded model needs both the vectorizer and this
# transformer to build matching inputs. It must still hold the IDF weights
# learned from the training corpus: do not call fit/fit_transform on it
# after training.
with open('/Users/praga/Downloads/tfidftransformer.pickle','wb') as f:
    pickle.dump(transformer,f)
    

# Using our classifier
#with open('tfidfmodel.pickle','rb') as f:
    #tfidf = pickle.load(f)
    
#with open('classifier.pickle','rb') as f:
    #clf = pickle.load(f)