In [1]:
import pandas as pd
import string
import nltk

In [2]:
from nltk.corpus import stopwords

In [3]:
#Read the SpamCollection file
df = pd.read_csv('imdb_labelled.txt',sep='\t',names=['Comment','Label'])

In [4]:
df

Unnamed: 0,Comment,Label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0


In [5]:
#Display first 5 Records
df.head()

Unnamed: 0,Comment,Label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [6]:
# Group by to find the count of positive  and negative comments. Label - 1 : positive, 0 : Negative
df.groupby('Label').describe() 

Unnamed: 0_level_0,Comment,Comment,Comment,Comment
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,Definitely worth checking out.,2


In [7]:
# Find the length of messages
df['length'] = df['Comment'].apply(len)

In [8]:
df['length']

0       87
1       99
2      188
3       44
4      108
      ... 
743     63
744     92
745     32
746     20
747     75
Name: length, Length: 748, dtype: int64

In [9]:
# STEPS:
# 1. Remove punctuations and stopwords. Split the sentence into words. This process is called tokenization. 
# 2. Apply CountVectorizer and Transform. This converts the words to a integer or float. This process is called as Feature Extraction.
# 3. Apply TF/IDF transform - Term Frequency and Inverse Document Frequency 
# 4. Split data into train and test
# 5. Using Naive Bayes Classification, first train the model with train data.
# 6. Test the model and get the prediction.
# 7. Compare prediction vs Actual and get the confusion matrix.
# 8. Get the accuracy score for the model.

In [10]:
# function to remove punctuations and stopwords
def message_text_process(message):   
    no_punct = [char for char in message if char not in string.punctuation]
    no_punct = ''.join(no_punct)    
    return [word for word in no_punct.split() if word.lower() not in stopwords.words('english')]           
                

In [11]:
# test the above function to see if its working
df['Comment'].head(5).apply(message_text_process)

0    [slowmoving, aimless, movie, distressed, drift...
1    [sure, lost, flat, characters, audience, nearl...
2    [Attempting, artiness, black, white, clever, c...
3                     [little, music, anything, speak]
4    [best, scene, movie, Gerardo, trying, find, so...
Name: Comment, dtype: object

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Apply CountVectorizer - Convert a collection of text documents to a matrix of token counts.
vectorization = CountVectorizer(analyzer = message_text_process )


In [14]:
# TRIAL ---- try out the Count Vectorizer and Transform for only 1 record to see the output.
# try out the count vectorizer for the first record alone
bag_of_words_transformer_try = vectorization.fit(df['Comment'].head(1))
# print out the bag_of_words_transformer
print(bag_of_words_transformer_try.vocabulary_)
# transform the first record alone
comment_try = bag_of_words_transformer_try.transform(df['Comment'].head(1))
print(comment_try)


{'slowmoving': 5, 'aimless': 0, 'movie': 4, 'distressed': 1, 'drifting': 2, 'young': 6, 'man': 3}
  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1


In [15]:
#Apply CountVectorizer to the entire df
bag_of_words_transformer = vectorization.fit(df['Comment'])
print(len(bag_of_words_transformer.vocabulary_))
comment = bag_of_words_transformer.transform(df['Comment'])
print(comment)

3259
  (0, 808)	1
  (0, 1350)	1
  (0, 1377)	1
  (0, 2066)	1
  (0, 2153)	1
  (0, 2723)	1
  (0, 3250)	1
  (1, 889)	1
  (1, 1068)	1
  (1, 1608)	1
  (1, 1741)	1
  (1, 2037)	1
  (1, 2183)	1
  (1, 2862)	1
  (1, 3143)	1
  (2, 87)	1
  (2, 774)	1
  (2, 817)	1
  (2, 837)	1
  (2, 866)	1
  (2, 931)	1
  (2, 956)	1
  (2, 1021)	1
  (2, 1111)	1
  (2, 1334)	1
  :	:
  (743, 389)	1
  (743, 421)	1
  (743, 982)	1
  (743, 1119)	1
  (743, 1712)	1
  (743, 2883)	1
  (743, 3164)	1
  (744, 702)	1
  (744, 1587)	1
  (744, 2037)	1
  (744, 2413)	1
  (744, 2495)	1
  (744, 2610)	1
  (744, 3125)	1
  (744, 3215)	1
  (745, 1418)	1
  (745, 3212)	1
  (746, 254)	1
  (746, 911)	1
  (747, 1822)	1
  (747, 1891)	1
  (747, 1895)	1
  (747, 2135)	1
  (747, 2247)	1
  (747, 3158)	1


In [16]:
# Apply Tf/IDF
#TF/IDF transform - eg
#Consider a document containing 100 words wherein the word cat appears 3 times. 
#The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. 
#Now, assume we have 10 million documents and the word cat appears in one thousand of these. 
#Then, the inverse document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. 
#Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
from sklearn.feature_extraction.text import TfidfTransformer

In [17]:
#try out the tfidf for only 1 record
tfidf_transformer_try = TfidfTransformer().fit(comment_try)
comment_tfidf_try = tfidf_transformer_try.transform(comment_try)
print(comment_tfidf_try.data)


[0.37796447 0.37796447 0.37796447 0.37796447 0.37796447 0.37796447
 0.37796447]


In [18]:
# Apply Tf/IDF for the whole df
tfidf_transformer = TfidfTransformer().fit(comment)
comment_tfidf = tfidf_transformer.transform(comment)
print(comment_tfidf.shape)
print(comment_tfidf.data)


(748, 3259)
[0.38166373 0.42411082 0.16990858 ... 0.41434428 0.41434428 0.41434428]


In [19]:
# Split data into train(70%) and test(30%)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(comment_tfidf, df['Label'], test_size=0.30, random_state = 50)    


In [20]:
y_train

249    1
303    1
540    0
476    1
467    0
      ..
132    0
289    1
109    0
480    1
688    1
Name: Label, Length: 523, dtype: int64

In [21]:
# Use Naive Bayes to detect sentiment
# Train the model first using train data
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train,y_train)

In [22]:
# Test the model using test data. Make predictions
predictions = spam_detect_model.predict(X_test)

In [23]:
print('predicted', predictions)
print(predictions.shape)

predicted [1 0 1 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 0 0 1 1 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0
 0 1 1 1 0 0 1 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1 0 0 1
 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0
 0 1 1 1 0 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 1 1 0 1 1 1 0 0 0
 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 1 0 1 0 1 0 1 0 1 0 0 1 0
 0 1 0 1 0 1 0 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1
 0 1 0]
(225,)


In [24]:
print('actual', y_test)

actual 537    0
265    0
626    1
550    1
202    1
      ..
38     1
339    0
653    0
145    1
216    1
Name: Label, Length: 225, dtype: int64


In [25]:
# Compare Predicted vs Actual using Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,predictions))

[[74 33]
 [27 91]]


In [26]:
# Calculate accuracy score
from sklearn.metrics import accuracy_score

In [27]:
from sklearn.metrics import accuracy_score
print (accuracy_score(y_test, predictions))

0.7333333333333333


In [28]:
y_test

537    0
265    0
626    1
550    1
202    1
      ..
38     1
339    0
653    0
145    1
216    1
Name: Label, Length: 225, dtype: int64

In [29]:
# Print the comments which are predicted incorrectly.
incorrect = y_test.get(y_test != predictions)
print(incorrect.count())
comment_x_test = df['Comment'].filter(items=incorrect.index)
pd.options.display.max_colwidth = 100
print(comment_x_test)


60
537                                                                                   Nothing new there.  
550          I'm a big fan of this series mostly due to Anne Rice's style, sensitivities and treatments.  
350                                                  The original Body and Soul (1947) is a masterpiece.  
115                  Artless camera-work endlessly presents us with the ugliest setting imaginable, i.e.  
46                                           I love Lane, but I've never seen her in a movie this lousy.  
86                                                                              A truly, truly bad film.  
625                                                     I struggle to find anything bad to say about it.  
423    A mature, subtle script that suggests and occasionally brings into dramatic focus the underlying...
222                                                                                        It just blew.  
16     This review is long overdue

In [30]:
test = pd.DataFrame({'Predicted':predictions,'Actual':y_test})
print('total test \n', test.count())
print('Predicted not equal to Actual:\n', test[test['Predicted'] != test['Actual']].count())
print('Predicted equal to Actual:\n', test[test['Predicted'] == test['Actual']].count())


total test 
 Predicted    225
Actual       225
dtype: int64
Predicted not equal to Actual:
 Predicted    60
Actual       60
dtype: int64
Predicted equal to Actual:
 Predicted    165
Actual       165
dtype: int64
