In [43]:
#import required libraries
import pandas as pd
import string
from nltk.corpus import stopwords

In [44]:
df_sentiment_data = pd.read_csv('imdb_labelled.txt',sep='\t',names= ['comment','label'])

In [45]:
df_sentiment_data.head()

Unnamed: 0,comment,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [46]:
#get the info the dataset
df_sentiment_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
comment    748 non-null object
label      748 non-null int64
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


In [47]:
df_sentiment_data.groupby('label').describe()

Unnamed: 0_level_0,comment,comment,comment,comment
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,Definitely worth checking out.,2


In [48]:
#verify the length of the comments and add it as feature to the dataset
df_sentiment_data['length'] = df_sentiment_data['comment'].apply(len)

In [49]:
df_sentiment_data.head()

Unnamed: 0,comment,label,length
0,"A very, very, very slow-moving, aimless movie ...",0,87
1,Not sure who was more lost - the flat characte...,0,99
2,Attempting artiness with black & white and cle...,0,188
3,Very little music or anything to speak of.,0,44
4,The best scene in the movie was when Gerardo i...,1,108


In [50]:
#view the first
df_sentiment_data[df_sentiment_data['length']>50]['comment'].iloc[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [51]:
#start text processing with vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [52]:
#define the function remove the stopwords from the comments
def remove_stopwords(comment):
    no_punctuation = [char for char in comment if char not in string.punctuation]
    #form the sentence
    no_punctuation = ''.join(no_punctuation)
    #eliminate the stopwords
    return[word for word in no_punctuation.split() if word.lower() not in stopwords.words('english')]
    

In [53]:
#bag of words by fitting data into it
bag_of_words = CountVectorizer(analyzer=remove_stopwords).fit(df_sentiment_data['comment'])

In [55]:
#apply transform method for the bag of words
comment_bag_of_words = bag_of_words.transform(df_sentiment_data['comment'])

In [59]:
#apply tfidf transformer and fit the data into it
from sklearn.feature_extraction.text import TfidfTransformer
TfidfTransform = TfidfTransformer().fit(comment_bag_of_words)

In [61]:
#print the shape of tfidf transformer
comment_tfidf_transformer = TfidfTransform.transform(comment_bag_of_words)
print(comment_tfidf_transformer.shape)

(748, 3259)


In [63]:
#apply naive-byes model to detect the spam and fit the data into it
from sklearn.naive_bayes import MultinomialNB 
sentiment_detection_mmodel = MultinomialNB().fit(comment_tfidf_transformer,df_sentiment_data['label'])

In [67]:
#check the model for the predicted and actual values
comment = df_sentiment_data['comment'][4]
bag_of_words_comment_test = bag_of_words.transform([comment])
tfidf = TfidfTransform.transform(bag_of_words_comment_test)

print('Predicted',sentiment_detection_mmodel.predict(tfidf)[0])
print('Actual',df_sentiment_data['label'][4])

Predicted 1
Actual 1
