In [31]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
# Load the labelled review dataset into a DataFrame.
# NOTE(review): hard-coded absolute path; the CSV must exist at exactly this location.
tweets=pd.read_csv('/content/imdb_10K_sentimnets_reviews.csv')

In [3]:
# Preview the first rows to confirm the review text and sentiment label columns loaded.
tweets.head()

Unnamed: 0,review,sentiment
0,"Okay, I know this does'nt project India in a g...",1
1,Despite John Travolta's statements in intervie...,0
2,"I am a kung fu fan, but not a Woo fan. I have ...",1
3,He seems to be a control freak. I have heard h...,0
4,"Admittedly, there are some scenes in this movi...",1


In [9]:
# Class balance check: (rows with sentiment == 1, rows with sentiment == 0).
len(tweets[tweets['sentiment']==1]),len(tweets[tweets['sentiment']==0])

(5037, 4963)

In [10]:
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import string 
import re
     

In [11]:
import nltk
# Fetch the NLTK stop-word corpus that process_tweet reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
#process tweets
def process_tweet(tweet):
    """Clean, tokenize, filter and stem one piece of text.

    Steps, in order:
      1. remove stock-ticker style tokens (``$GE``), a leading ``RT`` marker,
         hyperlinks, and the ``#`` sign of hashtags (the tag word is kept);
      2. tokenize with ``TweetTokenizer`` (lower-cased, @handles stripped,
         long character repetitions shortened);
      3. drop English stop words and punctuation tokens;
      4. Porter-stem every remaining token.

    Args:
        tweet: raw text as a ``str``.

    Returns:
        list of str: the stemmed tokens in their original order.
    """
    stemmer=PorterStemmer()
    # A set gives constant-time membership tests in the filter below.
    stopwords_english = set(stopwords.words('english')) #stopwords english

    #removing all hashtags ,hyperlinks
    # '$' must be escaped: unescaped it is the end-of-string anchor, so the
    # previous pattern matched only an empty string and removed nothing.
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # Remove only the URL itself. A greedy '.*' after the scheme would also
    # delete every word that follows the link on the same line.
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'#', '', tweet)

    tokenizer=TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
    tweet_tokens=tokenizer.tokenize(tweet) #tokenize

    #removing stopwords, removing punctuation and then stemming the word
    # NOTE: `word not in string.punctuation` is a substring test against the
    # punctuation string, so it drops single punctuation marks and also any
    # multi-character run that appears contiguously in it (e.g. '()').
    tweets_clean=[
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM


In [14]:

# Build the vocabulary: every distinct stemmed token, in first-seen order.
# De-duplication goes through dict keys (hashed, insertion-ordered) because
# testing `word not in <list>` rescans the whole list for every token, which
# is quadratic in the vocabulary size.
vocab=list(dict.fromkeys(
    word
    for tweet in tweets.review
    for word in process_tweet(tweet)
))


In [15]:
# Report how many distinct stemmed tokens were collected.
print("length of vocab:",len(vocab))

length of vocab: 48970


In [16]:

# Tokenized and stemmed form of every review, in DataFrame row order.
processed_tweet=[process_tweet(raw_review) for raw_review in tweets.review]

In [17]:
# Attach the token lists as a new column next to the raw text, then display the frame.
tweets['p_tweet']=processed_tweet
tweets

Unnamed: 0,review,sentiment,p_tweet
0,"Okay, I know this does'nt project India in a g...",1,"[okay, know, does'nt, project, india, good, li..."
1,Despite John Travolta's statements in intervie...,0,"[despit, john, travolta', statement, interview..."
2,"I am a kung fu fan, but not a Woo fan. I have ...",1,"[kung, fu, fan, woo, fan, interest, gangster, ..."
3,He seems to be a control freak. I have heard h...,0,"[seem, control, freak, heard, comment, lose, c..."
4,"Admittedly, there are some scenes in this movi...",1,"[admittedli, scene, movi, seem, littl, unreali..."
...,...,...,...
9995,"A masterpiece.<br /><br />Thus it is, possibly...",1,"[masterpiec, br, br, thu, possibl, everyon, br..."
9996,Great movie about a great man. Thomas Kretschm...,1,"[great, movi, great, man, thoma, kretschmann, ..."
9997,"Before we start, may I say I hope you've alrea...",0,"[start, may, say, hope, alreadi, eaten, read, ..."
9998,I was so disappointed by this show. After hear...,0,"[disappoint, show, hear, read, hoopla, ground,..."


In [18]:
# Map each stemmed token to an integer index and encode every review as an index sequence.
# NOTE(review): per the Keras docs, num_words=N keeps only indices below N and index 0 is
# reserved, so with N=len(vocab) the least frequent token is presumably dropped — confirm.
# Changing N requires changing the Embedding input_dim to match.
tokenizer = Tokenizer(num_words=len(vocab), split=' ') 
tokenizer.fit_on_texts(tweets['p_tweet'].values)
X = tokenizer.texts_to_sequences(tweets['p_tweet'])
# No maxlen is given, so every row is padded to the longest sequence
# (1185 according to the output of the following cell).
X=pad_sequences(X)


In [37]:
# Padded sequence length; the model's input_length and later maxlen values must match it.
len(X[0])

1185

In [32]:
# Labels as a NumPy array (built from a plain Python list of the sentiment column).
Y=np.array(tweets['sentiment'].tolist())

In [38]:
# Model hyper-parameters: embedding vector size and number of LSTM units.
embeding_dim=128
lstm_out=196
# Architecture: Embedding -> LSTM -> one sigmoid unit (binary sentiment score).
model = Sequential()
# input_length is read from the padded training matrix rather than a
# hard-coded 1185, so the model stays in sync if the data or preprocessing
# changes the padded width.
model.add(Embedding(input_dim=len(vocab),output_dim=embeding_dim,input_length = X.shape[1]))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))



In [39]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as the metric.
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [33]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the samples; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.15, random_state = 42)

In [40]:
# Train for a single epoch.
# NOTE(review): the held-out split is passed as validation data, so no separate
# untouched test set remains for a final evaluation.
model.fit(X_train, y_train,validation_data = (X_test,y_test),epochs = 1, batch_size=32)



<keras.callbacks.History at 0x7f4000e2dc40>

In [41]:
# Persist the trained model to a .h5 file in the current working directory.
model.save('movie_sentiment_model.h5')

In [43]:
# Use the first review of the dataset (labelled sentiment 1) as a sanity-check sample.
test_tweet=tweets['review'][0]

In [76]:
# Show the raw text of the sample review.
test_tweet

"Okay, I know this does'nt project India in a good light. But the overall theme of the movie is not India, it's Shakti. The power of a warlord, and the power of a mother. The relationship between Nandini and her husband and son swallow you up in their warmth. Then things go terribly wrong. The interaction between Nandini and her father in law - the power of their dysfunctional relationship - and the lives changed by it are the strengths of this movie. Shah Rukh Khan's performance seems to be a mere cameo compared to the believable desperation of Karisma Kapoor. It is easy to get caught up in the love, violence and redemption of lives in this film, and find yourself heaving a sigh of relief and sadness at the climax. The musical interludes are strengths, believable and well done."

In [77]:
# Run the sample review through the same preprocessing used for the training data.
# NOTE: this rebinds `processed_tweet`, which earlier held the token lists of all reviews.
processed_tweet=process_tweet(test_tweet)
processed_tweet

['okay',
 'know',
 "does'nt",
 'project',
 'india',
 'good',
 'light',
 'overal',
 'theme',
 'movi',
 'india',
 'shakti',
 'power',
 'warlord',
 'power',
 'mother',
 'relationship',
 'nandini',
 'husband',
 'son',
 'swallow',
 'warmth',
 'thing',
 'go',
 'terribl',
 'wrong',
 'interact',
 'nandini',
 'father',
 'law',
 'power',
 'dysfunct',
 'relationship',
 'live',
 'chang',
 'strength',
 'movi',
 'shah',
 'rukh',
 "khan'",
 'perform',
 'seem',
 'mere',
 'cameo',
 'compar',
 'believ',
 'desper',
 'karisma',
 'kapoor',
 'easi',
 'get',
 'caught',
 'love',
 'violenc',
 'redempt',
 'live',
 'film',
 'find',
 'heav',
 'sigh',
 'relief',
 'sad',
 'climax',
 'music',
 'interlud',
 'strength',
 'believ',
 'well',
 'done']

In [80]:
# Encode the tokens as integer indices.
# NOTE(review): a flat list of token strings is passed, so each token is treated as its
# own text; the result holds one inner list per token, as the output below shows.
Pro_seq_tweet=tokenizer.texts_to_sequences(processed_tweet)
Pro_seq_tweet

[[788],
 [38],
 [11297],
 [791],
 [2177],
 [7],
 [381],
 [354],
 [452],
 [2],
 [2177],
 [8938],
 [318],
 [19270],
 [318],
 [377],
 [431],
 [16020],
 [573],
 [403],
 [3769],
 [3918],
 [35],
 [30],
 [247],
 [289],
 [1703],
 [16020],
 [305],
 [1137],
 [318],
 [3615],
 [431],
 [78],
 [273],
 [1434],
 [2],
 [6871],
 [5846],
 [9552],
 [60],
 [40],
 [987],
 [1460],
 [523],
 [83],
 [967],
 [19271],
 [5011],
 [754],
 [10],
 [964],
 [28],
 [556],
 [2614],
 [78],
 [3],
 [59],
 [16021],
 [5012],
 [2154],
 [522],
 [1239],
 [75],
 [7521],
 [1434],
 [83],
 [20],
 [155]]

In [81]:
# Collapse the one-index-per-token lists into a single flat index sequence.
input_review=[token_indices[0] for token_indices in Pro_seq_tweet]
input_review

[788,
 38,
 11297,
 791,
 2177,
 7,
 381,
 354,
 452,
 2,
 2177,
 8938,
 318,
 19270,
 318,
 377,
 431,
 16020,
 573,
 403,
 3769,
 3918,
 35,
 30,
 247,
 289,
 1703,
 16020,
 305,
 1137,
 318,
 3615,
 431,
 78,
 273,
 1434,
 2,
 6871,
 5846,
 9552,
 60,
 40,
 987,
 1460,
 523,
 83,
 967,
 19271,
 5011,
 754,
 10,
 964,
 28,
 556,
 2614,
 78,
 3,
 59,
 16021,
 5012,
 2154,
 522,
 1239,
 75,
 7521,
 1434,
 83,
 20,
 155]

In [83]:
# Pad to length 1185 (the input_length the model was built with), as a batch of one.
input_review1=pad_sequences([input_review],maxlen=1185)

In [84]:
# Score of the sigmoid output unit for the sample review (its dataset label is 1).
model.predict(input_review1)



array([[0.8866404]], dtype=float32)

In [92]:
def review_list(review):
  """Flatten per-token index lists into one flat index sequence.

  Args:
    review: iterable of index lists, such as the result of calling
      ``tokenizer.texts_to_sequences`` on a list of token strings (one inner
      list per token).

  Returns:
    list: every index from every inner list, in order. An empty inner list
    (a token the tokenizer does not know) contributes nothing instead of
    raising IndexError, and an inner list with several indices keeps all of
    them instead of only the first.
  """
  return [index for token_indices in review for index in token_indices]

def pos_neg(pred):
  """Translate a prediction score into a human-readable label.

  Scores strictly greater than 0.5 give "positive review"; every other score
  (including exactly 0.5) gives "negative review".
  """
  return "positive review" if pred>0.5 else "negative review"

In [97]:
# End-to-end inference on a hand-written positive review.
review="The Movie was good, i loved it!!"
pro_review=process_tweet(review)
# Encode the token list as ONE document, the same way the training matrix X
# was built. Passing the bare token list treats every token as its own text;
# a token unknown to the tokenizer then produces an empty list, which makes
# the per-token `[0]` lookup in review_list raise IndexError.
pro_seq_review=tokenizer.texts_to_sequences([pro_review])
input_review=pro_seq_review[0]
# maxlen must equal the input_length the model was built with.
pad_pro_seq=pad_sequences(pro_seq_review,maxlen=1185)


pred=model.predict(pad_pro_seq)
print("Prediction score",pred[0][0])

print()
print()
print("Review: ",review)
print("Predicted review: ",pos_neg(pred[0][0]))


Prediction score 0.78559446


Review:  The Movie was good, i loved it!!
Predicted review:  positive review


In [96]:
# End-to-end inference on a hand-written negative review.
review="The Movie was bad, it was a waste of time!!"
pro_review=process_tweet(review)
# Encode the token list as ONE document, the same way the training matrix X
# was built. Passing the bare token list treats every token as its own text;
# a token unknown to the tokenizer then produces an empty list, which makes
# the per-token `[0]` lookup in review_list raise IndexError.
pro_seq_review=tokenizer.texts_to_sequences([pro_review])
input_review=pro_seq_review[0]
# maxlen must equal the input_length the model was built with.
pad_pro_seq=pad_sequences(pro_seq_review,maxlen=1185)


pred=model.predict(pad_pro_seq)
print("Prediction score",pred[0][0])

print()
print()
print("Review: ",review)
print("Predicted review: ",pos_neg(pred[0][0]))


Prediction score 0.042357087


Review:  The Movie was bad, it was a waste of time!!
Predicted review:  negative review


Model predictions for the first 6 test reviews

In [63]:
# Raw sigmoid scores for the first six rows of the held-out split.
model.predict(X_test[0:6])



array([[0.9643904 ],
       [0.01140774],
       [0.257417  ],
       [0.87569344],
       [0.02030325],
       [0.07362368]], dtype=float32)