In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm

import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.optimizers import adam

Using TensorFlow backend.


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
root_path = 'gdrive/My Drive/Train.csv'

In [None]:
data=pd.read_csv(root_path)

In [None]:
data.head()
text = data['text'].values

In [None]:
data.head()

In [None]:
word_list ={}
for i in tqdm(text):
  for j in i.split():
    if j in word_list:
      word_list[j] +=1
    else:
      word_list[j]=0


word_index={}
number=4
for i in tqdm(text):
  for j in i.split():
    if  word_list[j] > 100:
      if j not in word_index:
        word_index[j]= number
        number+=1

print()
print("Number of unique words in the dictionary : ",len(word_index))

In [None]:
# The first indices are reserved
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def encode_review(text):
  sent=[word_index["<START>"]]
  for i in text.split():
    if i in word_index:
      sent.append(word_index[i])
    else:
      sent.append(word_index['<UNUSED>'])
  return sent

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])


In [None]:
encode_review('I love this movie')

In [None]:
decode_review([1,13,595,104,48])

In [None]:
train_x=[]
for i in tqdm(text):
  train_x.append(encode_review(i))

In [None]:
# Preparing the data for training

train_data = keras.preprocessing.sequence.pad_sequences(train_x,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=256)


In [None]:
# Creating the model

vocab_size = 10000

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())

model.add(Dropout(0.2))
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(32,activation='relu'))
model.add(Dropout(0.2))

model.add(keras.layers.Dense(16,activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.summary()


In [None]:
# Compiling the model

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])


In [None]:
train_y = data['label'].values

In [None]:
history = model.fit(train_data,
                    train_y,
                    epochs=50,
                    validation_split=0.2,
                    batch_size=512,
                    verbose=1)


In [None]:
# Now we need to load our dataset and try to predict the sentiment
# That is to predict the sentiment of the game reviews

In [None]:
print(encode_review('hello there'))

decode_review(encode_review("hello there"))

In [None]:
emb = keras.preprocessing.sequence.pad_sequences([encode_review('Though it was bad we enjoyed the movie and we liked it')],
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=256)


model.predict(emb)

In [None]:
def classify(text):
  emb = keras.preprocessing.sequence.pad_sequences([encode_review(text)],
                                                  value=word_index['<PAD>'],
                                                  padding='post',
                                                  maxlen=256)
  
  pred = model.predict(emb)
  
  #print("prediction obtained : ", pred)
  
  if (pred*100) >50.0 :
    #print("Positive review")
    return "positive"
  else:
    #print("Negative review")
    return "negative"
 

In [None]:
classify("I dont like this game")

In [None]:
classify('Though it was bad we enjoyed the movie and we liked it')

In [None]:

# load the review of the dataset and make the prediction as show above

In [None]:
history_dict = history.history


acc = history_dict['acc']
val_acc = history_dict['val_acc']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()


In [None]:
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()


In [None]:
#reading the steam dataset
path='gdrive/My Drive/steam_reviews.csv'
data = pd.read_csv(path)
data.head()

In [None]:
from tqdm import tqdm

In [None]:
reviews = data['review'].values
helpful = data['helpful'].values

In [None]:
reviews[0]

In [None]:
labeled_sentiment = []
idx =[]
for index , i in tqdm(enumerate(reviews[:10000])):
  try :
    labeled_sentiment.append(classify(i))
    idx.append(index)
  except:
    pass

In [None]:
final_reviews=[]
final_helpful=[]
for i in tqdm(range(len(reviews[:10000]))):
  if i in idx:
    final_reviews.append(reviews[i])
    final_helpful.append(helpful[i])

In [None]:
new_data = pd.DataFrame(data=list(zip(final_helpful,final_reviews,labeled_sentiment)),columns=['helpful','review','sentiment'])

In [None]:
reviews = new_data['review'].values
label = new_data['sentiment'].values

In [None]:
word_dict={}
val=1
for i in tqdm(reviews):
  for j in i.split():
    if j not in word_dict:
      word_dict[j]=val
      val+=1


In [None]:
word_dict['<PAD>']=0
word_dict['<START>']=val
word_dict['<UNK>'] = len(word_dict)

In [None]:
train_data =[]
for i in tqdm(reviews):
  sent = [word_dict['<START>']]
  for j in i.split():
    sent.append(word_dict[j])
  train_data.append(sent)
  

In [None]:
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_dict["<PAD>"],
                                                        padding='post',
                                                        maxlen=256)

In [None]:
train_out =[]
for i in label:
  if i == 'negative':
    train_out.append(0)
  else:
    train_out.append(1)

The neural_network() method is the base architechture that we are going to use to train our model 

In [None]:
def neural_network(drop=0.2):
  model = Sequential()

  model.add(keras.layers.Embedding(len(word_dict),16))
  model.add(keras.layers.GlobalAveragePooling1D())

  model.add(keras.layers.Dense(32,activation='relu'))
  model.add(keras.layers.Dense(64,activation='relu'))
  model.add(keras.layers.Dropout(drop))

  model.add(keras.layers.Dense(1,activation='sigmoid'))

  return model

# **Harris Hawks Optimization Algorithm**

In [None]:
class HHO():
  
  def __init__(self,N,T,nfeatures,pop=[]):
    self.N=N
    self.T=T
    self.n_features = nfeatures
    if len(pop)==0:
      self.population = np.random.random((self.N,self.n_features))
    else:
      self.population=pop[:]
    self.best_fitness =0
    
    
  def optimize(self):
    
    self.UB=len(self.population)-1
    self.LB =0
    i=0
    for i in tqdm(range(self.T)):
      
      if i==0:
        print()
      fits = []
      
      for j in range(len(self.population)):
          if i !=0:
            indi_drop = self.sigmoid(self.population[j][0])
          else:
            indi_drop = self.population[j][0]
          model =neural_network(indi_drop)
          model.compile(optimizer='adam',loss='mse',metrics=['acc'])
          history = model.fit(train_data,train_out,epochs=1,batch_size=8)
          acc =history.history['acc'][0]
          fits.append(acc)
          best_fitness = max(fits)
          if acc >=best_fitness:
            index = j
            self.best_fitness = best_fitness

      
      if i!=0:
        self.rabbit = self.population[index][0]
      else:
        self.rabbit = self.sigmoid(self.population[index][0])
      print()
      print("Dropout selected from Iteration ",i+1," : ",self.rabbit)
      print()
      for t,hawk in enumerate(self.population) :
        
        E=2*np.random.rand() -1
        J = 2*(1 - np.random.rand())
        
        E = self.update_E(E,i+1)
        
        if np.abs(E) >= 1:
          self.exploration(t)
        
        if np.abs(E) < 1:
          
          r=np.random.rand()
          
          if r>=0.5 and np.abs(E)>=0.5 :
            #update vector using eqn(4)
            self.soft_baise(t,E,J)
          elif r>=0.5 and np.abs(E) <0.5:
            #update vector using eqn (6)
            self.hard_baise(t,E)
          elif r<0.5 and np.abs(E) >=0.5:
            #update vector using eqn (10)
            self.soft_baise_dive(t,E,J)
          elif r<0.5 and np.abs(E) <0.5:
            #update vector using eqn (11)
            self.hard_baise_dive(t,E,J)
           
      #print("\t Best fitness value in this iteration = ",best_fitness) 
      #print("best rabbit in this iteration : ",self.rabbit)
    
    return self.rabbit,index

  def update_E(self,E,t):
    E = 2 * E *(1 - t/self.T)
    return E
  
  def soft_baise(self,t,E,J):
    del_x = self.rabbit - self.population[t]
    
    self.population[(t+1)%self.UB+1]= del_x  - E * np.abs( J * self.rabbit - self.population[t] )
  
  def hard_baise(self,t,E):
    del_x = self.rabbit - self.population[t]
    
    self.population[(t+1)%self.UB+1] = self.rabbit - E * np.abs(del_x)
  
  def soft_baise_dive(self,t,E,J):
    Y = self.rabbit - E* np.abs(J * self.rabbit - self.population[t])
    
    s = np.random.randn(1,self.n_features)
    
    u=np.random.random()
    v= np.random.random()
    
    std = np.std(self.population[t])
    beta = 1/1.5
    
    lf= 0.01 * u * std /(np.abs(v)**beta)
    
    Z = Y + s * lf
    
    if self.fitness(Y) < self.fitness(self.population[t]):
      self.population[(t+1)%self.UB+1] = Y
    elif self.fitness(Z) < self.fitness(self.population[t]):
      self.population[(t+1)%self.UB+1] = Z
  
  def hard_baise_dive(self,t,E,J):
    avg=0  
    for i in self.population:
      avg+=i
  
    pop = len(self.population)
    avg=avg/pop
    
    
    Y = self.rabbit - E* np.abs(J*self.rabbit - avg)
    s = np.random.randn(1,self.n_features)
    
    u=np.random.random()
    v= np.random.random()
    
    std = np.std(self.population[t])
    beta = 1/1.5
    
    lf= 0.01 * u * std /(np.abs(v)**beta)
    
    Z = Y + s * lf
    
    if self.fitness(Y) < self.fitness(self.population[t]):
      self.population[(t+1)%self.UB+1] = Y
    elif self.fitness(Z) < self.fitness(self.population[t]):
      self.population[(t+1)%self.UB+1] = Z
  
  def exploration(self,t):
    q = np.random.random()
    r1= np.random.random()
    r2= np.random.random()
    r3= np.random.random()
    r4= np.random.random()
  
    avg=0  
    for i in self.population:
      avg+=i
  
    pop = len(self.population)
    avg=avg/pop
   
    k = np.random.randint(low=0,high=pop-1)
    if q >= 0.5 :
      self.population[(t+1)%pop] = self.population[k] -r1 * (self.population[k] - 2* r2 *self.population[t])
    else :
      self.population[(t+1)%pop] = (self.rabbit - avg) - r3 *( 0 + r4 * (pop-1 - 0))
    
  def fitness(self,hawk):
    sum=np.mean(hawk)
    return sum

  def sigmoid(self,x):
    return 1/(1+np.exp(-x))

In [None]:
def create_population(pop_size):
  pop=[]
  for i in range(pop_size):
    individual=[]
    dropout = np.random.random()
    individual.append(dropout)
    pop.append(individual)
  return np.array(pop,dtype='float32')

In [None]:
print(create_population(5))
pop = create_population(25)
print('length : ',len(pop))

In [None]:
opt = HHO(len(pop),2,1,pop)
optimized_dropout = opt.optimize()

In [None]:
print("Optimized Dropout : ",optimized_dropout)

#**Training Our Main Model** 

In [None]:
import keras

In [None]:
model = neural_network(0)
model.compile(loss='mse',optimizer='adam',metrics=['acc'])
h1 = model.fit(train_data,train_out,epochs=25,validation_split=0.2,batch_size=8)

In [None]:
model = neural_network(2)
model.compile(loss='mse',optimizer='adam',metrics=['acc'])
h2 = model.fit(train_data,train_out,epochs=25,validation_split=0.2,batch_size=8)

### model = neural_network(0.271245)
model.compile(loss='mse',optimizer='adam',metrics=['acc'])
h3 = model.fit(train_data,train_out,epochs=25,validation_split=0.2,batch_size=8)

In [None]:
def encode(text):
  out =[word_dict['<START>']]
  for i in text.split():
    if i in word_dict:
      out.append(word_dict[i])
    else:
      out.append(word_dict['<UNK>'])
      
  return out

In [None]:
encode('this is not good')

In [None]:
def classify(text):
  emb = keras.preprocessing.sequence.pad_sequences([encode(text)],
                                                  value=word_dict['<PAD>'],
                                                  padding='post',
                                                  maxlen=256)
  
  pred = model.predict(emb)
  
  #print("prediction obtained : ", pred)
  
  if (pred*100) >50.0 :
    #print("Positive review")
    return "positive"
  else:
    #print("Negative review")
    return "negative"
 

In [None]:
classify('bad gaME')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
y = list(range(1,26))

print(y)
print(h1.history['acc'])

In [None]:
plt.plot(y,h1.history['val_acc'])
plt.plot(y,h2.history['val_acc'])
plt.plot(y,h3.history['val_acc'])

plt.legend(['No Dropout','Constant Dropout','With HHO'])

In [None]:
plt.boxplot([h1.history['val_acc'],h2.history['val_acc'],h3.history['val_acc']])
plt.xlabel(['No Dropout','Constant Dropout','with HHO dropout'])

In [None]:
print('No Dropout       : ',max(h1.history['val_acc']))
print('Constant Dropout : ',max(h2.history['val_acc']))
print('with HHO dropout : ',max(h3.history['val_acc']))

In [None]:
optimized_dropout