<a href="https://colab.research.google.com/github/shivu117/semantic/blob/main/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/gdrive/',force_remount=True)

Mounted at /content/gdrive/


In [3]:
path = '/content/gdrive/My Drive/sentiment/'

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from gensim.models import Word2Vec

In [5]:
data = pd.read_csv(path+'airline_sentiment_analysis.csv')

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [7]:
X = data['text'].values

In [8]:
data['airline_sentiment'].values [ data['airline_sentiment']=='positive'] = 1

In [9]:
data['airline_sentiment'].values [ data['airline_sentiment']=='negative'] = 0

In [10]:
Y = data['airline_sentiment'].values

In [11]:
Y  = np.array(Y,dtype='int32')

In [12]:
Y.shape

(11541,)

In [13]:
Y.dtype

dtype('int32')

In [14]:
X.shape

(11541,)

In [15]:
X[0]

"@VirginAmerica plus you've added commercials to the experience... tacky."

In [16]:
def word_to_num(dat):
  vocab = []
  for i in range(len(dat)):
    dat[i] = '<BOS> '+ dat[i] +' <EOS>'
    vocab = list(set( vocab + list(set(dat[i].split())) ) )  
  vocab+= ' '    
  temp1 = []  
  for i in dat:
    temp = []
    for j in i.split():
      temp.append( vocab.index(j))
    temp1.append(temp)

  return np.array(temp1),np.array(vocab)    

In [17]:
X_data,vocab = word_to_num(X)

  


In [18]:
X_data.shape

(11541,)

In [19]:
X_data[0],vocab[:10]

([16419, 2720, 11628, 6974, 5034, 14068, 12692, 8457, 8795, 1870, 23791],
 array(['Doubt', '49min', 'affiliated.', 'screws', 'MSY.',
        'will...sunday!', '"sincere"', 'soooo', 'issue?', '#lost'],
       dtype='<U53'))

In [20]:
X[100]

"<BOS> .@VirginAmerica I don't understand why you need a DM to give me an answer on if you have a damaged luggage policy. <EOS>"

In [21]:
vocab[X_data[100]]

array(['<BOS>', '.@VirginAmerica', 'I', "don't", 'understand', 'why',
       'you', 'need', 'a', 'DM', 'to', 'give', 'me', 'an', 'answer', 'on',
       'if', 'you', 'have', 'a', 'damaged', 'luggage', 'policy.', '<EOS>'],
      dtype='<U53')

In [22]:
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_data,padding='post',truncating='post',value=np.where(vocab==' ') )

In [23]:
X_train.shape

(11541, 38)

In [24]:
vocab[X_train[0]]

array(['<BOS>', '@VirginAmerica', 'plus', "you've", 'added',
       'commercials', 'to', 'the', 'experience...', 'tacky.', '<EOS>',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' '], dtype='<U53')

In [25]:
def create_embed(vocabs):
     word2vec = Word2Vec(vocabs,size=300)
     embeddings = np.random.randn(len(vocabs),300)
     for i in range(len(vocabs)):
       if vocabs[i] in word2vec.wv.vocab:
         embeddings[i] = word2vec.wv.word_vec(vocabs[i])
     return embeddings    


In [26]:
embeddings = create_embed(vocab)

In [27]:
embeddings.shape

(25317, 300)

In [28]:
def stack_LSTM_model(dt1,embeddings=embeddings,batch_size=16):
  layer2_1 = tf.keras.layers.Embedding(embeddings.shape[0],embeddings.shape[1],weights=[embeddings],batch_input_shape=[batch_size,None],trainable =False)

  layer2 = tf.keras.layers.LSTM(512, return_sequences=True, recurrent_initializer='glorot_uniform',recurrent_activation='sigmoid',stateful=True)
  layer3 = tf.keras.layers.LSTM(512, return_sequences=True, recurrent_initializer='glorot_uniform',recurrent_activation='sigmoid',stateful=True)
  layer4 = tf.keras.layers.Dense(150,activation = 'sigmoid')
  layer5 = tf.keras.layers.Dense(1,activation = 'sigmoid')
  
  layer1 = tf.keras.Input(shape=(None,),batch_size=batch_size)

  out1 = layer2_1(layer1)
  out1 = layer2(out1)
  out2 = layer3(out1)

  out3 = tf.keras.layers.GlobalMaxPooling1D()(out2)

  out3 = layer4(out3)

  out4 = layer5(out3)
  return tf.keras.models.Model(inputs=layer1,outputs=out4)




In [29]:
model = stack_LSTM_model(X_train[:150])

In [30]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(16, None)]              0         
_________________________________________________________________
embedding (Embedding)        (16, None, 300)           7595100   
_________________________________________________________________
lstm (LSTM)                  (16, None, 512)           1665024   
_________________________________________________________________
lstm_1 (LSTM)                (16, None, 512)           2099200   
_________________________________________________________________
global_max_pooling1d (Global (16, 512)                 0         
_________________________________________________________________
dense (Dense)                (16, 150)                 76950     
_________________________________________________________________
dense_1 (Dense)              (16, 1)                   151   

In [31]:
model.compile('adam','binary_crossentropy','accuracy')

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
 X_train, X_test, y_train, y_test = train_test_split(X_train,Y, test_size=0.20, random_state=42)

In [34]:
 X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9232, 38), (2309, 38), (9232,), (2309,))

In [34]:
np.save(path+'X_train.npy',X_train)

In [35]:
np.save(path+'X_test.npy',X_test)

In [36]:
np.save(path+'Y_test.npy',y_test)

In [37]:
np.save(path+'Y_train.npy',y_train)

In [35]:
hist = model.fit(x=X_train,y=y_train,batch_size=16,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
model.evaluate(X_test[:2304],y_test[:2304],batch_size=16)



[0.5406525135040283, 0.8932291865348816]

In [37]:
model.save_weights(path+'Stack_LSTM_model.h5')

In [38]:
def LSTM_model(dt1,embeddings=embeddings,batch_size=16):
  layer2_1 = tf.keras.layers.Embedding(embeddings.shape[0],embeddings.shape[1],weights=[embeddings],batch_input_shape=[batch_size,None],trainable =False)

  layer2 = tf.keras.layers.LSTM(512, return_sequences=True, recurrent_initializer='glorot_uniform',recurrent_activation='sigmoid',stateful=True)
  
  layer4 = tf.keras.layers.Dense(150,activation = 'sigmoid')
  layer5 = tf.keras.layers.Dense(1,activation = 'sigmoid')
  
  layer1 = tf.keras.Input(shape=(None,),batch_size=batch_size)

  out1 = layer2_1(layer1)
  out2 = layer2(out1)

  out3 = tf.keras.layers.GlobalMaxPooling1D()(out2)

  out3 = layer4(out3)

  out4 = layer5(out3)
  return tf.keras.models.Model(inputs=layer1,outputs=out4)




In [39]:
model = LSTM_model(X_train[:150])

In [40]:
model.compile('adam','binary_crossentropy','accuracy')

In [41]:
hist = model.fit(x=X_train,y=y_train,batch_size=16,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
model.evaluate(X_test[:2304],y_test[:2304],batch_size=16)



[0.5209348201751709, 0.8910590410232544]

In [43]:
model.save_weights(path+'LSTM_model.h5')