In [0]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from tensorflow.keras.initializers import Constant
from tensorflow.keras import optimizers

In [0]:
from google.colab import drive
drive.mount('/gdrive')

#KERAS

In [0]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences

## prepare dataset

In [0]:
df = pd.read_csv("sentiment.csv")
df.head()

In [0]:
def get_word2id(df):
  vocab=set()
  for i in df.iteritems():
    words= i[1].split(' ')
    vocab.update(words)
  word2id= dict()
  id2word= dict()
  for i,word in enumerate(vocab):
    word2id[word]=i+1
    id2word[i+1] =word
  return word2id,id2word,vocab

In [0]:
def convert_data(df,word2id):
  new_dataset=[]
  for r in df.iteritems():
    new_dataset.append([word2id[w] for w in r[1].split(" ")])
  return new_dataset

In [0]:
word2id,id2word,vocab = get_word2id(df['sentence'])
new_dataset = convert_data(df['sentence'],word2id)

In [0]:
vocab_size = len(word2id)

In [0]:
new_dataset = pad_sequences(new_dataset, maxlen=128, dtype='int32', padding='post', truncating='post', value=0.0)


In [0]:
new_dataset = np.concatenate([new_dataset,df['polarity'].values.reshape(-1,1)],axis=1)

### split train , test

In [0]:
train = new_dataset[:int(len(new_dataset)*.8)]
test = new_dataset[int(len(new_dataset)*.8):]

### evaluation

In [0]:
def evaluate(y_true,y_pred):
  return confusion_matrix(y_true, y_pred, labels=[0,1])

In [0]:
from tensorflow import keras
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size,16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16,activation=tf.nn.relu))
model.add(keras.layers.Dense(16,activation=tf.nn.relu))
model.add(keras.layers.Dense(2,activation=tf.nn.softmax))
model.summary()

In [0]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x = train[:,:-1],y=train[:,-1],batch_size=32,epochs=5,validation_split=0.2)

## Sentiment Analysis with LSTM and pre-trained word embeddings (Glove)

### download and prepare embedding

In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2019-12-23 22:42:36--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-12-23 22:42:36--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2019-12-23 22:42:36--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.2’

glove

In [0]:
!unzip glove.6B.zip.1

Archive:  glove.6B.zip.1
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
import csv
import pandas as pd
with open('glove.6B.50d.txt') as f:
    rows, cols = 400000,50

vectorsdf = pd.read_csv(
    'glove.6B.50d.txt', sep=' ',  header=None, index_col=0,
    quoting=csv.QUOTE_NONE, encoding='utf-8')

# remove one junk column
vectorsdf = vectorsdf.dropna(axis=1)
assert vectorsdf.shape == (int(rows), int(cols))
vectorsdf.head()


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
the,0.418,0.24968,-0.41242,0.1217,0.34527,-0.044457,-0.49688,-0.17862,-0.00066,-0.6566,0.27843,-0.14767,-0.55677,0.14658,-0.00951,0.011658,0.10204,-0.12792,-0.8443,-0.12181,-0.016801,-0.33279,-0.1552,-0.23131,-0.19181,-1.8823,-0.76746,0.099051,-0.42125,-0.19526,4.0071,-0.18594,-0.52287,-0.31681,0.000592,0.007445,0.17778,-0.15897,0.012041,-0.054223,-0.29871,-0.15749,-0.34758,-0.045637,-0.44251,0.18785,0.002785,-0.18411,-0.11514,-0.78581
",",0.013441,0.23682,-0.16899,0.40951,0.63812,0.47709,-0.42852,-0.55641,-0.364,-0.23938,0.13001,-0.063734,-0.39575,-0.48162,0.23291,0.090201,-0.13324,0.078639,-0.41634,-0.15428,0.10068,0.48891,0.31226,-0.1252,-0.037512,-1.5179,0.12612,-0.02442,-0.042961,-0.28351,3.5416,-0.11956,-0.014533,-0.1499,0.21864,-0.33412,-0.13872,0.31806,0.70358,0.44858,-0.080262,0.63003,0.32111,-0.46765,0.22786,0.36034,-0.37818,-0.56657,0.044691,0.30392
.,0.15164,0.30177,-0.16763,0.17684,0.31719,0.33973,-0.43478,-0.31086,-0.44999,-0.29486,0.16608,0.11963,-0.41328,-0.42353,0.59868,0.28825,-0.11547,-0.041848,-0.67989,-0.25063,0.18472,0.086876,0.46582,0.015035,0.043474,-1.4671,-0.30384,-0.023441,0.30589,-0.21785,3.746,0.004228,-0.18436,-0.46209,0.098329,-0.11907,0.23919,0.1161,0.41705,0.056763,-6.4e-05,0.068987,0.087939,-0.10285,-0.13931,0.22314,-0.080803,-0.35652,0.016413,0.10216
of,0.70853,0.57088,-0.4716,0.18048,0.54449,0.72603,0.18157,-0.52393,0.10381,-0.17566,0.078852,-0.36216,-0.11829,-0.83336,0.11917,-0.16605,0.061555,-0.012719,-0.56623,0.013616,0.22851,-0.14396,-0.067549,-0.38157,-0.23698,-1.7037,-0.86692,-0.26704,-0.2589,0.1767,3.8676,-0.1613,-0.13273,-0.68881,0.18444,0.005246,-0.33874,-0.078956,0.24185,0.36576,-0.34727,0.28483,0.075693,-0.062178,-0.38988,0.22902,-0.21617,-0.22562,-0.093918,-0.80375
to,0.68047,-0.039263,0.30186,-0.17792,0.42962,0.032246,-0.41376,0.13228,-0.29847,-0.085253,0.17118,0.22419,-0.10046,-0.43653,0.33418,0.67846,0.057204,-0.34448,-0.42785,-0.43275,0.55963,0.10032,0.18677,-0.26854,0.037334,-2.0932,0.22171,-0.39868,0.20912,-0.55725,3.8826,0.47466,-0.95658,-0.37788,0.20869,-0.32752,0.12751,0.088359,0.16351,-0.21634,-0.094375,0.018324,0.21048,-0.03088,-0.19722,0.082279,-0.09434,-0.073297,-0.064699,-0.26044


### remove words which are not in embeddings


In [0]:
cvocab =vocab.copy()
for v in cvocab:
  if v not in vectorsdf.index:
    vocab.remove(v)

### handle UNKOWN words 

In [0]:
word_to_ids={}
vocab_vectors=[]
e_dim = 50

# for i,v in enumerate(vocab):
#   vocab_vectors.append(vectorsdf.loc[v].values.tolist())
#   word_to_ids[v]=i
for i,v in enumerate(vectorsdf.iterrows()):
  word_to_ids[v[0]]=i

vocab_vectors=vectorsdf.values

vocab_vectors=np.array(vocab_vectors)
word_to_ids["<UNK>"]=len(vocab)
word_to_ids["<PAD>"]=0
vocab_vectors = np.concatenate([vocab_vectors,np.ones((1,e_dim))],axis=0)
vocab_vectors=np.concatenate([vocab_vectors,np.ones((1,e_dim))*20],axis=0)

### prepare data

In [0]:
def convert_data(df,word2id):
  new_dataset=[]
  for r in df.iteritems():
      s=[]
      for w in r[1].split(" "):
        if w in word2id:
          s.append(word2id[w])
        else:
          s.append(word2id['<UNK>'])
      new_dataset.append(s)    
  return new_dataset

In [0]:
new_dataset = convert_data(df['sentence'],word_to_ids)
new_dataset = pad_sequences(new_dataset, maxlen=128, dtype='int32', padding='post', truncating='post', value=0.0)
new_dataset = np.concatenate([new_dataset,df['polarity'].values.reshape(-1,1)],axis=1)

In [0]:
train = new_dataset[:int(len(new_dataset)*.8)]
test = new_dataset[int(len(new_dataset)*.8):]

### build model

In [0]:
from tensorflow import keras
model = keras.Sequential()

model.add(keras.layers.Embedding(vocab_vectors.shape[0],
                            50,
                            embeddings_initializer=Constant(vocab_vectors),
                            input_length=128,
                            trainable=True))

model.add(keras.layers.LSTM(32,activation=tf.nn.tanh))
model.add(keras.layers.Dense(2,activation=tf.nn.softmax))
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 128, 50)           20000100  
_________________________________________________________________
lstm_8 (LSTM)                (None, 32)                10624     
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 66        
Total params: 20,010,790
Trainable params: 20,010,790
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile(optimizer="adam",
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x = train[:,:-1],y=train[:,-1],batch_size=4000,epochs=30,validation_split=0.2)

Train on 32000 samples, validate on 8000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f27a01aa208>

In [0]:
predicts = model.predict(train[:,:-1])
evaluate(train[:,-1],predicts)

array([51475,  1645, 39211, 51475, 51475, 51021, 26773,  7784, 40901,
       33471, 46286, 24964, 51475, 51475, 11268, 51475, 11268, 51475,
       51475, 51475, 11268, 33514, 47844, 39002, 27407, 33514, 43300,
       42118, 17699, 24565, 43877, 46395, 12199, 51475, 49453, 51475,
       51475, 26608, 45030, 32702, 50970, 36536, 26562,  5672, 23960,
       31028, 41316, 45137, 20395, 11268, 45137,  2726, 51475, 51475,
       51475, 45137, 51037, 51417, 51475, 11268, 51475,  1519,  4962,
       40392, 40555, 50970, 51475, 51475, 51475, 23740,  7784, 10113,
       43877, 11893, 23170,  6598, 51475, 51475, 51475, 50970, 51475,
        7784, 28580, 23170, 17699, 51475, 51475, 51475,  8521, 43877,
       17699, 10682, 42658, 23170, 17699, 51475, 51475, 51475, 51475,
       51475, 51475, 51475,  8830, 51475, 21227, 39901, 12329,  8296,
       42656, 50970, 36536, 51475, 51475, 51475, 45137, 43454, 51475,
       51475, 51475, 51475,  1927, 51058, 17383, 11582, 28058, 51475,
       51475, 51475]