# Sentiment Analysis

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from nltk.tokenize import word_tokenize,sent_tokenize

In [3]:
df=pd.read_csv('hepsiburada.csv')
df.head()

Unnamed: 0,Rating,Review
0,1,3 yıldır tık demedi. :)
1,1,3 yıldır kullanıyorum müthiş
2,1,Ürün bugün elime geçti çok fazla inceleme fırs...
3,1,Almaya karar verdim. Hemencecik geldi. Keyifle...
4,1,Günlük kullanımınızı çok çok iyi karsılıyor kı...


In [4]:
df['Rating'].value_counts()

1    229821
0     13676
Name: Rating, dtype: int64

# Splitting Train and Test Dataset

In [5]:
X=df['Review'].values.tolist()
y=df['Rating'].values.tolist()

In [13]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

# Tokenization

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [18]:
num_words=20000
tokenizer=Tokenizer(num_words=num_words)

In [21]:
#Tokenized all words
tokenizer.fit_on_texts(X)

In [22]:
tokenizer.word_index

{'çok': 1,
 'bir': 2,
 've': 3,
 'ürün': 4,
 'bu': 5,
 'iyi': 6,
 'güzel': 7,
 'için': 8,
 'tavsiye': 9,
 'ederim': 10,
 'daha': 11,
 'ama': 12,
 'da': 13,
 'gayet': 14,
 'hızlı': 15,
 'teşekkürler': 16,
 'aldım': 17,
 'de': 18,
 'ürünü': 19,
 'gibi': 20,
 'yok': 21,
 'uygun': 22,
 'olarak': 23,
 'kaliteli': 24,
 'en': 25,
 '2': 26,
 'kargo': 27,
 'fiyat': 28,
 'elime': 29,
 'kadar': 30,
 'ile': 31,
 'göre': 32,
 'geldi': 33,
 'var': 34,
 'hepsiburada': 35,
 'ben': 36,
 'gerçekten': 37,
 '1': 38,
 'fiyata': 39,
 'gün': 40,
 'sonra': 41,
 'cok': 42,
 'kesinlikle': 43,
 'telefon': 44,
 'biraz': 45,
 'hiç': 46,
 'ulaştı': 47,
 'memnun': 48,
 'hem': 49,
 'değil': 50,
 'kullanışlı': 51,
 '3': 52,
 'mükemmel': 53,
 'oldu': 54,
 'kullanıyorum': 55,
 'önce': 56,
 'sipariş': 57,
 'tek': 58,
 'her': 59,
 'bence': 60,
 'harika': 61,
 'kalitesi': 62,
 'bi': 63,
 'ayrıca': 64,
 '5': 65,
 'teşekkür': 66,
 'fiyatı': 67,
 'olması': 68,
 'ne': 69,
 'herkese': 70,
 'bile': 71,
 'uzun': 72,
 'süper': 73,

In [33]:
#To tokenize the train and test set:
X_train_tokens=tokenizer.texts_to_sequences(X_train)
X_test_tokens=tokenizer.texts_to_sequences(X_test)

In [34]:
X_train[25]

'saat şık duruyor. gelir gelmez pil takıp ayarladım. 1 saat sonra kendi kendine durmuş. bu fiyata zaten 1 saat çalışması bile mucize. yinede duvarda güzel duruyor :)'

In [35]:
len(X_train_tokens[25])

26

In [36]:
len(word_tokenize(X_train[25]))

32

* As you can see,the number of word is not equal to number of tokens.Because we just choose 20000 words to construct a model.Out of 20000 words was not be tokenized

In [37]:
num_tokens=[len(token) for token in X_train_tokens+X_test_tokens]
num_tokens=np.array(num_tokens)

In [38]:
num_tokens

array([21, 25, 17, ..., 19, 41, 21])

In [40]:
optimum_num_token=int(np.mean(num_tokens)+2*np.std(num_tokens))
optimum_num_token

63

# Padding

* To feed the model,we adjust the shape of the dataset.All number of token of sentences have to be equal.

In [47]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [49]:
X_train_pad=pad_sequences(X_train_tokens,maxlen=optimum_num_token)
X_test_pad=pad_sequences(X_test_tokens,maxlen=optimum_num_token)

In [51]:
print(X_train_pad.shape,X_test_pad.shape)

(194797, 63) (48700, 63)


In [53]:
index=tokenizer.word_index
word_map=dict(zip(index.values(),index.keys()))

def encode(tokens):
    words=[word_map[token] for token in tokens if token!=0]
    text=' '.join(words)
    return text

In [55]:
X_train[50]

'Ürün ertesi günü elime ulaştı. Bir sorunu yok. Hatta yanında iki adet esans göndermişler bu jestleri için de ayrıca  teşekkürler :)'

In [57]:
encode(X_train_tokens[50])

'ürün ertesi günü elime ulaştı bir sorunu yok hatta yanında iki adet esans göndermişler bu için de ayrıca teşekkürler'

# Construct Model

In [59]:
#We don't use RNN model because it will cause exploding/vanishing gradient.We'll use GRU to solve this problem.

model=tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=num_words,
                                   output_dim=50,
                                   input_length=optimum_num_token,
                                   name='embedding_layer'))
model.add(tf.keras.layers.GRU(units=32,return_sequences=True))
model.add(tf.keras.layers.GRU(units=32,return_sequences=True))
model.add(tf.keras.layers.GRU(units=16,return_sequences=False))
model.add(tf.keras.layers.Dense(1,activation=tf.nn.sigmoid))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_layer (Embedding)  (None, 63, 50)           1000000   
                                                                 
 gru (GRU)                   (None, 63, 32)            8064      
                                                                 
 gru_1 (GRU)                 (None, 63, 32)            6336      
                                                                 
 gru_2 (GRU)                 (None, 16)                2400      
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 1,016,817
Trainable params: 1,016,817
Non-trainable params: 0
_________________________________________________________________


In [60]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),loss='binary_crossentropy',metrics=['accuracy'])

In [66]:
y_train=np.array(y_train)
y_test=np.array(y_test)

In [68]:
history=model.fit(X_train_pad,y_train,batch_size=128,epochs=10,validation_data=(X_test_pad,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [69]:
results=pd.DataFrame(history.history)
results

Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.165342,0.948659,0.113469,0.961109
1,0.097355,0.966047,0.103043,0.964312
2,0.080667,0.972212,0.101959,0.96577
3,0.071235,0.976073,0.104044,0.965154
4,0.064343,0.979112,0.110034,0.96616
5,0.059198,0.981442,0.114918,0.965606
6,0.055045,0.983141,0.11888,0.964538
7,0.051516,0.984527,0.122654,0.963285
8,0.048552,0.985595,0.12841,0.96384
9,0.045677,0.98711,0.133649,0.962238


## Prediction

In [152]:
pred_text=[]

text='Fiyatına değmez ürün indirimden aldım. Sensörü sorunlu telefonla konuşurken sıkıntı çıkarıyor. Bildirimlerinde yazılımsal sorun var herşeye ötüyor.'
pred_text.append(text)
pred_text

['Fiyatına değmez ürün indirimden aldım. Sensörü sorunlu telefonla konuşurken sıkıntı çıkarıyor. Bildirimlerinde yazılımsal sorun var herşeye ötüyor.']

In [153]:
pred_token=tokenizer.texts_to_sequences(pred_text)
pred_token_pad=pad_sequences(pred_token,maxlen=optimum_num_token)
pred_token_pad

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,   86, 2596,    4, 3670,   17, 3026, 4095,
        1162, 4610,  147,  928, 5276,  105,   34, 4957]])

In [158]:
prediction=model.predict(pred_token_pad)[0][0]
if prediction>0.5:
    print('Positive comment,Score : {}'.format(prediction))
else:
    print('Negative comment,Score : {}'.format(prediction))

Negative comment,Score : 0.2874963879585266
