## sentiment analys - word2vec feature weighting with RNN classification

`Term Weighting = Word2Vec`

## Libraries

In [1]:
import pandas as pd
import numpy as np
import numpy as np 
import re #RegEx
import itertools
import sklearn

import gensim
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize

## Load Dataset

_`preprocessed dataset`_

In [2]:
df = pd.read_csv('D:\kuliah\THE ONLY TA THINGS\DATA\cleaned_data_review.csv')
df.head()

Unnamed: 0,content,stemming,label
0,bagus,['bagus'],-1.0
1,Tampa tik tok hidub terasa sepiðŸ¤£ðŸ¤£,"['tampa', 'hidub', 'sepi', '', '']",1.0
2,Goid,['goid'],1.0
3,sangat bagus,['bagus'],1.0
4,P im ZZ@@@,"['im', 'zz']",1.0


In [3]:
df = df.dropna()
df = df.reset_index(drop=True)

In [4]:
df.isnull().sum()

content     0
stemming    0
label       0
dtype: int64

_`total label value`_

In [5]:
df['label'].value_counts()

 1.0    196
-1.0     76
 0.0      9
Name: label, dtype: int64

## Word2Vec for feature weighting

_`Split data test and train before`_

In [6]:
X = df.drop(['content'], axis = 1) 
y = df['label']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
X_train

Unnamed: 0,stemming,label
55,[],1.0
196,[],1.0
216,[],1.0
38,['bagus'],-1.0
241,['komen'],1.0
...,...,...
188,"['kanapa', 'minimal', 'fitur', 'posting', 'ula...",-1.0
71,"['aplikasi', 'baik', 'banget', 'pokok', '', ''...",1.0
106,"['logo', 'capcut', 'tiktok', 'baik']",-1.0
270,['bnget'],-1.0


_`remove null ('[]') value`_

In [8]:
X_train = df[df.stemming != '[]']
X_train


Unnamed: 0,content,stemming,label
0,bagus,['bagus'],-1.0
1,Tampa tik tok hidub terasa sepiðŸ¤£ðŸ¤£,"['tampa', 'hidub', 'sepi', '', '']",1.0
2,Goid,['goid'],1.0
3,sangat bagus,['bagus'],1.0
4,P im ZZ@@@,"['im', 'zz']",1.0
...,...,...,...
275,Sangat recommended,['recommended'],1.0
276,Baguss bagusss abguss baguss bagusss bagsusus ...,"['baguss', 'bagusss', 'abguss', 'baguss', 'bag...",1.0
277,Saya suka aplikasi tik tok ini. Saya suka bang...,"['suka', 'aplikasi', 'suka', 'banget', 'adik',...",1.0
278,"Halo developer, saya pengguna tiktok dengan na...","['developer', 'guna', 'tiktok', 'nama', 'epw',...",-1.0


_`Define function for word2vec weighting`_

In [9]:
def mean(z): 
    return sum(itertools.chain(z))/len(z)

def wordTokenize(text):
  return word_tokenize(text)


words = X_train['stemming'].apply(wordTokenize)
w2v_model = gensim.models.Word2Vec(words, min_count = 2, window = 5) 



def embeddToWord2Vec(text):
    result = [w2v_model.wv[w] for w in text if w in w2v_model.wv.key_to_index]
    feature = [mean(x) for x in zip(*result)]
    return feature

_`Weighting each dataset (training, test)`_

In [10]:
word2vec_training_features = X_train['stemming'].apply(embeddToWord2Vec)
word2vec_test_features = X_test['stemming'].apply(embeddToWord2Vec)

feature = [x for x in word2vec_training_features.transpose()]
word2vec_training_features = np.asarray(feature)

feature = [x for x in word2vec_test_features.transpose()]
word2vec_test_features = np.asarray(feature)


print(word2vec_training_features)

[[-0.25684804  0.19792743  0.26529769 ... -0.13194032 -0.03489628
   0.17373688]
 [-0.31209823  0.24278777  0.33058439 ... -0.16465136 -0.0395188
   0.21582996]
 [-0.25684804  0.19792743  0.26529769 ... -0.13194032 -0.03489628
   0.17373688]
 ...
 [-0.31500613  0.24514884  0.33402053 ... -0.166373   -0.03976209
   0.21804539]
 [-0.32630542  0.25432329  0.34737239 ... -0.17306278 -0.04070745
   0.2266539 ]
 [-0.31712097  0.24686598  0.33651954 ... -0.1676251  -0.03993903
   0.21965661]]


In [11]:
print(word2vec_training_features.shape)

(257, 100)


In [12]:
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(word2vec_training_features.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:


NameError: name 'val_x' is not defined