## Word2vec embedding using gensim

We learn an embedding using gensim's word2vec model and use it for sentiment classification.

In [9]:
import pandas as pd
import numpy as np
data_file = "/home/deepak/data/fire/tamil_sentiment_full_train.tsv"

train_df = pd.read_csv(data_file,sep='\t')

train_df.head()

Unnamed: 0,text,category
0,Vani bhojam fans hit like solli 500 like Vangi...,unknown_state
1,I love you ajith very I like,Positive
2,ennaya trailer Ku mudi Ellam nikkudhu... Vera ...,Positive
3,Vijay Annaa Ur Maassssss Therrrrriiiiii,Positive
4,நம்ப நடே நாசாமா தான் போச்சி,Negative


## Learn embedding

In [10]:
documents = train_df.text.to_list()

from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        #corpus_path = datapath('lee_background.cor')
        for document in documents:
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(document)

import gensim.models

sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)

In [11]:
## words that are similar
model.wv.most_similar('trailer')

[('full', 0.9164316058158875),
 ('bigil', 0.9144452810287476),
 ('diwali', 0.911159098148346),
 ('kaithi', 0.9102253913879395),
 ('asuran', 0.9063699245452881),
 ('mersal', 0.9054629802703857),
 ('varum', 0.8990466594696045),
 ('pathutu', 0.8990169167518616),
 ('nkp', 0.8978241086006165),
 ('kandipa', 0.89558345079422)]

In [12]:
train_df['category'].value_counts()

Positive          20069
unknown_state      5628
Negative           4271
Mixed_feelings     4020
not-Tamil          1667
Positive              1
Name: category, dtype: int64

In [39]:
labels2number = {"Positive":0, "unknown_state":1,"Negative":2,"Mixed_feelings":3,"not-Tamil":4}
numbers2label = {0:"Positive",1:"unknown_state",2:"Negative",3:"Mixed-feelings",4:"not-tamil"}

In [14]:
def encode_labels(row):
    try:
        label = labels2number[row['category']]
    except:
        label = 5
    return label

train_df['label']= train_df.apply(encode_labels,axis=1)

In [15]:
train_df.label.value_counts()

0    20069
1     5628
2     4271
3     4020
4     1667
5        1
Name: label, dtype: int64

In [16]:
texts= train_df.text
labels = train_df.label

In [17]:
import numpy as np
text = texts[3]
text

'Vijay Annaa  Ur Maassssss Therrrrriiiiii'

In [18]:

embedding_dim = 100
def get_embedding(text):
    '''Given a sentence compute an embedding by computing average of all word
    embedding.'''
    embedding = np.zeros(embedding_dim)
    embedding_list = []
    for word in text.split(' '):
        word = word.lower()
        #print(word)
        try:
            wv_embedding = model.wv[word]
            embedding_list.append(wv_embedding)
        except:
            pass
    if len(embedding_list):   
        embedding = np.mean(embedding_list,axis=0)
   
    return embedding

In [19]:
text_embedding = np.asarray([get_embedding(text) for text in texts])

In [20]:
text_embedding.shape

(35656, 100)

In [21]:
from keras.layers import  Input,Dense
from keras import Model,Sequential
import keras

In [22]:
classifier = Sequential()
classifier.add(Input(shape=(100,)))
classifier.add(Dense(32,  activation = 'relu'))
classifier.add(Dense(32, activation = 'relu'))
classifier.add(Dense(6, activation = 'softmax'))


In [23]:
labels_ = keras.utils.to_categorical(labels)
labels_.shape

(35656, 6)

In [24]:
classifier.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [25]:
classifier.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                3232      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 198       
Total params: 4,486
Trainable params: 4,486
Non-trainable params: 0
_________________________________________________________________


In [26]:
classifier.fit(text_embedding,labels_,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f02a0380bd0>

In [30]:
data_file = "/home/deepak/data/fire/tamil_sentiment_full_dev.tsv"

valid_df = pd.read_csv(data_file,sep='\t')

valid_df.head()

Unnamed: 0,text,category
0,@0:40 songsuperb..kamallllla.. I'm waiting fo...,unknown_state
1,கணத்ததோர் அகமுடையார் சார்பாக படம் வெற்றி அடைய...,Positive
2,Thalavia neenga veera level boss and neega tha...,Positive
3,Oru padam patha fell.vera level music,Positive
4,Hairstyle than mattama iruku. Adhu mattum math...,unknown_state


In [34]:
valid_df['label']= valid_df.apply(encode_labels,axis=1)

valid_inputs = valid_df.text.to_list()

valid_embedding = np.asarray([get_embedding(text) for text in valid_inputs])

y_pred = classifier.predict(valid_embedding)

y_true = valid_df.label

In [35]:
from sklearn.metrics import classification_report

In [36]:
y_predicted = np.argmax(y_pred,axis=1)
labels=["Positive","unknown state","negative","Mixed feelings","not-tamil"]
print(classification_report(y_true,y_predicted,target_names=labels))

                precision    recall  f1-score   support

      Positive       0.61      0.97      0.75      2257
 unknown state       0.59      0.12      0.20       611
      negative       0.45      0.14      0.21       480
Mixed feelings       0.00      0.00      0.00       438
     not-tamil       0.73      0.38      0.50       176

      accuracy                           0.60      3962
     macro avg       0.47      0.32      0.33      3962
  weighted avg       0.52      0.60      0.50      3962



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
data_file = "/home/deepak/mlgym/fire/tamil_sentiment_full_test_withoutlabels.tsv"
test_df = pd.read_csv(data_file,sep='\t')

test_df

Unnamed: 0,id,text
0,Tam_1,வீர செங்குந்தர் சார்பாக இந்த திரைப்படம் வெற்றி...
1,Tam_2,Teruk ah irukku .... mokke movie .. waste of time
2,Tam_3,manitha samuthaayam amaipil irunthu intha pada...
3,Tam_4,JJ mam we miss u
4,Tam_5,Subtitle me traller dekhne wale like karo
...,...,...
4397,Tam_4398,Ithukum dislike potta kammanattti koovaingalam...
4398,Tam_4399,Suyama Sinthikiravan than super Hero Seama dia...
4399,Tam_4400,Super thalaiva.... Nee mass dha eppavume
4400,Tam_4401,பெண்ணை அடிமையாக்க நினைக்கும் இந்த படம் தோல்வித...


In [46]:


test_inputs = test_df.text.to_list()

test_embedding = np.asarray([get_embedding(text) for text in test_inputs])

y_pred = classifier.predict(test_embedding)
y_predicted = np.argmax(y_pred,axis=1)
predicted_label = [numbers2label[y_pred] for y_pred in y_predicted]
test_df['prediction'] = predicted_label

In [50]:
test_df

Unnamed: 0,id,text,prediction
0,Tam_1,வீர செங்குந்தர் சார்பாக இந்த திரைப்படம் வெற்றி...,Positive
1,Tam_2,Teruk ah irukku .... mokke movie .. waste of time,Positive
2,Tam_3,manitha samuthaayam amaipil irunthu intha pada...,Positive
3,Tam_4,JJ mam we miss u,Positive
4,Tam_5,Subtitle me traller dekhne wale like karo,not-tamil
...,...,...,...
4397,Tam_4398,Ithukum dislike potta kammanattti koovaingalam...,Negative
4398,Tam_4399,Suyama Sinthikiravan than super Hero Seama dia...,Positive
4399,Tam_4400,Super thalaiva.... Nee mass dha eppavume,Positive
4400,Tam_4401,பெண்ணை அடிமையாக்க நினைக்கும் இந்த படம் தோல்வித...,Positive
