In [52]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
ROOT_DIR = "/content/drive/MyDrive/Colab Notebooks/"
RES_DIR = ROOT_DIR + "resources/"
MOD_DIR = ROOT_DIR + "ml_models/"
BATCH_SIZE = 32

X = pd.read_csv(RES_DIR + "X_NEG4.csv")["Sysnet"]
y = pd.read_csv(RES_DIR + "y_NEG4.csv")["NEG"]


In [53]:
BUFFER_SIZE = 1000
BATCH_SIZE = 128


In [54]:
X_pom, X_val, y_pom, y_val = train_test_split(X, y, test_size=.2, stratify=y, )
X_train, X_test, y_train, y_test = train_test_split(X_pom, y_pom, test_size=.1, stratify=y_pom, )

X_val = tf.convert_to_tensor(X_val, name ="Definicija")
y_val = tf.convert_to_tensor(y_val, name ="Sentiment")


X_train = tf.convert_to_tensor(X_train, name ="Definicija")
y_train = tf.convert_to_tensor(y_train, name ="Sentiment")

X_test = tf.convert_to_tensor(X_test, name ="Definicija")
y_test = tf.convert_to_tensor(y_test, name ="Sentiment")

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
validation_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))


In [55]:
tf.math.confusion_matrix(labels=y_train, predictions=y_train).numpy()


array([[14230,     0],
       [    0,   234]], dtype=int32)

In [56]:
train_dataset =train_dataset.map(lambda x_text, x_label: (x_text, tf.expand_dims(x_label, -1)))
validation_dataset= validation_dataset.map(lambda x_text, x_label: (x_text, tf.expand_dims(x_label, -1)))

In [57]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
validation_dataset = validation_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [58]:
tf.shape(X_train)

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([14464], dtype=int32)>

In [59]:
import tensorflow_datasets as tfds
import numpy as np


In [60]:
VOCAB_SIZE = 25000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE, output_mode="int" )
encoder.adapt(train_dataset.map(lambda text, label: text))



In [61]:
vocab = np.array(encoder.get_vocabulary())
vocab[:30]


array(['', '[UNK]', 'koji', 'se', 'i', 'u', 'ili', 'na', 'jesam', 'od',
       'za', 'sa', 'iz', 'neki', 'da', 'imati', 'kao', 'koristiti',
       'nešto', 'rod', 'biljka', 'velik', 'jezik', 'biti', 'card',
       'porodica', 'jedan', 'mali', 'deo', 'drugi'], dtype='<U21')

In [62]:
len(encoder.get_vocabulary())

16907

In [63]:
model_NEG = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

In [64]:
model_NEG.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['binary_accuracy'])

In [65]:
history = model_NEG.fit(train_dataset, epochs=10,
                    validation_data=validation_dataset,
                    validation_steps=30)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [66]:
model_NEG.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_2 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, None, 128)         2164096   
                                                                 
 bidirectional_8 (Bidirectio  (None, None, 256)        263168    
 nal)                                                            
                                                                 
 bidirectional_9 (Bidirectio  (None, None, 128)        164352    
 nal)                                                            
                                                                 
 bidirectional_10 (Bidirecti  (None, None, 64)         41216     
 onal)                                                

In [67]:
y_pred = model_NEG.predict(X_test)



In [68]:
# model.save(MOD_DIR + "nntest3")

In [69]:
y_pred[:10]

array([[2.2478557e-04],
       [1.2787411e-04],
       [4.8652360e-01],
       [2.1138774e-04],
       [1.3664686e-04],
       [5.0763995e-04],
       [2.6332040e-04],
       [2.4628590e-04],
       [2.6805002e-02],
       [1.3132399e-04]], dtype=float32)

In [70]:
y_pred = tf.round((y_pred))

In [71]:
con_mat = tf.math.confusion_matrix(labels=y_test, predictions=y_pred).numpy()


In [72]:
con_mat

array([[1568,   14],
       [  18,    8]], dtype=int32)

In [73]:
y_pred = model_NEG.predict(X_val)



In [74]:
con_mat = tf.math.confusion_matrix(labels=y_val, predictions=y_pred).numpy()


In [75]:
con_mat

array([[3954,    0],
       [  65,    0]], dtype=int32)

In [76]:
X = pd.read_csv(RES_DIR + "X_POS4.csv")["Sysnet"]
y = pd.read_csv(RES_DIR + "y_POS4.csv")["POS"]

In [77]:
X_pom, X_val, y_pom, y_val = train_test_split(X, y, test_size=.1, stratify=y, )
X_train, X_test, y_train, y_test = train_test_split(X_pom, y_pom, test_size=.2, stratify=y_pom, )

X_val = tf.convert_to_tensor(X_val, name ="Definicija")
y_val = tf.convert_to_tensor(y_val, name ="Sentiment")


X_train = tf.convert_to_tensor(X_train, name ="Definicija")
y_train = tf.convert_to_tensor(y_train, name ="Sentiment")

X_test = tf.convert_to_tensor(X_test, name ="Definicija")
y_test = tf.convert_to_tensor(y_test, name ="Sentiment")

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
validation_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))


In [78]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
validation_dataset = validation_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [79]:
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE, output_mode="int" )
encoder.adapt(train_dataset.map(lambda text, label: text))



In [80]:
model_POS = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

In [81]:
model_POS.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['binary_accuracy'])

In [82]:
history = model_POS.fit(train_dataset, epochs=10,
                    validation_data=validation_dataset,
                    validation_steps=30)

Epoch 1/10



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [83]:
model_POS.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_3 (Embedding)     (None, None, 128)         2167168   
                                                                 
 bidirectional_12 (Bidirecti  (None, None, 256)        263168    
 onal)                                                           
                                                                 
 bidirectional_13 (Bidirecti  (None, None, 128)        164352    
 onal)                                                           
                                                                 
 bidirectional_14 (Bidirecti  (None, None, 64)         41216     
 onal)                                                

In [84]:
y_pred = model_POS.predict(X_test)



In [85]:
# model.save(MOD_DIR + "nntest3")

In [86]:
y_pred[:10]

array([[0.00045263],
       [0.00313856],
       [0.000489  ],
       [0.00134825],
       [0.00071949],
       [0.00033932],
       [0.0003545 ],
       [0.00034656],
       [0.00038353],
       [0.00150369]], dtype=float32)

In [87]:
y_pred = tf.round((y_pred))

In [88]:
con_mat = tf.math.confusion_matrix(labels=y_test, predictions=y_pred).numpy()


In [89]:
con_mat

array([[3526,   39],
       [  35,   17]], dtype=int32)

In [90]:
y_pred = model_POS.predict(X_val)



In [91]:
con_mat = tf.math.confusion_matrix(labels=y_val, predictions=y_pred).numpy()


In [92]:
con_mat

array([[1981,    0],
       [  29,    0]], dtype=int32)

In [93]:
def polarity_correction(pos, neg):
  one = tf.convert_to_tensor(1.0)
  ret_pos = pos*(one - neg)
  ret_neg =neg * (one -pos)
  return ret_pos, ret_neg
  

In [94]:
sword = pd.read_csv(RES_DIR +"definicije_lematizone.csv", index_col=0)

In [95]:
definicije = sword["Definicija"]

In [96]:
tensor_def = tf.convert_to_tensor(definicije.to_numpy(dtype='str'))

In [97]:
tn_POS = model_POS.predict(tensor_def)
tn_NEG = model_NEG.predict(tensor_def)




In [98]:
tn_POSc, tn_NEGc = polarity_correction(tn_POS, tn_NEG)

In [99]:
sword["POS"] = tn_POSc.numpy()

In [100]:
sword["NEG"] = tn_NEGc.numpy()

In [101]:
sword

Unnamed: 0,ID,Definicija,POS,NEG
0,ENG30-03574555-n,zgrada u koji se nalaziti organizacioni jedini...,0.000358,0.000130
1,ENG30-07810907-n,pripremljen dodatak jela za poboljšanje ukus,0.000672,0.000296
2,ENG30-00721431-n,"u nečiji prilika , mogućnost",0.001147,0.002745
3,ENG30-00473799-v,ostati jesam još samo da se doterati neki fine...,0.000370,0.000165
4,ENG30-00903385-v,zapad on jesam oprostiti što se nekada računat...,0.000420,0.000174
...,...,...,...,...
25315,ENG30-15266265-n,mandat predsednik,0.032429,0.052206
25316,ENG30-15266685-n,vreme između početak i kraj vremenski perioda,0.000445,0.000206
25317,ENG30-15266911-n,tačka u vreme kada se nešto završavati,0.000496,0.000199
25318,ENG30-15276642-n,( računarstvo ) brzina prenos podatak ( npr . ...,0.000502,0.000299


In [102]:
sword.to_csv(RES_DIR + "sentiment_RNN4.csv")