In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
ROOT_DIR = "/content/drive/MyDrive/Colab Notebooks/"
RES_DIR = ROOT_DIR + "resources/"
MOD_DIR = ROOT_DIR + "ml_models/"
BATCH_SIZE = 32

X = pd.read_csv(RES_DIR + "X_NEG6.csv")["Sysnet"]
y = pd.read_csv(RES_DIR + "y_NEG6.csv")["NEG"]


In [2]:
BUFFER_SIZE = 1000
BATCH_SIZE = 128


In [3]:
X_pom, X_val, y_pom, y_val = train_test_split(X, y, test_size=.2, stratify=y, )
X_train, X_test, y_train, y_test = train_test_split(X_pom, y_pom, test_size=.1, stratify=y_pom, )

X_val = tf.convert_to_tensor(X_val, name ="Definicija")
y_val = tf.convert_to_tensor(y_val, name ="Sentiment")


X_train = tf.convert_to_tensor(X_train, name ="Definicija")
y_train = tf.convert_to_tensor(y_train, name ="Sentiment")

X_test = tf.convert_to_tensor(X_test, name ="Definicija")
y_test = tf.convert_to_tensor(y_test, name ="Sentiment")

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
validation_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))


In [4]:
tf.math.confusion_matrix(labels=y_train, predictions=y_train).numpy()


array([[14234,     0],
       [    0,   246]], dtype=int32)

In [5]:
train_dataset =train_dataset.map(lambda x_text, x_label: (x_text, tf.expand_dims(x_label, -1)))
validation_dataset= validation_dataset.map(lambda x_text, x_label: (x_text, tf.expand_dims(x_label, -1)))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [6]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
validation_dataset = validation_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [7]:
tf.shape(X_train)

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([14480], dtype=int32)>

In [8]:
import tensorflow_datasets as tfds
import numpy as np


In [9]:
VOCAB_SIZE = 25000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE, output_mode="int" )
encoder.adapt(train_dataset.map(lambda text, label: text))



In [10]:
vocab = np.array(encoder.get_vocabulary())
vocab[:30]


array(['', '[UNK]', 'koji', 'se', 'i', 'u', 'ili', 'na', 'jesam', 'od',
       'za', 'sa', 'iz', 'neki', 'da', 'imati', 'rod', 'koristiti', 'kao',
       'nešto', 'velik', 'biljka', 'jezik', 'biti', 'card', 'jedan',
       'porodica', 'mali', 'drugi', 'obično'], dtype='<U21')

In [11]:
len(encoder.get_vocabulary())

16882

In [12]:
model_NEG = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

In [13]:
model_NEG.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['binary_accuracy'])

In [14]:
history = model_NEG.fit(train_dataset, epochs=10,
                    validation_data=validation_dataset,
                    validation_steps=30)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
model_NEG.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, None, 128)         2160896   
                                                                 
 bidirectional (Bidirectiona  (None, None, 256)        263168    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, None, 128)        164352    
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 64)         41216     
 nal)                                                   

In [16]:
y_pred = model_NEG.predict(X_test)



In [17]:
# model.save(MOD_DIR + "nntest3")

In [18]:
y_pred[:10]

array([[1.0272254e-04],
       [9.4024959e-05],
       [7.9553749e-05],
       [8.1186365e-05],
       [9.0282287e-05],
       [8.1729166e-05],
       [2.5117926e-02],
       [1.3590435e-04],
       [7.9330181e-05],
       [3.5700491e-01]], dtype=float32)

In [19]:
y_pred = tf.round((y_pred))

In [20]:
con_mat = tf.math.confusion_matrix(labels=y_test, predictions=y_pred).numpy()


In [21]:
con_mat

array([[1573,    9],
       [  20,    7]], dtype=int32)

In [22]:
y_pred = model_NEG.predict(X_val)



In [23]:
con_mat = tf.math.confusion_matrix(labels=y_val, predictions=y_pred).numpy()


In [24]:
con_mat

array([[3955,    0],
       [  68,    0]], dtype=int32)

In [25]:
X = pd.read_csv(RES_DIR + "X_POS6.csv")["Sysnet"]
y = pd.read_csv(RES_DIR + "y_POS6.csv")["POS"]

In [26]:
X_pom, X_val, y_pom, y_val = train_test_split(X, y, test_size=.1, stratify=y, )
X_train, X_test, y_train, y_test = train_test_split(X_pom, y_pom, test_size=.2, stratify=y_pom, )

X_val = tf.convert_to_tensor(X_val, name ="Definicija")
y_val = tf.convert_to_tensor(y_val, name ="Sentiment")


X_train = tf.convert_to_tensor(X_train, name ="Definicija")
y_train = tf.convert_to_tensor(y_train, name ="Sentiment")

X_test = tf.convert_to_tensor(X_test, name ="Definicija")
y_test = tf.convert_to_tensor(y_test, name ="Sentiment")

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
validation_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))


In [27]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
validation_dataset = validation_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [28]:
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE, output_mode="int" )
encoder.adapt(train_dataset.map(lambda text, label: text))



In [29]:
model_POS = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

In [30]:
model_POS.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['binary_accuracy'])

In [31]:
history = model_POS.fit(train_dataset, epochs=10,
                    validation_data=validation_dataset,
                    validation_steps=30)

Epoch 1/10



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
model_POS.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         2162048   
                                                                 
 bidirectional_4 (Bidirectio  (None, None, 256)        263168    
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, None, 128)        164352    
 nal)                                                            
                                                                 
 bidirectional_6 (Bidirectio  (None, None, 64)         41216     
 nal)                                                 

In [33]:
y_pred = model_POS.predict(X_test)



In [34]:
# model.save(MOD_DIR + "nntest3")

In [35]:
y_pred[:10]

array([[9.6595155e-05],
       [3.7997263e-05],
       [1.0476423e-04],
       [8.9625979e-04],
       [3.7378733e-05],
       [3.6157311e-05],
       [3.8138223e-05],
       [5.7904388e-05],
       [3.0970539e-04],
       [3.7089809e-05]], dtype=float32)

In [36]:
y_pred = tf.round((y_pred))

In [37]:
con_mat = tf.math.confusion_matrix(labels=y_test, predictions=y_pred).numpy()


In [38]:
con_mat

array([[3522,   45],
       [  36,   17]], dtype=int32)

In [39]:
y_pred = model_POS.predict(X_val)



In [40]:
con_mat = tf.math.confusion_matrix(labels=y_val, predictions=y_pred).numpy()


In [41]:
con_mat

array([[1982,    0],
       [  30,    0]], dtype=int32)

In [42]:
def polarity_correction(pos, neg):
  one = tf.convert_to_tensor(1.0)
  ret_pos = pos*(one - neg)
  ret_neg =neg * (one -pos)
  return ret_pos, ret_neg
  

In [43]:
sword = pd.read_csv(RES_DIR +"definicije_lematizone.csv", index_col=0)

In [44]:
definicije = sword["Definicija"]

In [45]:
tensor_def = tf.convert_to_tensor(definicije.to_numpy(dtype='str'))

In [46]:
tn_POS = model_POS.predict(tensor_def)
tn_NEG = model_NEG.predict(tensor_def)




In [47]:
tn_POSc, tn_NEGc = polarity_correction(tn_POS, tn_NEG)

In [48]:
sword["POS"] = tn_POSc.numpy()

In [49]:
sword["NEG"] = tn_NEGc.numpy()

In [50]:
sword

Unnamed: 0,ID,Definicija,POS,NEG
0,ENG30-03574555-n,zgrada u koji se nalaziti organizacioni jedini...,0.000045,0.000086
1,ENG30-07810907-n,pripremljen dodatak jela za poboljšanje ukus,0.000524,0.000197
2,ENG30-00721431-n,"u nečiji prilika , mogućnost",0.000333,0.000914
3,ENG30-00473799-v,ostati jesam još samo da se doterati neki fine...,0.000113,0.000092
4,ENG30-00903385-v,zapad on jesam oprostiti što se nekada računat...,0.000091,0.000096
...,...,...,...,...
25315,ENG30-15266265-n,mandat predsednik,0.112033,0.020582
25316,ENG30-15266685-n,vreme između početak i kraj vremenski perioda,0.000101,0.000118
25317,ENG30-15266911-n,tačka u vreme kada se nešto završavati,0.000210,0.000127
25318,ENG30-15276642-n,( računarstvo ) brzina prenos podatak ( npr . ...,0.000128,0.000131


In [51]:
sword.to_csv(RES_DIR + "sentiment_RNN6.csv")