# $\text{Import data}$

In [1]:
import pandas as pd
import numpy as np

# Load the contest training set; the path is relative to the notebook's working directory.
df = pd.read_csv("contest1_train.csv")
# Print (rows, columns), then preview the first rows as the cell's rich output.
print(df.shape)
df.head()

(3156, 4)


Unnamed: 0,id,text,aspectCategory,polarity
0,3121,But the staff was so horrible to us.,service,negative
1,2777,"To be completely fair, the only redeeming fact...",food,positive
2,2777,"To be completely fair, the only redeeming fact...",anecdotes/miscellaneous,negative
3,1634,"The food is uniformly exceptional, with a very...",food,positive
4,2534,Where Gabriela personaly greets you and recomm...,service,positive


In [2]:
def drop_dups(df, name:str):
    """Return ``df`` with fully duplicated rows removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate. It is not modified.
    name : str
        Human-readable label for the frame. Currently unused; kept so that
        existing call sites continue to work.

    Returns
    -------
    pandas.DataFrame
        The frame without duplicated rows (first occurrence kept). A frame is
        always returned, including when ``df`` has no duplicates -- the
        previous implementation fell through and returned ``None`` in that
        case, which broke every caller that re-assigned its input.
    """
    # drop_duplicates() simply returns an equal frame when nothing is
    # duplicated, so no separate "are there duplicates?" branch is needed.
    return df.drop_duplicates()
    
# De-duplicate the full frame first; the second argument is only a label.
df = drop_dups(df, "Whole data")

# One (text, label) frame per task: aspect detection and sentiment classification.
df_aspect = df[['text', 'aspectCategory']]
df_sentiment = df[['text', 'polarity']]

# Dropping columns can make previously distinct rows identical, so de-duplicate again.
df_aspect = drop_dups(df_aspect, "aspect")
df_sentiment = drop_dups(df_sentiment, "sentiment")

# $\text{1. Sentiment}$
- Rule-based
- BOW
- TF-IDF
- Bidirectional GRU
- CNN

In [3]:
from my_models import utils, sentiment

# Keep exactly one row per text so each sentence carries a single polarity label;
# when a text occurs more than once, only its last occurrence is kept.
df_sentiment = df_sentiment.drop_duplicates(subset=['text'], keep='last')

# Hold out a dev split. Split ratio / seed / stratification are decided inside
# utils.split_data (not visible in this notebook).
X_TRAIN_sent, X_DEV_sent, Y_TRAIN_sent, Y_DEV_sent = utils.split_data(df_sentiment['text'], df_sentiment['polarity'])

## 1.1) Rule-based

In [7]:
# The rule-based model is not fitted in this notebook, so it is scored on every
# sentence of df_sentiment rather than on the dev split only.
# NOTE(review): the support in this report therefore differs from the reports of
# the trained models, which are computed on X_DEV_sent / Y_DEV_sent.
y_pred = sentiment.VADER(df_sentiment['text'])
utils.get_reports(y_true = df_sentiment['polarity'], y_pred=y_pred)

              precision    recall  f1-score   support

    conflict       0.50      0.01      0.01       148
    negative       0.79      0.03      0.06       602
     neutral       0.15      0.98      0.26       354
    positive       0.94      0.14      0.24      1478

    accuracy                           0.22      2582
   macro avg       0.60      0.29      0.14      2582
weighted avg       0.77      0.22      0.19      2582



## 1.2) Logistic regression (bow)

In [6]:
# Logistic regression on bag-of-words features (max_iter is forwarded to the wrapper).
logreg_sent_bow = sentiment.LOGREG(feature_mode="BOW", max_iter=200)
# preprocess: turn raw texts into feature matrices with the model's own vectorizer
# (presumably created on the first call and reused on the second -- confirm in my_models.sentiment)
X_train_sent = logreg_sent_bow.preprocess(X_TRAIN_sent.values)
X_dev_sent = logreg_sent_bow.preprocess(X_DEV_sent.values)

# train
logreg_sent_bow.fit(X_train_sent, Y_TRAIN_sent)

# inference: report on the held-out dev split
y_pred = logreg_sent_bow.predict(X_dev_sent)
utils.get_reports(y_true = Y_DEV_sent, y_pred=y_pred)

Creating new BOW vectorizer...
BOW matrix: (2065, 3589)
              precision    recall  f1-score   support

    conflict       0.43      0.30      0.35        30
    negative       0.63      0.53      0.58       120
     neutral       0.54      0.27      0.36        71
    positive       0.73      0.89      0.80       296

    accuracy                           0.68       517
   macro avg       0.58      0.50      0.52       517
weighted avg       0.66      0.68      0.66       517



## 1.3) Logistic regression (tf-idf)

In [9]:
# Same pipeline as the BOW model, but with TF-IDF features.
logreg_sent_tfidf = sentiment.LOGREG(feature_mode="TFIDF", max_iter=200)
# preprocess: note that X_train_sent / X_dev_sent are overwritten with the TF-IDF matrices
X_train_sent = logreg_sent_tfidf.preprocess(X_TRAIN_sent.values)
X_dev_sent = logreg_sent_tfidf.preprocess(X_DEV_sent.values)

# train
logreg_sent_tfidf.fit(X_train_sent, Y_TRAIN_sent)

# inference: report on the held-out dev split
y_pred = logreg_sent_tfidf.predict(X_dev_sent)
utils.get_reports(y_true = Y_DEV_sent, y_pred=y_pred)

Creating new TFIDF vectorizer...
TFIDF matrix: (2065, 3589)
              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        30
    negative       0.66      0.42      0.52       120
     neutral       0.91      0.14      0.24        71
    positive       0.67      0.96      0.79       296

    accuracy                           0.67       517
   macro avg       0.56      0.38      0.39       517
weighted avg       0.66      0.67      0.61       517



## 1.4) Bidirectional GRU (GloVe)

In [4]:
# Dimensionality requested from utils.get_embeddings for the pretrained word vectors.
emb_dim = 300
vocab, embedding_matrix = utils.get_embeddings(emb_dim)

# Sequence-length setting for the deep models; the preprocess calls below use the same value (30).
maxlen = 30
# Number of rows of the embedding matrix (presumably one per vocabulary entry -- confirm).
vocab_size = embedding_matrix.shape[0]

In [5]:
from my_models.inference import get_rnn, get_cnn

# Hyper-parameters forwarded to get_rnn() for the sentiment model.
# n_outputs is the number of distinct polarity labels in the training split.
# Note: the name `rnn_params` is assigned again in the aspect section with
# different values, so the cell order matters.
rnn_params = {
    "rnn_layers": [128, 128],
    "dense_layers": [64, 64],
    "embedding_matrix": embedding_matrix,
    "n_outputs": len(Y_TRAIN_sent.unique()),
    "embedding_trainable": True,
}

In [6]:
# Deep-learning sentiment wrapper built on the embedding vocabulary; the network
# itself is attached in the next cell via set_model_template().
rnn_sent = sentiment.dl_glove(vocab)

In [7]:
## Reinstantiate model
# Build the network from rnn_params and register it as the wrapper's template
# (the cell output reports that the model is cloned from that template).
rnn_sent.set_model_template(get_rnn(**rnn_params))

Using pretrained word embedding
cloning model from template...


In [8]:
# Convert raw texts/labels into model inputs with the wrapper's preprocess()
# (the cell output shows a LabelEncoder being built on the first call).
# Use the shared `maxlen` setting defined with the embeddings instead of
# repeating the literal 30, so the sequence length is configured in one place.
X_train_sent, Y_train_sent = rnn_sent.preprocess(X_TRAIN_sent.values, Y_TRAIN_sent.values, maxlen=maxlen)
X_dev_sent, Y_dev_sent = rnn_sent.preprocess(X_DEV_sent.values, Y_DEV_sent.values, maxlen=maxlen)

...Build new LabelEncoder


In [9]:
# Label order of the encoder: integer id i corresponds to classes_[i]
# (the evaluation cells below rely on this to map ids back to label strings).
rnn_sent.le.classes_

array(['conflict', 'negative', 'neutral', 'positive'], dtype=object)

In [10]:
# Train the RNN; the dev arrays are passed alongside the training arrays
# (presumably used as validation data -- confirm in my_models.sentiment).
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt, so it
# does not correspond to a completed training run.
history_rnn_sent = rnn_sent.fit(
    X_train_sent, Y_train_sent, X_dev_sent, Y_dev_sent,
    batch_size = 32, epochs = 6
)

Epoch 1/6
12/65 [====>.........................] - ETA: 1:22 - loss: 1.1378 - accuracy: 0.5547

KeyboardInterrupt: 

In [23]:
# Dev-set report. Targets and predictions are used as indices into the encoder's
# classes_, i.e. they are integer class ids mapped back to label strings here.
y_pred = rnn_sent.predict(X_dev_sent)
utils.get_reports(
    y_true = [rnn_sent.le.classes_[i] for i in Y_dev_sent], 
    y_pred= [rnn_sent.le.classes_[i] for i in y_pred]
) # trainable embed

              precision    recall  f1-score   support

    conflict       0.25      0.23      0.24        30
    negative       0.51      0.53      0.52       120
     neutral       0.34      0.38      0.36        71
    positive       0.78      0.75      0.77       296

    accuracy                           0.62       517
   macro avg       0.47      0.47      0.47       517
weighted avg       0.63      0.62      0.62       517



In [19]:
# Same evaluation code as the previous cell, kept with the output of a different
# run (see the trailing tag). NOTE(review): the execution counts show these two
# cells were run out of order; only the trailing comment says which setup each
# report belongs to.
y_pred = rnn_sent.predict(X_dev_sent)
utils.get_reports(
    y_true = [rnn_sent.le.classes_[i] for i in Y_dev_sent], 
    y_pred= [rnn_sent.le.classes_[i] for i in y_pred]
) #300d

              precision    recall  f1-score   support

    conflict       0.16      0.10      0.12        30
    negative       0.50      0.45      0.48       120
     neutral       0.55      0.08      0.15        71
    positive       0.67      0.86      0.76       296

    accuracy                           0.62       517
   macro avg       0.47      0.37      0.38       517
weighted avg       0.59      0.62      0.57       517



In [12]:
# Keep a handle on the RNN wrapper's label encoder so the CNN wrapper below can
# be constructed with the same label <-> id mapping.
global_le = rnn_sent.le

## 1.5) CNN (GloVe)

In [39]:
# Hyper-parameters forwarded to get_cnn() for the sentiment model.
# n_outputs is the number of distinct polarity labels in the training split.
# Note: the name `cnn_params` is assigned again in the aspect section with
# different values, so the cell order matters.
cnn_params = {
    "n_filters": 64,
    "kernel_size": 3,
    "n_cnn_layers": 3,
    "dense_layers": [64, 64],
    "embedding_matrix": embedding_matrix,
    "n_outputs": len(Y_TRAIN_sent.unique()),
    "embedding_trainable": True,
}

In [34]:
# Second wrapper for the CNN, constructed with the RNN's label encoder so both
# models share one label <-> id mapping.
cnn_sent = sentiment.dl_glove(vocab, le=global_le)

In [40]:
# Build the CNN from cnn_params and register it as the wrapper's model template.
cnn_sent.set_model_template(get_cnn(**cnn_params))

Using pretrained word embedding
cloning model from template...


In [24]:
# Preprocessing with the CNN wrapper is disabled: the training cell below reuses
# X_train_sent / Y_train_sent / X_dev_sent / Y_dev_sent as left by an earlier cell.
# NOTE(review): confirm this is intentional; the saved output below stems from a
# run in which these lines were still active.
#X_train_sent, Y_train_sent = cnn_sent.preprocess(X_TRAIN_sent.values, Y_TRAIN_sent.values, maxlen=maxlen)
#X_dev_sent, Y_dev_sent = cnn_sent.preprocess(X_DEV_sent.values, Y_DEV_sent.values, maxlen=maxlen)

...Build new LabelEncoder


In [41]:
# Train the CNN. NOTE(review): X_train_sent / X_dev_sent are also assigned by the
# logistic-regression cells and by rnn_sent.preprocess, so this cell only sees
# sequence inputs if the RNN preprocess cell was the last one to run.
history_cnn_sent = cnn_sent.fit(
    X_train_sent, Y_train_sent, X_dev_sent, Y_dev_sent,
    batch_size = 32, epochs = 5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [42]:
# Dev-set report for the CNN; integer class ids are mapped back to label strings
# through the shared encoder's classes_.
y_pred = cnn_sent.predict(X_dev_sent)
utils.get_reports(
    y_true = [cnn_sent.le.classes_[i] for i in Y_dev_sent], 
    y_pred= [cnn_sent.le.classes_[i] for i in y_pred]
) # trainable embed

              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        30
    negative       0.38      0.78      0.51       120
     neutral       0.00      0.00      0.00        71
    positive       0.77      0.70      0.73       296

    accuracy                           0.58       517
   macro avg       0.29      0.37      0.31       517
weighted avg       0.53      0.58      0.54       517



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
# Same evaluation code as the previous cell, kept with the output of a different
# run (see the trailing tag). NOTE(review): the execution counts show this cell
# ran before the one above; only the trailing comment identifies the setup.
y_pred = cnn_sent.predict(X_dev_sent)
utils.get_reports(
    y_true = [cnn_sent.le.classes_[i] for i in Y_dev_sent], 
    y_pred= [cnn_sent.le.classes_[i] for i in y_pred]
) #non-trainable

              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        30
    negative       0.57      0.07      0.12       120
     neutral       0.00      0.00      0.00        71
    positive       0.58      0.99      0.74       296

    accuracy                           0.58       517
   macro avg       0.29      0.26      0.21       517
weighted avg       0.47      0.58      0.45       517



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 1.6) BOW NN

In [11]:
import tensorflow as tf

def get_bowNN(dense_layers):
    """Build an uncompiled feed-forward softmax classifier.

    Parameters
    ----------
    dense_layers : iterable of int
        Width of each hidden ReLU layer, in order.

    Returns
    -------
    tf.keras.models.Sequential
        The hidden layers followed by a softmax layer with one unit per
        distinct label in the notebook-level ``Y_TRAIN_sent``.
    """
    n_classes = len(Y_TRAIN_sent.unique())
    hidden = [tf.keras.layers.Dense(width, activation='relu') for width in dense_layers]
    head = tf.keras.layers.Dense(n_classes, activation='softmax')
    return tf.keras.models.Sequential(hidden + [head])

In [8]:
# Wrap a dense network with two hidden layers (512 and 128 units) in the generic
# sentiment.dl wrapper.
bow_nn = sentiment.dl(get_bowNN([512, 128]))

In [None]:
# NOTE(review): the return value is not assigned and this cell has no execution
# count, so the BOW-NN experiment appears unfinished. Also confirm that maxlen is
# a meaningful argument for bag-of-words input.
bow_nn.preprocess(X_TRAIN_sent.values, Y_TRAIN_sent.values, maxlen=30)

# $\text{2. Aspect}$

In [7]:
# Collapse the long (text, aspectCategory) table into one row per text whose
# single column holds the list of aspect categories recorded for that text.
temp_df = pd.pivot_table(
    df_aspect,
    index='text',
    values='aspectCategory',
    aggfunc=lambda group: list(group),
)

aspects = df_aspect['aspectCategory'].unique()
print(aspects)

# Append one 0/1 indicator column per aspect (multi-hot encoding of the lists).
for a in aspects:
    temp_df[a] = temp_df['aspectCategory'].apply(lambda cats: 1 if a in cats else 0)

temp_df.head()

['service' 'food' 'anecdotes/miscellaneous' 'price' 'ambience']


Unnamed: 0_level_0,aspectCategory,service,food,anecdotes/miscellaneous,price,ambience
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"$160 for 2 filets, 2 sides, an appetizer and drinks.","[food, price]",0,1,0,1,0
$20 for all you can eat sushi cannot be beaten.,[price],0,0,0,1,0
$20 gets you unlimited sushi of a very high quality- I even took a friend here from Japan who said it was one of the best sushi places in the US that he has been to.,"[food, price]",0,1,0,1,0
"($200 for 2 glasses of champagne, not too expensive bottle of wine and 2 after dinner drinks).",[price],0,0,0,1,0
(Always ask the bartender for the SEASONAL beer!!!,[food],0,1,0,0,0


In [8]:
# Split texts (the pivot index) and their multi-hot aspect labels into train/dev.
# The label columns are selected by name from `aspects` rather than with a
# hard-coded `iloc[:, -5:]`, so the split stays correct if the number of aspect
# categories changes; with the current five aspects the selection is identical.
X_TRAIN_asp, X_DEV_asp, Y_TRAIN_asp, Y_DEV_asp = utils.split_data(temp_df.index, temp_df[list(aspects)], stratify=False)

## 2.1) Logistic regression (BOW)

In [9]:
from my_models import aspect

# Multi-label aspect detection with logistic regression on bag-of-words features
# (the cell output shows one prediction pass per aspect column).
logreg_aspect_bow = aspect.LOGREG(feature_mode='BOW')
# preprocess() receives the labels as well but only its single return value (the features) is kept.
X_train_asp = logreg_aspect_bow.preprocess(X_TRAIN_asp, Y_TRAIN_asp)
X_dev_asp = logreg_aspect_bow.preprocess(X_DEV_asp, Y_DEV_asp)

logreg_aspect_bow.fit(X_train_asp, Y_TRAIN_asp)

outputs = logreg_aspect_bow.predict(X_dev_asp)

# Rows 0-4 of the report follow the column order of Y_DEV_asp.
utils.get_reports(
    y_true = Y_DEV_asp.values, 
    y_pred= outputs
)

Creating new models
Creating new BOW vectorizer...
BOW matrix: (2065, 3633)
predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...
              precision    recall  f1-score   support

           0       0.91      0.68      0.78       106
           1       0.88      0.79      0.84       209
           2       0.77      0.74      0.76       191
           3       0.88      0.54      0.67        56
           4       0.91      0.44      0.59        73

   micro avg       0.85      0.70      0.76       635
   macro avg       0.87      0.64      0.73       635
weighted avg       0.86      0.70      0.76       635
 samples avg       0.72      0.70      0.70       635



  _warn_prf(average, modifier, msg_start, len(result))


## 2.2) Logistic regression (TF-IDF)

In [47]:
from my_models import aspect

# Same multi-label pipeline as the BOW variant, but with TF-IDF features.
logreg_aspect_tfidf = aspect.LOGREG(feature_mode='TFIDF')
# X_train_asp / X_dev_asp are overwritten with the TF-IDF matrices here.
X_train_asp = logreg_aspect_tfidf.preprocess(X_TRAIN_asp, Y_TRAIN_asp)
X_dev_asp = logreg_aspect_tfidf.preprocess(X_DEV_asp, Y_DEV_asp)

logreg_aspect_tfidf.fit(X_train_asp, Y_TRAIN_asp)

outputs = logreg_aspect_tfidf.predict(X_dev_asp)

# Rows 0-4 of the report follow the column order of Y_DEV_asp.
utils.get_reports(
    y_true = Y_DEV_asp.values, 
    y_pred= outputs
)

Creating new models
Creating new TFIDF vectorizer...
TFIDF matrix: (2065, 3633)
predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...
              precision    recall  f1-score   support

           0       1.00      0.48      0.65       106
           1       0.91      0.72      0.80       209
           2       0.83      0.59      0.69       191
           3       0.89      0.14      0.25        56
           4       0.89      0.11      0.20        73

   micro avg       0.89      0.52      0.66       635
   macro avg       0.90      0.41      0.52       635
weighted avg       0.90      0.52      0.62       635
 samples avg       0.59      0.55      0.56       635



  _warn_prf(average, modifier, msg_start, len(result))


## 2.3) Bidirectional GRU (GloVe)

In [10]:
# Aspect label columns; one binary model is created per column further down.
Y_TRAIN_asp.columns

Index(['service', 'food', 'anecdotes/miscellaneous', 'price', 'ambience'], dtype='object')

In [19]:
# Hyper-parameters forwarded to get_rnn() for the per-aspect models: a single
# output unit per model and frozen embeddings. This re-assigns the `rnn_params`
# name used earlier for the sentiment RNN.
rnn_params = {
    "rnn_layers": [128, 128],
    "dense_layers": [64, 64],
    "embedding_matrix": embedding_matrix,
    "n_outputs": 1,
    "embedding_trainable": False,
}

In [13]:
from my_models import aspect
# Deep-learning aspect wrapper built on the embedding vocabulary; its models are
# attached in the next cell via set_model_template().
rnn_asp = aspect.dl_glove(vocab)

In [20]:
# Reinstantiate models
# One model per aspect column is requested (n_models), each derived from the same template.
rnn_asp.set_model_template(get_rnn(**rnn_params), n_models = len(Y_TRAIN_asp.columns))

Using pretrained word embedding
cloning model from template...


In [17]:
# Convert raw texts and multi-hot labels into model inputs with the wrapper's
# preprocess(). Use the shared `maxlen` setting defined with the embeddings
# instead of repeating the literal 30, so the sequence length is configured in
# one place.
X_train_asp, Y_train_asp = rnn_asp.preprocess(X_TRAIN_asp, Y_TRAIN_asp, maxlen=maxlen)
X_dev_asp, Y_dev_asp = rnn_asp.preprocess(X_DEV_asp, Y_DEV_asp, maxlen=maxlen)

In [21]:
# Train the per-aspect RNNs; the cell output shows the aspects being fitted one
# after another, 5 epochs each.
histories = rnn_asp.fit(
                X_train_asp, Y_train_asp, X_dev_asp, Y_dev_asp,
                batch_size = 32, epochs = 5
            )

fitting service ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting food ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting anecdotes/miscellaneous ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting price ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting ambience ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Columns of the preprocessed label frame; manual_predict() pairs these with the
# per-aspect models.
Y_train_asp.columns

In [34]:
import tensorflow as tf
def manual_predict(X, threshold=0.5):
    """Predict a 0/1 indicator per aspect with the per-aspect RNN models.

    Parameters
    ----------
    X : array-like
        Input accepted by each model's ``predict`` (presumably the arrays
        returned by ``rnn_asp.preprocess`` -- TODO confirm).
    threshold : float, default 0.5
        Scores strictly greater than this value are labelled 1, all others 0.
        The previous version reset this to 0.5 inside the body, silently
        ignoring the caller's value.

    Returns
    -------
    numpy.ndarray
        Integer array whose columns follow ``Y_train_asp.columns``; with
        single-output models each row corresponds to one input sample.
    """
    per_aspect = []
    # Named `aspect_name` so the loop variable is not confused with the
    # imported `aspect` module.
    for aspect_name, model in zip(Y_train_asp.columns, rnn_asp.models):
        print(f'predicting {aspect_name}...')
        scores = model.predict(X)
        labels = tf.cast(scores > threshold, tf.int32)
        per_aspect.append(labels.numpy().ravel())

    # Stacked as (n_aspects, n); transpose so that aspects become columns.
    return np.transpose(np.array(per_aspect))

In [22]:
# Dev-set report for the per-aspect RNNs, using the wrapper's own predict().
# Rows 0-4 of the report follow the column order of Y_dev_asp.
outputs = rnn_asp.predict(X_dev_asp)
utils.get_reports(
    y_true = Y_dev_asp.values, 
    y_pred= outputs
) #non-trainable 300d

predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...
              precision    recall  f1-score   support

           0       0.80      0.71      0.75       106
           1       0.70      0.80      0.75       209
           2       0.69      0.76      0.72       191
           3       0.75      0.48      0.59        56
           4       0.45      0.32      0.37        73

   micro avg       0.69      0.69      0.69       635
   macro avg       0.68      0.61      0.64       635
weighted avg       0.69      0.69      0.68       635
 samples avg       0.67      0.70      0.66       635



  _warn_prf(average, modifier, msg_start, len(result))


## 2.4) CNN (GloVe)

In [41]:
# Hyper-parameters forwarded to get_cnn() for the per-aspect models: a single
# output unit per model and frozen embeddings. This re-assigns the `cnn_params`
# name used earlier for the sentiment CNN.
cnn_params = {
    "n_filters": 64,
    "kernel_size": 3,
    "n_cnn_layers": 3,
    "dense_layers": [64, 64],
    "embedding_matrix": embedding_matrix,
    "n_outputs": 1,
    "embedding_trainable": False,
}

In [38]:
# Separate aspect wrapper for the CNN models, built on the same vocabulary.
cnn_asp = aspect.dl_glove(vocab)

In [42]:
# Reinstantiate models
# One CNN per aspect column is requested (n_models), each derived from the same template.
cnn_asp.set_model_template(get_cnn(**cnn_params), n_models = len(Y_TRAIN_asp.columns))

Using pretrained word embedding
cloning model from template...


In [43]:
# Train the per-aspect CNNs on the arrays currently bound to X_train_asp etc.
# NOTE(review): cnn_asp.preprocess is never called; these names are assigned by
# the logistic-regression cells and by rnn_asp.preprocess, so run order matters.
# `histories` also replaces the value stored by the RNN training cell.
histories = cnn_asp.fit(
                X_train_asp, Y_train_asp, X_dev_asp, Y_dev_asp,
                batch_size = 32, epochs = 5
            )

fitting service ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting food ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting anecdotes/miscellaneous ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting price ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting ambience ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [45]:
def manual_predict(X, threshold=0.5):
    """Predict a 0/1 indicator per aspect with the per-aspect CNN models.

    Parameters
    ----------
    X : array-like
        Input accepted by each model's ``predict`` (presumably the arrays
        returned by the aspect wrapper's ``preprocess`` -- TODO confirm).
    threshold : float, default 0.5
        Scores strictly greater than this value are labelled 1, all others 0.
        The previous version reset this to 0.5 inside the body, silently
        ignoring the caller's value.

    Returns
    -------
    numpy.ndarray
        Integer array whose columns follow ``Y_train_asp.columns``; with
        single-output models each row corresponds to one input sample.
    """
    per_aspect = []
    # Named `aspect_name` so the loop variable is not confused with the
    # imported `aspect` module.
    for aspect_name, model in zip(Y_train_asp.columns, cnn_asp.models):
        print(f'predicting {aspect_name}...')
        scores = model.predict(X)
        labels = tf.cast(scores > threshold, tf.int32)
        per_aspect.append(labels.numpy().ravel())

    # Stacked as (n_aspects, n); transpose so that aspects become columns.
    return np.transpose(np.array(per_aspect))

# Dev-set report for the per-aspect CNNs, thresholding the model scores with
# manual_predict(). Rows 0-4 of the report follow the column order of Y_dev_asp.
outputs = manual_predict(X_dev_asp)
utils.get_reports(
    y_true = Y_dev_asp.values, 
    y_pred= outputs
) #non-trainable 300d

predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...
              precision    recall  f1-score   support

           0       0.85      0.58      0.69       106
           1       0.87      0.57      0.69       209
           2       0.72      0.67      0.69       191
           3       0.83      0.18      0.29        56
           4       0.44      0.05      0.10        73

   micro avg       0.79      0.51      0.62       635
   macro avg       0.74      0.41      0.49       635
weighted avg       0.77      0.51      0.59       635
 samples avg       0.56      0.53      0.54       635



  _warn_prf(average, modifier, msg_start, len(result))


# $\text{Inference}$

In [68]:
import itertools
from my_models import inference

# Names and model objects are kept in parallel lists (same order) so every
# (sentiment, aspect) combination can be paired with a label for its output file.
sent_names = ['logreg_sent_bow','logreg_sent_tfidf','rnn_sent','cnn_sent']
asp_names = ['logreg_aspect','rnn_asp']
# Materialise the products as lists: a bare itertools.product object is
# exhausted after one pass, so re-running the inference loop would silently do nothing.
model_names = list(itertools.product(sent_names, asp_names))

sent_models = [
    logreg_sent_bow,
    logreg_sent_tfidf,
    rnn_sent,
    cnn_sent
]

asp_models = [
    # `logreg_aspect` is not defined by any cell of this notebook (it only
    # existed as leftover kernel state); the BOW aspect model is used instead.
    # NOTE(review): switch to logreg_aspect_tfidf if that variant was intended.
    logreg_aspect_bow,
    rnn_asp
]
models = list(itertools.product(sent_models, asp_models))

In [69]:
# Re-read the raw training file and keep only the two columns handed to the inferencer.
df_train = pd.read_csv("contest1_train.csv")
df_train_inference = df_train[['id','text']]

# `model` is a (sentiment_model, aspect_model) pair and `name` the matching pair of labels.
# NOTE(review): the saved output shows this loop failing on the first combination
# with a TypeError raised from a preprocess() call ('vocab_size' keyword).
for model, name in zip(models, model_names):
    print(name)
    inferencer = inference.InferenceModel(*model)
    outputs = inferencer.predict(df_train_inference)

    # One CSV per combination; to_csv does not create the target directory, so it must already exist.
    outputs.to_csv(f"resulting_predictions/{name[0]}_{name[1]}_train-set.csv")

('logreg_sent_bow', 'logreg_aspect')


TypeError: MulBinary_logreg.preprocess() got an unexpected keyword argument 'vocab_size'

## TEST set

In [23]:
from my_models import inference

# Pair the BOW logistic-regression sentiment model with the per-aspect RNNs.
inferencer = inference.InferenceModel(logreg_sent_bow, rnn_asp)

# NOTE(review): this section is titled "TEST set" and the frame is named df_test,
# but the file read here is the training CSV -- confirm which file is intended.
df_test = pd.read_csv("contest1_train.csv")

# Only the id and text columns are passed to the inferencer.
df_train_inference = df_test[['id','text']]
outputs = inferencer.predict(df_train_inference)

predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...


In [24]:
# Display the prediction frame returned by the inferencer.
outputs

Unnamed: 0_level_0,aspectCategory,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3121,service,negative
2777,food,negative
2777,food,negative
1634,food,positive
2534,service,positive
...,...,...
1163,service,positive
216,anecdotes/miscellaneous,positive
1109,food,positive
899,service,positive


In [25]:
# Output directory (relative to the working directory; it must already exist).
path_to_save = "resulting_predictions/"
# The file name records the data split and the model pair (BOW sentiment + RNN aspect).
outputs.to_csv(path_to_save + "train-pred-2-bow-rnn.csv")

In [20]:
import tensorflow as tf

# NOTE(review): max_len is not used anywhere in this cell.
max_len = 4  

# Scratch experiment: map raw strings to integer token ids using the embedding vocabulary.
vectorize_layer = tf.keras.layers.TextVectorization(
 output_mode='int',
 vocabulary=vocab)


# Peek at the first vocabulary entries; the output shows '' and '[UNK]' ahead of the words.
vectorize_layer.get_vocabulary()[:10]

['', '[UNK]', 'the', ',', '.', 'of', 'to', 'and', 'in', 'a']

In [21]:
# Sanity check on two sentences of different length; in the output the shorter
# one is right-padded with 0 to the length of the longer one.
vectorize_layer(['Either an array of strings or a string path to a text file', 'f set, the output will have its time dimension padded'])

<tf.Tensor: shape=(2, 13), dtype=int64, numpy=
array([[  902,    31,  6737,     5,  9567,    48,     9,  3760,  2820,
            6,     9,  2831,  2856],
       [ 3882,   210,     2,  2758,    45,    35,    49,    81, 10134,
        27754,     0,     0,     0]], dtype=int64)>