# $\text{Import data}$

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from my_models.inference import get_rnn, get_cnn

df = pd.read_csv("contest1_train.csv")
print(df.shape)
df.head()

(3156, 4)


Unnamed: 0,id,text,aspectCategory,polarity
0,3121,But the staff was so horrible to us.,service,negative
1,2777,"To be completely fair, the only redeeming fact...",food,positive
2,2777,"To be completely fair, the only redeeming fact...",anecdotes/miscellaneous,negative
3,1634,"The food is uniformly exceptional, with a very...",food,positive
4,2534,Where Gabriela personaly greets you and recomm...,service,positive


In [2]:
from my_models import utils

emb_dim = 100
vocab, embedding_matrix = utils.get_embeddings(emb_dim)

maxlen = 30
vocab_size = embedding_matrix.shape[0]

In [3]:
def drop_dups(df, name:str):
    if any(df.duplicated()):
        dups = df[df.duplicated()]
        return df.drop_duplicates()
    
df = drop_dups(df, "Whole data")

from sklearn.model_selection import train_test_split
TRAIN, DEV = train_test_split(df, test_size=0.25, random_state=42)

df_aspect = TRAIN[['id','text', 'aspectCategory']]
df_sentiment = TRAIN[['id','text', 'polarity']]

df_aspect = drop_dups(df_aspect, "aspect")
df_sentiment = drop_dups(df_sentiment, "sentiment")

In [10]:
DEV.to_csv('DEV.csv', index=False)

In [11]:
pd.read_csv("DEV.csv")

Unnamed: 0,id,text,aspectCategory,polarity
0,1315,Amma has the worst value for money I have expe...,price,negative
1,2576,By far the best salad I have had in a fast foo...,food,positive
2,2850,"The food was amazing, the service was so atten...",ambience,positive
3,301,"There was a long wait for a table outside, but...",service,conflict
4,87,"Having hunted around for a quiet, romantic, ye...",anecdotes/miscellaneous,neutral
...,...,...,...,...
784,1082,"So, the menu is written in chalk above your he...",food,positive
785,3243,"Hopefully next time, I will save room for dess...",food,neutral
786,1191,Knowledge of the chef and the waitress are bel...,food,negative
787,1380,Definately check it out!!!,anecdotes/miscellaneous,positive


# $\text{1. Sentiment}$
- Rule-based
- BOW
- TF-IDF
- Bidirectional GRU
- CNN

In [4]:
from my_models import sentiment

# Drop texts that are duplicated
df_sentiment = df_sentiment.drop_duplicates(subset=['text'], keep='first')

X_TRAIN_sent, X_DEV_sent, Y_TRAIN_sent, Y_DEV_sent = TRAIN['text'], DEV['text'], TRAIN['polarity'], DEV['polarity']

## 1.1) Rule-based

In [7]:
y_pred = sentiment.VADER(df_sentiment['text'])
utils.get_reports(y_true = df_sentiment['polarity'], y_pred=y_pred)

              precision    recall  f1-score   support

    conflict       0.50      0.01      0.01       148
    negative       0.79      0.03      0.06       602
     neutral       0.15      0.98      0.26       354
    positive       0.94      0.14      0.24      1478

    accuracy                           0.22      2582
   macro avg       0.60      0.29      0.14      2582
weighted avg       0.77      0.22      0.19      2582



## 1.2) Logistic regression (bow)

In [5]:
from sklearn.linear_model import LogisticRegression

logreg_sent_bow = sentiment.ml(feature_mode="BOW", model=LogisticRegression, max_iter=200)
# preprocess
X_train_sent = logreg_sent_bow.preprocess(X_TRAIN_sent.values)
X_dev_sent = logreg_sent_bow.preprocess(X_DEV_sent.values)

# train
logreg_sent_bow.fit(X_train_sent, Y_TRAIN_sent)

# inference
y_pred = logreg_sent_bow.predict(X_dev_sent)
utils.get_reports(y_true = Y_DEV_sent, y_pred=y_pred)

Creating new BOW vectorizer...
BOW matrix: (2365, 3570)
              precision    recall  f1-score   support

    conflict       0.29      0.13      0.18        47
    negative       0.59      0.48      0.53       178
     neutral       0.45      0.30      0.36       100
    positive       0.75      0.89      0.81       464

    accuracy                           0.68       789
   macro avg       0.52      0.45      0.47       789
weighted avg       0.65      0.68      0.65       789



## 1.3) Logistic regression (tf-idf)

In [6]:
logreg_sent_tfidf = sentiment.ml(feature_mode="TFIDF", model=LogisticRegression, max_iter=200)
# preprocess
X_train_sent = logreg_sent_tfidf.preprocess(X_TRAIN_sent.values)
X_dev_sent = logreg_sent_tfidf.preprocess(X_DEV_sent.values)

# train
logreg_sent_tfidf.fit(X_train_sent, Y_TRAIN_sent)

# inference
y_pred = logreg_sent_tfidf.predict(X_dev_sent)
utils.get_reports(y_true = Y_DEV_sent, y_pred=y_pred)

Creating new TFIDF vectorizer...
TFIDF matrix: (2365, 3570)
              precision    recall  f1-score   support

    conflict       1.00      0.02      0.04        47
    negative       0.65      0.45      0.53       178
     neutral       0.76      0.16      0.26       100
    positive       0.69      0.96      0.81       464

    accuracy                           0.69       789
   macro avg       0.78      0.40      0.41       789
weighted avg       0.71      0.69      0.63       789



## 1.4) Bidirectional GRU (GloVe)

In [7]:
rnn_params = dict(
    rnn_layers=[128], 
    dense_layers=[64], 
    embedding_matrix=embedding_matrix, 
    n_outputs=len(Y_TRAIN_sent.unique()), 
    embedding_trainable=False
)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
compile_info = dict(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics='accuracy')

In [8]:
rnn_sent = sentiment.dl_pretrained(vocab, compile_info)

In [9]:
## Reinstantiate model
rnn_sent.set_model_template(get_rnn(**rnn_params))

Using pretrained word embedding


In [10]:
X_train_sent, Y_train_sent = rnn_sent.preprocess(X_TRAIN_sent.values, Y_TRAIN_sent.values)
X_dev_sent, Y_dev_sent = rnn_sent.preprocess(X_DEV_sent.values, Y_DEV_sent.values)

...Build new LabelEncoder


In [11]:
rnn_sent.le.classes_

array(['conflict', 'negative', 'neutral', 'positive'], dtype=object)

In [12]:
history_rnn_sent = rnn_sent.fit(
    X_train_sent, Y_train_sent, X_dev_sent, Y_dev_sent,
    batch_size = 64, epochs = 6
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [13]:
y_pred = rnn_sent.predict(X_dev_sent)
utils.get_reports(
    y_true = [rnn_sent.le.classes_[i] for i in Y_dev_sent], 
    y_pred= [rnn_sent.le.classes_[i] for i in y_pred]
) # trainable embed

              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        47
    negative       0.35      0.44      0.39       178
     neutral       0.00      0.00      0.00       100
    positive       0.65      0.79      0.71       464

    accuracy                           0.57       789
   macro avg       0.25      0.31      0.28       789
weighted avg       0.46      0.57      0.51       789



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
y_pred = rnn_sent.predict(X_dev_sent)
utils.get_reports(
    y_true = [rnn_sent.le.classes_[i] for i in Y_dev_sent], 
    y_pred= [rnn_sent.le.classes_[i] for i in y_pred]
) #100d

              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        23
    negative       0.70      0.16      0.26        99
     neutral       0.50      0.02      0.03        59
    positive       0.61      0.99      0.75       248

    accuracy                           0.61       429
   macro avg       0.45      0.29      0.26       429
weighted avg       0.58      0.61      0.50       429



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
global_le = rnn_sent.le

## 1.5) CNN (glove)

In [14]:
cnn_params = dict(
    n_filters = 16,
    kernel_size = 4,
    n_cnn_layers = 3,
    dense_layers = [64],
    embedding_matrix = embedding_matrix,
    n_outputs = len(Y_TRAIN_sent.unique()),
    embedding_trainable=False
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
compile_info = dict(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics='accuracy')

In [15]:
cnn_sent = sentiment.dl_pretrained(vocab, compile_info)

In [16]:
cnn_sent.set_model_template(get_cnn(**cnn_params))

Using pretrained word embedding


In [24]:
#X_train_sent, Y_train_sent = cnn_sent.preprocess(X_TRAIN_sent.values, Y_TRAIN_sent.values, maxlen=maxlen)
#X_dev_sent, Y_dev_sent = cnn_sent.preprocess(X_DEV_sent.values, Y_DEV_sent.values, maxlen=maxlen)

...Build new LabelEncoder


In [17]:
history_cnn_sent = cnn_sent.fit(
    X_train_sent, Y_train_sent, X_dev_sent, Y_dev_sent,
    batch_size = 32, epochs = 10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
y_pred = cnn_sent.predict(X_dev_sent)
utils.get_reports(
    y_true = [global_le.classes_[i] for i in Y_dev_sent], 
    y_pred= [global_le.classes_[i] for i in y_pred]
) # trainable embed

              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        47
    negative       0.00      0.00      0.00       178
     neutral       0.00      0.00      0.00       100
    positive       0.59      1.00      0.74       464

    accuracy                           0.59       789
   macro avg       0.15      0.25      0.19       789
weighted avg       0.35      0.59      0.44       789



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
y_pred = cnn_sent.predict(X_dev_sent)
utils.get_reports(
    y_true = [global_le.classes_[i] for i in Y_dev_sent], 
    y_pred= [global_le.classes_[i] for i in y_pred]
) #non-trainable

              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        30
    negative       0.37      0.41      0.39       120
     neutral       0.00      0.00      0.00        71
    positive       0.66      0.85      0.74       296

    accuracy                           0.58       517
   macro avg       0.26      0.31      0.28       517
weighted avg       0.46      0.58      0.51       517



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 1.6) BOW NN

In [21]:
import tensorflow as tf

def get_bowNN(dense_layers):
    model = tf.keras.models.Sequential()
    for units in dense_layers:
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(len(Y_TRAIN_sent.unique()), activation='softmax'))
    
    return model

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
compile_info = dict(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics='accuracy')

In [22]:
bow_nn = sentiment.dl(compile_info, is_bow=True)

In [23]:
bow_nn.set_model_template(get_bowNN([1000,128]))

In [24]:
X_train_sent_bow,Y_train_sent_bow = bow_nn.preprocess(X_TRAIN_sent.values, Y_TRAIN_sent.values, maxtokens=5000)
X_dev_sent_bow,Y_dev_sent_bow = bow_nn.preprocess(X_DEV_sent.values, Y_DEV_sent.values, maxtokens=5000)

...Adapting new Tokenizer
...Build new LabelEncoder


In [25]:
history = bow_nn.fit(
    X_train_sent_bow, Y_train_sent_bow, X_dev_sent_bow, Y_dev_sent_bow,
    batch_size = 32, epochs = 7
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [26]:
y_pred = bow_nn.predict(X_dev_sent_bow)
utils.get_reports(
    y_true = [bow_nn.le.classes_[i] for i in Y_dev_sent_bow], 
    y_pred= [bow_nn.le.classes_[i] for i in y_pred]
) #non-trainable

              precision    recall  f1-score   support

    conflict       0.33      0.15      0.21        47
    negative       0.61      0.46      0.52       178
     neutral       0.39      0.41      0.40       100
    positive       0.76      0.87      0.81       464

    accuracy                           0.68       789
   macro avg       0.52      0.47      0.49       789
weighted avg       0.66      0.68      0.66       789



# $\text{2. Aspect}$

In [27]:
def prep_aspect_df(df_aspect):
    temp_df = pd.pivot_table(
                    df_aspect,
                    index='text',
                    values='aspectCategory',
                    aggfunc=lambda x: list(x)
                )

    aspects = df_aspect.aspectCategory.unique()
    print(aspects)

    for a in aspects:
        temp_df[a] = temp_df.apply(lambda x: 1 if a in x.aspectCategory else 0, axis=1)

    return temp_df

TRAIN_aspect = prep_aspect_df(TRAIN)
DEV_aspect = prep_aspect_df(DEV)

['food' 'price' 'ambience' 'service' 'anecdotes/miscellaneous']
['price' 'food' 'ambience' 'service' 'anecdotes/miscellaneous']


In [28]:
X_TRAIN_asp, X_DEV_asp, Y_TRAIN_asp, Y_DEV_asp = TRAIN_aspect.index, DEV_aspect.index, TRAIN_aspect.iloc[:,-5:], DEV_aspect.iloc[:,-5:]

In [54]:
from sklearn.linear_model import LogisticRegression
from my_models import aspect

logreg_asp_bow = aspect.ml(feature_mode="TFIDF", model=LogisticRegression)
# preprocess
X_train_asp = logreg_asp_bow.preprocess(X_TRAIN_asp, Y_TRAIN_asp)
X_dev_asp = logreg_asp_bow.preprocess(X_DEV_asp, Y_DEV_asp)

logreg_asp_bow.fit(X_train_asp, Y_TRAIN_asp)

outputs, outputs_prob = logreg_asp_bow.predict(X_dev_asp)

def swapCol(true_df, pred_df):    
    return pd.DataFrame(
        {y_true_col: pred_df[y_true_col] for y_true_col in true_df.columns}
    )
outputs = swapCol(Y_DEV_asp, outputs)
    
utils.get_reports(
    y_true = Y_DEV_asp.reset_index(drop=True), 
    y_pred= outputs
)

Creating new TFIDF vectorizer...
TFIDF matrix: (2034, 3570)
Creating new models
predicting food...
predicting price...
predicting ambience...
predicting service...
predicting anecdotes/miscellaneous...
              precision    recall  f1-score   support

           0       0.50      0.10      0.16        71
           1       0.62      0.83      0.71       261
           2       0.54      0.16      0.24        96
           3       0.60      0.43      0.50       134
           4       0.64      0.85      0.73       227

   micro avg       0.62      0.62      0.62       789
   macro avg       0.58      0.47      0.47       789
weighted avg       0.60      0.62      0.57       789
 samples avg       0.63      0.63      0.62       789



## 2.1) Logistic regression (BOW)

In [9]:
from my_models import aspect

logreg_aspect_bow = aspect.LOGREG(feature_mode='BOW')
X_train_asp = logreg_aspect_bow.preprocess(X_TRAIN_asp, Y_TRAIN_asp)
X_dev_asp = logreg_aspect_bow.preprocess(X_DEV_asp, Y_DEV_asp)

logreg_aspect_bow.fit(X_train_asp, Y_TRAIN_asp)

outputs = logreg_aspect_bow.predict(X_dev_asp)

utils.get_reports(
    y_true = Y_DEV_asp.values, 
    y_pred= outputs
)

Creating new models
Creating new BOW vectorizer...
BOW matrix: (2065, 3633)
predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...
              precision    recall  f1-score   support

           0       0.91      0.68      0.78       106
           1       0.88      0.79      0.84       209
           2       0.77      0.74      0.76       191
           3       0.88      0.54      0.67        56
           4       0.91      0.44      0.59        73

   micro avg       0.85      0.70      0.76       635
   macro avg       0.87      0.64      0.73       635
weighted avg       0.86      0.70      0.76       635
 samples avg       0.72      0.70      0.70       635



  _warn_prf(average, modifier, msg_start, len(result))


## 2.2) Logistic regerssion (TF-IDF)

In [47]:
from my_models import aspect

logreg_aspect_tfidf = aspect.LOGREG(feature_mode='TFIDF')
X_train_asp = logreg_aspect_tfidf.preprocess(X_TRAIN_asp, Y_TRAIN_asp)
X_dev_asp = logreg_aspect_tfidf.preprocess(X_DEV_asp, Y_DEV_asp)

logreg_aspect_tfidf.fit(X_train_asp, Y_TRAIN_asp)

outputs = logreg_aspect_tfidf.predict(X_dev_asp)

utils.get_reports(
    y_true = Y_DEV_asp.values, 
    y_pred= outputs
)

Creating new models
Creating new TFIDF vectorizer...
TFIDF matrix: (2065, 3633)
predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...
              precision    recall  f1-score   support

           0       1.00      0.48      0.65       106
           1       0.91      0.72      0.80       209
           2       0.83      0.59      0.69       191
           3       0.89      0.14      0.25        56
           4       0.89      0.11      0.20        73

   micro avg       0.89      0.52      0.66       635
   macro avg       0.90      0.41      0.52       635
weighted avg       0.90      0.52      0.62       635
 samples avg       0.59      0.55      0.56       635



  _warn_prf(average, modifier, msg_start, len(result))


## 2.3) Bidirectional GRU (GloVe)

In [7]:
Y_TRAIN_asp.columns

Index(['service', 'food', 'anecdotes/miscellaneous', 'price', 'ambience'], dtype='object')

In [55]:
rnn_params = dict(
    rnn_layers=[128,128], 
    dense_layers=[64,64], 
    embedding_matrix=embedding_matrix, 
    n_outputs=1, 
    embedding_trainable=False
)
compile_info = dict(optimizer='adam', loss='binary_crossentropy', metrics='accuracy')

In [56]:
from my_models import aspect
rnn_asp = aspect.dl_pretrained(vocab, compile_info, n_models=len(Y_TRAIN_asp.columns))

In [57]:
from my_models.inference import get_rnn
# Reinstantiate models
rnn_asp.set_model_template(get_rnn(**rnn_params))

Using pretrained word embedding


In [58]:
X_train_asp, Y_train_asp = rnn_asp.preprocess(X_TRAIN_asp, Y_TRAIN_asp, maxtokens = embedding_matrix.shape[0], maxlen=30)
X_dev_asp, Y_dev_asp = rnn_asp.preprocess(X_DEV_asp, Y_DEV_asp, maxtokens = embedding_matrix.shape[0], maxlen=30)

In [59]:
histories = rnn_asp.fit(
                X_train_asp, Y_train_asp, X_dev_asp, Y_dev_asp,
                batch_size = 32, epochs = 5
            )

fitting food ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting price ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting ambience ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting service ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting anecdotes/miscellaneous ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [67]:
def predict(X):
        outputs_prob = []
        threshold = 0.5
        for aspect, model in zip(rnn_asp.classes, rnn_asp.models):
            print(f'predicting {aspect}...')
            y_pred_target = model.predict(X)
            #y_pred = tf.cast(y_pred_target > threshold, tf.int32) 
            outputs_prob.append(y_pred_target.ravel())
            
        outputs_prob = np.transpose(np.array(outputs_prob))        
        
        outputs = []
        for row in outputs_prob:
            pred = np.where(row > threshold, 1, 0)
            if np.sum(pred) > 0:
                outputs.append(pred)
            else:
                zeros = np.zeros_like(pred)
                zeros[np.argmax(row)] = 1
                outputs.append(zeros)
        
        outputs_df = pd.DataFrame(np.array(outputs), columns=rnn_asp.classes)
        outputs_prob = pd.DataFrame(np.array(outputs_prob), columns=rnn_asp.classes)

        return outputs_df, outputs_prob

In [75]:
outputs, _ = predict(X_dev_asp)

outputs_df = swapCol(Y_dev_asp, outputs)

utils.get_reports(
    y_true = Y_dev_asp.values, 
    y_pred= outputs_df
) #non-trainable 300d

predicting food...
predicting price...
predicting ambience...
predicting service...
predicting anecdotes/miscellaneous...
              precision    recall  f1-score   support

           0       0.25      0.10      0.14        71
           1       0.61      0.52      0.57       261
           2       0.20      0.23      0.21        96
           3       0.30      0.62      0.41       134
           4       0.71      0.37      0.49       227

   micro avg       0.44      0.42      0.43       789
   macro avg       0.42      0.37      0.36       789
weighted avg       0.51      0.42      0.44       789
 samples avg       0.44      0.43      0.43       789



In [22]:
outputs = rnn_asp.predict(X_dev_asp)

outputs = swapCol(Y_dev_asp, outputs)

utils.get_reports(
    y_true = Y_dev_asp.values, 
    y_pred= outputs
) #non-trainable 300d

predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...
              precision    recall  f1-score   support

           0       0.80      0.71      0.75       106
           1       0.70      0.80      0.75       209
           2       0.69      0.76      0.72       191
           3       0.75      0.48      0.59        56
           4       0.45      0.32      0.37        73

   micro avg       0.69      0.69      0.69       635
   macro avg       0.68      0.61      0.64       635
weighted avg       0.69      0.69      0.68       635
 samples avg       0.67      0.70      0.66       635



  _warn_prf(average, modifier, msg_start, len(result))


## 2.4) CNN (GloVe)

In [41]:
cnn_params = dict(
    n_filters = 64,
    kernel_size = 3,
    n_cnn_layers = 3,
    dense_layers = [64,64],
    embedding_matrix = embedding_matrix,
    n_outputs = 1,
    embedding_trainable=False
)

In [38]:
cnn_asp = aspect.dl_glove(vocab)

In [42]:
# Reinstantiate models
cnn_asp.set_model_template(get_cnn(**cnn_params), n_models = len(Y_TRAIN_asp.columns))

Using pretrained word embedding
cloning model from template...


In [43]:
histories = cnn_asp.fit(
                X_train_asp, Y_train_asp, X_dev_asp, Y_dev_asp,
                batch_size = 32, epochs = 5
            )

fitting service ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting food ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting anecdotes/miscellaneous ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting price ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fitting ambience ...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [45]:
def manual_predict(X, threshold=0.5):
    outputs = []
    threshold = 0.5
    for aspect, model in zip(Y_train_asp.columns, cnn_asp.models):
        print(f'predicting {aspect}...')
        y_pred_target = model.predict(X)
        y_pred = tf.cast(y_pred_target > threshold, tf.int32) 
        outputs.append(y_pred.numpy().ravel())

    outputs = np.transpose(np.array(outputs))
    return outputs

outputs = manual_predict(X_dev_asp)
utils.get_reports(
    y_true = Y_dev_asp.values, 
    y_pred= outputs
) #non-trainable 300d

predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...
              precision    recall  f1-score   support

           0       0.85      0.58      0.69       106
           1       0.87      0.57      0.69       209
           2       0.72      0.67      0.69       191
           3       0.83      0.18      0.29        56
           4       0.44      0.05      0.10        73

   micro avg       0.79      0.51      0.62       635
   macro avg       0.74      0.41      0.49       635
weighted avg       0.77      0.51      0.59       635
 samples avg       0.56      0.53      0.54       635



  _warn_prf(average, modifier, msg_start, len(result))


# $\text{Inference}$

In [68]:
import itertools
from my_models import inference

sent_names = ['logreg_sent_bow','logreg_sent_tfidf','rnn_sent','cnn_sent']
asp_names = ['logreg_aspect','rnn_asp']
model_names = itertools.product(sent_names, asp_names)

sent_models = [
    logreg_sent_bow,
    logreg_sent_tfidf,
    rnn_sent,
    cnn_sent
]

asp_models = [
    logreg_aspect,
    rnn_asp
]
models = itertools.product(sent_models, asp_models)

In [69]:
df_train = pd.read_csv("contest1_train.csv")
df_train_inference = df_train[['id','text']]

for model, name in zip(models, model_names):
    print(name)
    inferencer = inference.InferenceModel(*model)
    outputs = inferencer.predict(df_train_inference)

    outputs.to_csv(f"resulting_predictions/{name[0]}_{name[1]}_train-set.csv")

('logreg_sent_bow', 'logreg_aspect')


TypeError: MulBinary_logreg.preprocess() got an unexpected keyword argument 'vocab_size'

## TRAIN set

In [9]:
from my_models import inference

inferencer = inference.InferenceModel(logreg_sent_bow, logreg_asp_bow)

df_train = pd.read_csv("contest1_train.csv")

df_train_inference = df_train[['id','text']]
outputs = inferencer.predict(df_train_inference)

predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...


In [25]:
path_to_save = "resulting_predictions/"
outputs.to_csv(path_to_save + "train-pred-2-bow-rnn.csv")

In [15]:
y_true = df_train['aspectCategory']+'-'+df_train['polarity']

In [16]:
y_true

0                       service-negative
1                          food-positive
2       anecdotes/miscellaneous-negative
3                          food-positive
4                       service-positive
                      ...               
3151    anecdotes/miscellaneous-positive
3152                    service-positive
3153    anecdotes/miscellaneous-positive
3154                       food-positive
3155                        food-neutral
Length: 3156, dtype: object

In [18]:
y_pred = outputs[1]['aspectCategory']+'-'+outputs[1]['polarity']

In [20]:
outputs[1]
from sklearn.metrics import classification_report

classification_report(y_true=y_true, y_pred=y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [3156, 4261]

In [22]:
outputs[1].to_csv("resulting_predictions/sunday_train_bow_bow.csv")