# $\text{Import data}$

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("contest1_train.csv")
print(df.shape)
df.head()

(3156, 4)


Unnamed: 0,id,text,aspectCategory,polarity
0,3121,But the staff was so horrible to us.,service,negative
1,2777,"To be completely fair, the only redeeming fact...",food,positive
2,2777,"To be completely fair, the only redeeming fact...",anecdotes/miscellaneous,negative
3,1634,"The food is uniformly exceptional, with a very...",food,positive
4,2534,Where Gabriela personaly greets you and recomm...,service,positive


In [3]:
def drop_dups(df, name:str):
    if any(df.duplicated()):
        dups = df[df.duplicated()]

        print(f'{name}\t There are {len(dups)} dups')

        is_drop = input("type 'y' to drop:")
        if is_drop == 'y':
            print("dropping...\n")
            return df.drop_duplicates()
        return df

In [4]:
df = drop_dups(df, "Whole data")

Whole data	 There are 2 dups
type 'y' to drop:y
dropping...



In [5]:
df_aspect = df[['text', 'aspectCategory']]
df_sentiment = df[['text', 'polarity']]

df_aspect = drop_dups(df_aspect, "aspect")
df_sentiment = drop_dups(df_sentiment, "sentiment")

aspect	 There are 2 dups
type 'y' to drop:y
dropping...

sentiment	 There are 427 dups
type 'y' to drop:y
dropping...



# $\text{1. Sentiment}$

In [6]:
from my_models import utils, sentiment

# Drop texts that are duplicated
df_sentiment = df_sentiment.drop_duplicates(subset=['text'], keep='last')

X_TRAIN_sent, X_DEV_sent, Y_TRAIN_sent, Y_DEV_sent = utils.split_data(df_sentiment['text'], df_sentiment['polarity'])

## 1.1) Rule-based

In [7]:
y_pred = sentiment.VADER(df_sentiment['text'])
utils.get_reports(y_true = df_sentiment['polarity'], y_pred=y_pred)

              precision    recall  f1-score   support

    conflict       0.50      0.01      0.01       148
    negative       0.79      0.03      0.06       602
     neutral       0.15      0.98      0.26       354
    positive       0.94      0.14      0.24      1478

    accuracy                           0.22      2582
   macro avg       0.60      0.29      0.14      2582
weighted avg       0.77      0.22      0.19      2582



## 1.2) Logistic regression

In [8]:
logreg = sentiment.LOGREG()

X_train_sent = logreg.preprocess(X_TRAIN_sent.values)
X_dev_sent = logreg.preprocess(X_DEV_sent.values)

logreg.fit(X_train_sent, Y_TRAIN_sent)

Creating new vectorizer...
TF-IDF matrix: (2065, 3589)


In [9]:
y_pred = logreg.predict(X_dev_sent)
utils.get_reports(y_true = Y_DEV_sent, y_pred=y_pred)

              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        30
    negative       0.66      0.42      0.52       120
     neutral       0.91      0.14      0.24        71
    positive       0.67      0.96      0.79       296

    accuracy                           0.67       517
   macro avg       0.56      0.38      0.39       517
weighted avg       0.66      0.67      0.61       517



## 1.3) Bidirectional GRU

In [10]:
from tensorflow.keras.layers import Dense, Embedding, GRU, Bidirectional
import tensorflow as tf

def get_rnn_sentiment(vocab_size, emb_dim, n_rnn_layers, n_dense_layers, n_outputs):
    layers = [ Embedding(input_dim=vocab_size, output_dim=emb_dim, mask_zero=True) ]
    
    for i in range(n_rnn_layers-1):
        layers.append( Bidirectional(GRU(64, dropout=0.5, return_sequences=True)) )
    layers.append( Bidirectional(GRU(64, dropout=0.5, return_sequences=False)) )
        
    for i in range(n_dense_layers):
        layers.append( Dense(64, activation='relu') )
    layers.append( Dense(n_outputs, activation='sigmoid') )   ## activation sofmax??
    
    model = tf.keras.models.Sequential(layers)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics='accuracy')
    
    return model

In [11]:
model = get_rnn_sentiment(
    vocab_size = 5000,
    emb_dim = 64,
    n_rnn_layers = 3,
    n_dense_layers = 1,
    n_outputs = len(Y_TRAIN_sent.unique())
)

rnn = sentiment.RNN(model)

In [12]:
X_train_sent, Y_train_sent = rnn.preprocess(X_TRAIN_sent.values, Y_TRAIN_sent.values, vocab_size = 5000, maxlen=30)
X_dev_sent, Y_dev_sent = rnn.preprocess(X_DEV_sent.values, Y_DEV_sent.values, vocab_size = 5000, maxlen=30)

...Build new Tokenizer
...Build new LabelEncoder


In [13]:
history = rnn.fit(
    X_train_sent, Y_train_sent, X_dev_sent, Y_dev_sent,
    batch_size = 32, epochs = 5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
y_pred = rnn.predict(X_dev_sent)
utils.get_reports(
    y_true = [rnn.le.classes_[i] for i in Y_dev_sent], 
    y_pred= [rnn.le.classes_[i] for i in y_pred]
)

              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        30
    negative       0.44      0.72      0.54       120
     neutral       0.00      0.00      0.00        71
    positive       0.76      0.80      0.78       296

    accuracy                           0.62       517
   macro avg       0.30      0.38      0.33       517
weighted avg       0.54      0.62      0.57       517



# $\text{2. Aspect}$

In [15]:
temp_df = pd.pivot_table(
                df_aspect,
                index='text',
                values='aspectCategory',
                aggfunc=lambda x: list(x)
            )

aspects = df_aspect.aspectCategory.unique()
print(aspects)

for a in aspects:
    temp_df[a] = temp_df.apply(lambda x: 1 if a in x.aspectCategory else 0, axis=1)
    
temp_df.head()

['service' 'food' 'anecdotes/miscellaneous' 'price' 'ambience']


Unnamed: 0_level_0,aspectCategory,service,food,anecdotes/miscellaneous,price,ambience
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"$160 for 2 filets, 2 sides, an appetizer and drinks.","[food, price]",0,1,0,1,0
$20 for all you can eat sushi cannot be beaten.,[price],0,0,0,1,0
$20 gets you unlimited sushi of a very high quality- I even took a friend here from Japan who said it was one of the best sushi places in the US that he has been to.,"[food, price]",0,1,0,1,0
"($200 for 2 glasses of champagne, not too expensive bottle of wine and 2 after dinner drinks).",[price],0,0,0,1,0
(Always ask the bartender for the SEASONAL beer!!!,[food],0,1,0,0,0


In [16]:
X_TRAIN_asp, X_DEV_asp, Y_TRAIN_asp, Y_DEV_asp = utils.split_data(temp_df.index, temp_df.iloc[:, -5:], stratify=False)

## 2.1) Logistic regression

In [17]:
from my_models import aspect

logreg_aspect = aspect.MulBinary_logreg()
X_train_asp = logreg_aspect.preprocess(X_TRAIN_asp, Y_TRAIN_asp)
X_dev_asp = logreg_aspect.preprocess(X_DEV_asp, Y_DEV_asp)

Creating new models
Creating new vectorizer...
TF-IDF matrix: (2065, 3633)


In [18]:
logreg_aspect.fit(X_train_asp, Y_TRAIN_asp)

In [19]:
outputs = logreg_aspect.predict(X_dev_asp)
outputs

predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...


array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0]])

In [20]:
utils.get_reports(
    y_true = Y_DEV_asp.values, 
    y_pred= outputs
)

              precision    recall  f1-score   support

           0       1.00      0.48      0.65       106
           1       0.91      0.72      0.80       209
           2       0.83      0.59      0.69       191
           3       0.89      0.14      0.25        56
           4       0.89      0.11      0.20        73

   micro avg       0.89      0.52      0.66       635
   macro avg       0.90      0.41      0.52       635
weighted avg       0.90      0.52      0.62       635
 samples avg       0.59      0.55      0.56       635



  _warn_prf(average, modifier, msg_start, len(result))


## 2.2) Bidirectional GRU

In [21]:
def get_rnn_asp(vocab_size, emb_dim, n_rnn_layers, n_dense_layers):
    layers = [ Embedding(input_dim=vocab_size, output_dim=emb_dim, mask_zero=True) ]
    
    for i in range(n_rnn_layers-1):
        layers.append( Bidirectional(GRU(64, dropout=0.5, return_sequences=True)) )
    layers.append( Bidirectional(GRU(64, dropout=0.5, return_sequences=False)) )
        
    for i in range(n_dense_layers):
        layers.append( Dense(64, activation='relu') )
    layers.append( Dense(1, activation='sigmoid') )
    
    model = tf.keras.models.Sequential(layers)
    model.compile()
    
    return model

In [22]:
compiled_template_model = get_rnn_asp(
    vocab_size = 5000,
    emb_dim = 64,
    n_rnn_layers = 3,
    n_dense_layers = 1
)
multi_binary = aspect.MulBinary_rnn(compiled_template_model)

In [23]:
X_train_asp, Y_train_asp = multi_binary.preprocess(X_TRAIN_asp, Y_TRAIN_asp, vocab_size = 5000, maxlen=30)
X_dev_asp, Y_dev_asp = multi_binary.preprocess(X_DEV_asp, Y_DEV_asp, vocab_size = 5000, maxlen=30)

...Build new Tokenizer
cloning model from template...


In [24]:
histories = multi_binary.fit(
                X_train_asp, Y_train_asp, X_dev_asp, Y_dev_asp,
                batch_size = 32, epochs = 3
            )

fitting service ...

Epoch 1/3
Epoch 2/3
Epoch 3/3
fitting food ...

Epoch 1/3
Epoch 2/3
Epoch 3/3
fitting anecdotes/miscellaneous ...

Epoch 1/3
Epoch 2/3
Epoch 3/3
fitting price ...

Epoch 1/3
Epoch 2/3
Epoch 3/3
fitting ambience ...

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [25]:
outputs = multi_binary.predict(X_dev_asp)
utils.get_reports(
    y_true = Y_dev_asp.values, 
    y_pred= outputs
)

predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...
              precision    recall  f1-score   support

           0       0.69      0.79      0.74       106
           1       0.83      0.83      0.83       209
           2       0.83      0.67      0.74       191
           3       0.75      0.68      0.71        56
           4       0.60      0.64      0.62        73

   micro avg       0.77      0.74      0.75       635
   macro avg       0.74      0.72      0.73       635
weighted avg       0.77      0.74      0.75       635
 samples avg       0.73      0.75      0.72       635



  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
from my_models import inference
inferencer = inference.InferenceModel(logreg, multi_binary)

df_test = pd.read_csv("contest1_test.csv")

df_train_inference = df_test[['id','text']]
outputs = inferencer.predict(df_train_inference)

predicting service...
predicting food...
predicting anecdotes/miscellaneous...
predicting price...
predicting ambience...


In [31]:
outputs

Unnamed: 0_level_0,aspectCategory,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1
899,service,positive
899,food,positive
1349,anecdotes/miscellaneous,positive
1349,ambience,positive
934,food,positive
...,...,...
1063,anecdotes/miscellaneous,positive
777,food,positive
875,anecdotes/miscellaneous,positive
671,food,positive


In [32]:
outputs.to_csv("test-pred-1.csv")