<p align="center"><img width="50%" src="https://aimodelsharecontent.s3.amazonaws.com/aimodshare_banner.jpg" /></p>


In [None]:
#install aimodelshare library
! pip install aimodelshare==0.0.189

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting protobuf==3.19.6
  Using cached protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
Collecting tensorflow==2.9.2
  Using cached tensorflow-2.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.8 MB)
Collecting flatbuffers<2,>=1.12
  Using cached flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting tensorboard<2.10,>=2.9
  Using cached tensorboard-2.9.1-py3-none-any.whl (5.8 MB)
Collecting tensorflow-estimator<2.10.0,>=2.9.0rc0
  Using cached tensorflow_estimator-2.9.0-py2.py3-none-any.whl (438 kB)
Collecting keras<2.10.0,>=2.9.0rc0
  Using cached keras-2.9.0-py2.py3-none-any.whl (1.6 MB)
Collecting tensorboard-data-server<0.7.0,>=0.6.0
  Using cached tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl (4.9 MB)
Collecting google-auth-oauthlib<0.5,>=0.4.1
  Using cached google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18

## Loading and Preprocessing Data

In [None]:
# Get competition data
# Fetches the SST-2 competition files from the public ECR repository; the
# read_csv calls further down load them from ./sst2_competition_data/.
from aimodelshare import download_data
download_data('public.ecr.aws/y2e2a1d6/sst2_competition_data-repository:latest') 


Data downloaded successfully.


In [None]:
import pandas as pd
import numpy as np
import warnings
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Silence every warning category for the rest of the session. This keeps the
# training logs short but also hides deprecation notices.
warnings.simplefilter(action='ignore', category=Warning)

# Each file holds a single column. `read_csv(..., squeeze=True)` was deprecated
# in pandas 1.4 and removed in 2.0, so squeeze the one-column frame explicitly
# to obtain a Series.
X_train = pd.read_csv("sst2_competition_data/X_train.csv").squeeze("columns")
X_test = pd.read_csv("sst2_competition_data/X_test.csv").squeeze("columns")
y_train_labels = pd.read_csv("sst2_competition_data/y_train_labels.csv").squeeze("columns")

In [None]:
# Encode the string sentiment labels as integers (Negative -> 0, Positive -> 1);
# these are the targets passed to train_test_split below.
y_train_binary = y_train_labels.map({'Negative': 0, 'Positive': 1})


In [None]:
# Preview the first few raw training sentences.
X_train.head()

0    The Rock is destined to be the 21st Century 's...
1    The gorgeously elaborate continuation of `` Th...
2    Singer/composer Bryan Adams contributes a slew...
3                 Yet the act is still charming here .
4    Whether or not you 're enlightened by any of D...
Name: text, dtype: object

Here I made 2 changes to the tokenizer:

1. I added an `oov_token` so that words missing from the vocabulary are mapped to a dedicated out-of-vocabulary token.
2. I increased the maximum sequence length from 40 to 64. The longest sequence in the dataset has 52 tokens, so a limit of 40 would cut some sequences off, which would definitely harm performance.

In [None]:
# Tokenize the text data
# The vocabulary is capped at the `max_features` most frequent words of the
# training text; any other word is encoded as the explicit '<OOV>' token.
max_features = 10000
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Converting the text to padded sequences is done by `preprocessor` (next cell).
# Its length of 64 is sufficient because the maximum length in the
# training/test set is 52.

In [None]:
def preprocessor(data, maxlen=64, max_words=10000):
    """Turn raw sentences into fixed-length integer sequences for the models.

    Relies on the notebook-level ``tokenizer`` (already fitted on the training
    text) and on ``pad_sequences``.

    Args:
        data: Iterable of text strings to encode.
        maxlen: Length every sequence is padded or truncated to.
        max_words: Unused. Kept so callers that pass it keep working; the
            vocabulary cap is configured on ``tokenizer`` itself.

    Returns:
        2-D array with one row of ``maxlen`` token ids per input text.
    """
    sequences = tokenizer.texts_to_sequences(data)
    return pad_sequences(sequences, maxlen=maxlen)

In [None]:
# Encode both datasets with the same fitted tokenizer and padding length.
X_train_pad = preprocessor(X_train)
X_test_pad = preprocessor(X_test)

In [None]:
# Sanity check: each array should be (n_samples, 64).
X_train_pad.shape, X_test_pad.shape

((6920, 64), (1821, 64))

In [None]:
# Hold out 20% of the padded training data for validation; the fixed seed keeps
# the split reproducible.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_pad,
    y_train_binary,
    test_size=0.2,
    random_state=42,
)

## Model 1: Embedding + LSTM

The first model I implemented is an embedding layer followed by two LSTM layers, as instructed in the requirements. The model structure is self-explanatory from the code below. I used the Adam optimizer and enabled early stopping and best-model checkpointing during training.

This model reached 68.61% accuracy and ranked 42nd on the leaderboard.

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint


In [None]:
embedding_dim = 128
lstm_units = 64

# Embedding -> two stacked LSTMs -> single sigmoid unit for binary sentiment.
model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length=64))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(lstm_units, return_sequences=True))
model.add(LSTM(lstm_units))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Stop once val_loss has not improved for 5 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=5)

# Persist only the weights with the lowest validation loss seen so far.
checkpoint_filepath = 'best_model.h5'
model_checkpoint = ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1,
)

history = model.fit(
    X_train_split,
    y_train_split,
    epochs=50,
    batch_size=64,
    validation_data=(X_val_split, y_val_split),
    callbacks=[early_stop, model_checkpoint],
)

Epoch 1/50
Epoch 1: val_loss improved from inf to 0.47022, saving model to best_model.h5
Epoch 2/50
Epoch 2: val_loss did not improve from 0.47022
Epoch 3/50
Epoch 3: val_loss did not improve from 0.47022
Epoch 4/50
Epoch 4: val_loss did not improve from 0.47022
Epoch 5/50
Epoch 5: val_loss did not improve from 0.47022
Epoch 6/50
Epoch 6: val_loss did not improve from 0.47022


In [None]:
from tensorflow.keras.models import load_model

# Reload the checkpoint written by ModelCheckpoint (lowest val_loss during
# training) instead of using the weights from the final epoch.
best_model = load_model('best_model.h5')

In [None]:
import aimodelshare as ai
# Package the `preprocessor` function for submission (the run reports it was
# saved as 'preprocessor.zip').
# NOTE(review): the second argument is presumably the output directory, with ""
# meaning the current directory — confirm against the aimodelshare docs.
ai.export_preprocessor(preprocessor,"") 

Your preprocessor is now saved to 'preprocessor.zip'


In [None]:
# Save keras model to local ONNX file
from aimodelshare.aimsonnx import model_to_onnx

# Convert the reloaded best checkpoint; the two flags are passed through to the
# converter as descriptive settings for this model.
onnx_model = model_to_onnx(
    best_model,
    framework='keras',
    transfer_learning=False,
    deep_learning=True,
)

# Write the serialized ONNX bytes to disk for the submission step.
with open("LSTM_model.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_model.SerializeToString())

In [None]:
from aimodelshare.aws import set_credentials
    
apiurl="https://rlxjxnoql9.execute-api.us-east-1.amazonaws.com/prod/m" #This is the unique rest api that powers this specific Playground

# Prompts interactively for the AI Model Share username and password, so no
# credentials are stored in the notebook.
set_credentials(apiurl=apiurl)

AI Modelshare Username:··········
AI Modelshare Password:··········
AI Model Share login credentials set successfully.


In [None]:
import aimodelshare as ai
# Handle to the competition behind `apiurl`; used below to submit models and to
# read the leaderboard.
mycompetition= ai.Competition(apiurl)

In [None]:
# Submit Model 1:

# Generate predicted y values (Model 1): one sigmoid probability per test row.
y_test_pred = best_model.predict(X_test_pad)

# Convert predicted probabilities to binary labels (0 for 'Negative', 1 for 'Positive').
# Use the builtin `int` (the `np.int` alias was deprecated in NumPy 1.20 and
# removed in 1.24) and flatten the (n, 1) column so the loop below compares
# plain scalars rather than 1-element arrays.
y_test_pred_labels = (y_test_pred > 0.5).astype(int).ravel()

# Map binary labels back to the original string labels
prediction_labels = ['Negative' if label == 0 else 'Positive' for label in y_test_pred_labels]

# Submit Model 1 to Competition Leaderboard
mycompetition.submit_model(model_filepath="LSTM_model.onnx",
                           preprocessor_filepath="preprocessor.zip",
                           prediction_submission=prediction_labels)

Insert search tags to help users find your model (optional): lstm-rian
Provide any useful notes about your model (optional): first submission

Your model has been submitted as model version 74

To submit code used to create this model or to view current leaderboard navigate to Model Playground: 

 https://www.modelshare.org/detail/model:2763


This model ranked 42nd.

In [None]:
 # Get leaderboard to explore current best model architectures

# Get raw data in pandas data frame
data = mycompetition.get_leaderboard()

# Stylize leaderboard data
mycompetition.stylize_leaderboard(data)

Unnamed: 0,accuracy,f1_score,precision,recall,ml_framework,transfer_learning,deep_learning,model_type,depth,num_params,embedding_layers,conv1d_layers,maxpooling1d_layers,dropout_layers,flatten_layers,lstm_layers,inputlayer_layers,bidirectional_layers,globalmaxpooling1d_layers,globalaveragepooling1d_layers,dense_layers,sigmoid_act,softmax_act,tanh_act,relu_act,loss,optimizer,memory_size,team,username,version
0,81.78%,81.78%,81.79%,81.78%,keras,,True,Sequential,6.0,963856.0,1.0,1.0,1.0,,1.0,,,,,,2.0,,1.0,,2.0,str,Adam,3856424.0,,francesyang,66
1,81.78%,81.78%,81.78%,81.78%,keras,,True,Sequential,3.0,2168577.0,1.0,,,,,1.0,,,,,1.0,1.0,,1.0,,function,Adam,8675184.0,,rian,76
2,80.90%,80.89%,80.96%,80.90%,keras,,True,Sequential,4.0,640130.0,1.0,,,1.0,,,,,,1.0,1.0,,1.0,,,str,RMSprop,3111632.0,,chachagsedaro,26
3,80.57%,80.35%,82.02%,80.58%,keras,,True,Sequential,4.0,201154.0,1.0,,,,1.0,,,,,,2.0,,2.0,,,str,RMSprop,805360.0,,1jiahe,46
4,80.57%,80.50%,81.06%,80.58%,keras,,True,Sequential,4.0,327266.0,1.0,1.0,,,,,,,1.0,,1.0,,1.0,,1.0,str,RMSprop,1309824.0,,mymstella,71
5,79.80%,79.63%,80.85%,79.81%,keras,,True,Sequential,5.0,193702.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,776112.0,,amsay99,43
6,80.13%,80.13%,80.16%,80.13%,keras,,True,Sequential,4.0,640130.0,1.0,,,1.0,,,,,,1.0,1.0,,1.0,,,str,RMSprop,3111632.0,,chachagsedaro,25
7,79.58%,79.46%,80.34%,79.59%,keras,,True,Sequential,4.0,206850.0,1.0,,,,1.0,1.0,,,,,1.0,,1.0,1.0,,str,RMSprop,828272.0,,jer2240,51
8,79.25%,79.06%,80.41%,79.26%,keras,,True,Sequential,5.0,174658.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,699936.0,7.0,lprockop,55
9,79.25%,79.21%,79.52%,79.26%,keras,,True,Sequential,5.0,287402.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,1150912.0,,amsay99,60


## Model 2: Embedding + CNN

This model is an embedding layer followed by a 1D CNN layer and a global max pooling layer. It is also required in the instructions. Other model specs are the same as the above LSTM model. This model got a 72.89% accuracy, ranked 35th on the leaderboard at the time when I submitted it.

In [None]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

In [None]:
embedding_dim = 256
num_filters = 64
kernel_size = 3

# Embedding -> 1D convolution -> global max pooling -> small dense head.
cnn_model = Sequential()
cnn_model.add(Embedding(max_features, embedding_dim, input_length=64))
cnn_model.add(Conv1D(num_filters, kernel_size, activation='relu'))
# Keep the strongest activation of each filter across the whole sequence.
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Persist only the weights with the lowest validation loss seen so far.
cnn_checkpoint_filepath = 'cnn_best_model.h5'
cnn_model_checkpoint = ModelCheckpoint(cnn_checkpoint_filepath, monitor='val_loss', save_best_only=True, mode='min', verbose=1)

# Same early-stopping rule as Model 1, so training does not keep running for
# many epochs after val_loss has stopped improving. The checkpoint above still
# keeps the best weights.
cnn_early_stop = EarlyStopping(monitor='val_loss', patience=5)

cnn_history = cnn_model.fit(
    X_train_split,
    y_train_split,
    epochs=30,
    batch_size=64,
    validation_data=(X_val_split, y_val_split),
    callbacks=[cnn_early_stop, cnn_model_checkpoint],
)


Epoch 1/30
Epoch 1: val_loss improved from inf to 0.49190, saving model to cnn_best_model.h5
Epoch 2/30
Epoch 2: val_loss improved from 0.49190 to 0.44206, saving model to cnn_best_model.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 0.44206
Epoch 4/30
Epoch 4: val_loss did not improve from 0.44206
Epoch 5/30
Epoch 5: val_loss did not improve from 0.44206
Epoch 6/30
Epoch 6: val_loss did not improve from 0.44206
Epoch 7/30
Epoch 7: val_loss did not improve from 0.44206
Epoch 8/30
Epoch 8: val_loss did not improve from 0.44206
Epoch 9/30
Epoch 9: val_loss did not improve from 0.44206
Epoch 10/30
Epoch 10: val_loss did not improve from 0.44206
Epoch 11/30
Epoch 11: val_loss did not improve from 0.44206
Epoch 12/30
Epoch 12: val_loss did not improve from 0.44206
Epoch 13/30
Epoch 13: val_loss did not improve from 0.44206
Epoch 14/30
Epoch 14: val_loss did not improve from 0.44206
Epoch 15/30
Epoch 15: val_loss did not improve from 0.44206
Epoch 16/30
Epoch 16: val_loss did not impro

In [None]:
from tensorflow.keras.models import load_model

# Reload the CNN checkpoint with the lowest val_loss. Note that this rebinds
# `best_model`, which previously held the LSTM model.
best_model = load_model('cnn_best_model.h5')

In [None]:
# Save keras model to local ONNX file
from aimodelshare.aimsonnx import model_to_onnx

# Convert the reloaded best CNN checkpoint; the two flags are passed through to
# the converter as descriptive settings for this model.
onnx_model = model_to_onnx(
    best_model,
    framework='keras',
    transfer_learning=False,
    deep_learning=True,
)

# Write the serialized ONNX bytes to disk for the submission step.
with open("CNN_model.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_model.SerializeToString())

In [None]:
# Submit Model 2:

# Generate predicted y values (Model 2): one sigmoid probability per test row.
y_test_pred = best_model.predict(X_test_pad)

# Convert predicted probabilities to binary labels (0 for 'Negative', 1 for 'Positive').
# Use the builtin `int` (the `np.int` alias was deprecated in NumPy 1.20 and
# removed in 1.24) and flatten the (n, 1) column so the loop below compares
# plain scalars rather than 1-element arrays.
y_test_pred_labels = (y_test_pred > 0.5).astype(int).ravel()

# Map binary labels back to the original string labels
prediction_labels = ['Negative' if label == 0 else 'Positive' for label in y_test_pred_labels]

# Submit Model 2 to Competition Leaderboard
mycompetition.submit_model(model_filepath="CNN_model.onnx",
                           preprocessor_filepath="preprocessor.zip",
                           prediction_submission=prediction_labels)

Insert search tags to help users find your model (optional): c n n
Provide any useful notes about your model (optional): second cnn

Your model has been submitted as model version 77

To submit code used to create this model or to view current leaderboard navigate to Model Playground: 

 https://www.modelshare.org/detail/model:2763


In [None]:
# This model ranks 35th.

In [None]:
 # Get leaderboard to explore current best model architectures

# Get raw data in pandas data frame
data = mycompetition.get_leaderboard()

# Stylize leaderboard data
mycompetition.stylize_leaderboard(data)

Unnamed: 0,accuracy,f1_score,precision,recall,ml_framework,transfer_learning,deep_learning,model_type,depth,num_params,embedding_layers,conv1d_layers,maxpooling1d_layers,dropout_layers,flatten_layers,lstm_layers,inputlayer_layers,bidirectional_layers,globalmaxpooling1d_layers,globalaveragepooling1d_layers,dense_layers,sigmoid_act,softmax_act,tanh_act,relu_act,loss,optimizer,memory_size,team,username,version
0,81.78%,81.78%,81.79%,81.78%,keras,,True,Sequential,6.0,963856.0,1.0,1.0,1.0,,1.0,,,,,,2.0,,1.0,,2.0,str,Adam,3856424.0,,francesyang,66
1,81.78%,81.78%,81.78%,81.78%,keras,,True,Sequential,3.0,2168577.0,1.0,,,,,1.0,,,,,1.0,1.0,,1.0,,function,Adam,8675184.0,,rian,76
2,80.90%,80.89%,80.96%,80.90%,keras,,True,Sequential,4.0,640130.0,1.0,,,1.0,,,,,,1.0,1.0,,1.0,,,str,RMSprop,3111632.0,,chachagsedaro,26
3,80.57%,80.35%,82.02%,80.58%,keras,,True,Sequential,4.0,201154.0,1.0,,,,1.0,,,,,,2.0,,2.0,,,str,RMSprop,805360.0,,1jiahe,46
4,80.57%,80.50%,81.06%,80.58%,keras,,True,Sequential,4.0,327266.0,1.0,1.0,,,,,,,1.0,,1.0,,1.0,,1.0,str,RMSprop,1309824.0,,mymstella,71
5,79.80%,79.63%,80.85%,79.81%,keras,,True,Sequential,5.0,193702.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,776112.0,,amsay99,43
6,80.13%,80.13%,80.16%,80.13%,keras,,True,Sequential,4.0,640130.0,1.0,,,1.0,,,,,,1.0,1.0,,1.0,,,str,RMSprop,3111632.0,,chachagsedaro,25
7,79.58%,79.46%,80.34%,79.59%,keras,,True,Sequential,4.0,206850.0,1.0,,,,1.0,1.0,,,,,1.0,,1.0,1.0,,str,RMSprop,828272.0,,jer2240,51
8,79.25%,79.06%,80.41%,79.26%,keras,,True,Sequential,5.0,174658.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,699936.0,7.0,lprockop,55
9,79.25%,79.21%,79.52%,79.26%,keras,,True,Sequential,5.0,287402.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,1150912.0,,amsay99,60


## Model 3: Transfer Learning with GloVe

For this model, we still use an embedding layer followed by an LSTM layer, but this time the embedding layer is initialized with pretrained GloVe word vectors. Specifically, we use the 200-dimensional pretrained vectors. This model uses transfer learning with GloVe as instructed.

This model **shared first place** at the time I submitted it, achieving 81.78% accuracy on the test set, though I later beat it with my fourth model (see below).


In [None]:
# Download the GloVe 6B archive (about 822 MB according to the transfer log);
# -L follows the redirect issued by the hosting site.
!curl -L -o glove.6B.zip https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1048  100  1048    0     0  15411      0 --:--:-- --:--:-- --:--:-- 15411
100  822M  100  822M    0     0  59.3M      0  0:00:13  0:00:13 --:--:-- 58.1M


In [None]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       
  inflating: glove.6B.50d.txt        


In [None]:
glove_path = 'glove.6B.200d.txt'

# Each line of the GloVe file is "<word> <v1> <v2> ...": the first field is the
# word and the remaining fields are its vector components.
embeddings_index = {}
with open(glove_path, encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')


In [None]:
embedding_dim = 200

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows stay all-zero for indices that have no GloVe vector.
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        # Index falls outside the matrix (and the Embedding layer's input range).
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Inspect the matrix: all-zero rows are indices for which no GloVe vector was
# found.
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.071549  ,  0.093459  ,  0.023738  , ...,  0.33616999,
         0.030591  ,  0.25577   ],
       ...,
       [-0.088186  ,  0.27678001,  0.031382  , ...,  0.085006  ,
         0.20454   , -0.74707001],
       [ 0.42379001,  0.19716001,  0.012409  , ...,  0.15421   ,
        -0.33065999,  0.98264003],
       [ 0.11481   , -0.1336    ,  0.47916001, ..., -0.47672001,
        -0.33217999,  0.56616002]])

In [None]:
from tensorflow.keras.layers import LSTM

# Frozen GloVe embeddings -> single LSTM -> sigmoid unit.
glove_model = Sequential()
glove_model.add(
    Embedding(
        max_features,
        embedding_dim,
        input_length=64,
        weights=[embedding_matrix],  # start from the pretrained vectors
        trainable=False,  # keep the pretrained vectors fixed during training
    )
)
glove_model.add(LSTM(128))
glove_model.add(Dense(1, activation='sigmoid'))

glove_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Persist only the weights with the lowest validation loss seen so far.
glove_checkpoint_filepath = 'glove_best_model.h5'
glove_model_checkpoint = ModelCheckpoint(
    glove_checkpoint_filepath,
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1,
)

glove_history = glove_model.fit(
    X_train_split,
    y_train_split,
    epochs=30,
    batch_size=64,
    validation_data=(X_val_split, y_val_split),
    callbacks=[glove_model_checkpoint],
)


Epoch 1/30
Epoch 1: val_loss improved from inf to 0.50988, saving model to glove_best_model.h5
Epoch 2/30
Epoch 2: val_loss improved from 0.50988 to 0.46542, saving model to glove_best_model.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 0.46542
Epoch 4/30
Epoch 4: val_loss did not improve from 0.46542
Epoch 5/30
Epoch 5: val_loss improved from 0.46542 to 0.42982, saving model to glove_best_model.h5
Epoch 6/30
Epoch 6: val_loss did not improve from 0.42982
Epoch 7/30
Epoch 7: val_loss did not improve from 0.42982
Epoch 8/30
Epoch 8: val_loss did not improve from 0.42982
Epoch 9/30
Epoch 9: val_loss did not improve from 0.42982
Epoch 10/30
Epoch 10: val_loss did not improve from 0.42982
Epoch 11/30
Epoch 11: val_loss did not improve from 0.42982
Epoch 12/30
Epoch 12: val_loss did not improve from 0.42982
Epoch 13/30
Epoch 13: val_loss did not improve from 0.42982
Epoch 14/30
Epoch 14: val_loss did not improve from 0.42982
Epoch 15/30
Epoch 15: val_loss did not improve from 0.42982

In [None]:
from tensorflow.keras.models import load_model

# Reload the GloVe-LSTM checkpoint with the lowest val_loss. Note that this
# rebinds `best_model`, which previously held the CNN model.
best_model = load_model('glove_best_model.h5')

In [None]:
# Peek at the validation targets (0/1 integers).
y_val_split

468     1
1956    1
800     1
6474    0
2389    1
       ..
1586    1
4272    0
2283    1
1477    1
5154    0
Name: label, Length: 1384, dtype: int64

In [None]:
# Predicted probability of the positive class for every validation row.
# (The former first line, a bare shape tuple, was never displayed and had no
# effect, so it has been dropped.)
y_val_pred = best_model.predict(X_val_split)
y_val_pred.shape



(1384, 1)

In [None]:
# Convert predicted probabilities to binary labels (0 for 'Negative', 1 for 'Positive').
# Use the builtin `int` (the `np.int` alias was deprecated in NumPy 1.20 and
# removed in 1.24) and flatten the (n, 1) column to match the 1-D targets.
y_val_pred_labels = (y_val_pred > 0.5).astype(int).ravel()

# Calculate the accuracy on the validation set
from sklearn.metrics import accuracy_score

val_accuracy_glove = accuracy_score(y_val_split, y_val_pred_labels)
print(f"Validation accuracy (GloVe): {val_accuracy_glove}")


Validation accuracy (GloVe): 0.8063583815028902


In [None]:
# Save keras model to local ONNX file
from aimodelshare.aimsonnx import model_to_onnx

# This model starts from pretrained GloVe embeddings, so it is flagged as
# transfer learning (the flag was previously False, which misdescribed it).
onnx_model = model_to_onnx(best_model, framework='keras',
                          transfer_learning=True,
                          deep_learning=True)

# Write the serialized ONNX bytes to disk for the submission step.
with open("glove_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [None]:
# Submit Model 3:

# Generate predicted y values (Model 3): one sigmoid probability per test row.
y_test_pred = best_model.predict(X_test_pad)

# Convert predicted probabilities to binary labels (0 for 'Negative', 1 for 'Positive').
# Use the builtin `int` (the `np.int` alias was deprecated in NumPy 1.20 and
# removed in 1.24) and flatten the (n, 1) column so the loop below compares
# plain scalars rather than 1-element arrays.
y_test_pred_labels = (y_test_pred > 0.5).astype(int).ravel()

# Map binary labels back to the original string labels
prediction_labels = ['Negative' if label == 0 else 'Positive' for label in y_test_pred_labels]

# Submit Model 3 to Competition Leaderboard
mycompetition.submit_model(model_filepath="glove_model.onnx",
                           preprocessor_filepath="preprocessor.zip",
                           prediction_submission=prediction_labels)

Insert search tags to help users find your model (optional): glove
Provide any useful notes about your model (optional): third

Your model has been submitted as model version 76

To submit code used to create this model or to view current leaderboard navigate to Model Playground: 

 https://www.modelshare.org/detail/model:2763


This model got a shared first place at the time when I finished it (**Spoiler**: will be surpassed by my next model).

In [None]:
 # Get leaderboard to explore current best model architectures

# Get raw data in pandas data frame
data = mycompetition.get_leaderboard()

# Stylize leaderboard data
mycompetition.stylize_leaderboard(data)

Unnamed: 0,accuracy,f1_score,precision,recall,ml_framework,transfer_learning,deep_learning,model_type,depth,num_params,embedding_layers,conv1d_layers,maxpooling1d_layers,dropout_layers,flatten_layers,lstm_layers,inputlayer_layers,bidirectional_layers,globalmaxpooling1d_layers,globalaveragepooling1d_layers,dense_layers,sigmoid_act,softmax_act,tanh_act,relu_act,loss,optimizer,memory_size,team,username,version
0,81.78%,81.78%,81.79%,81.78%,keras,,True,Sequential,6.0,963856.0,1.0,1.0,1.0,,1.0,,,,,,2.0,,1.0,,2.0,str,Adam,3856424.0,,francesyang,66
1,81.78%,81.78%,81.78%,81.78%,keras,,True,Sequential,3.0,2168577.0,1.0,,,,,1.0,,,,,1.0,1.0,,1.0,,function,Adam,8675184.0,,rian,76
2,80.57%,80.35%,82.02%,80.58%,keras,,True,Sequential,4.0,201154.0,1.0,,,,1.0,,,,,,2.0,,2.0,,,str,RMSprop,805360.0,,1jiahe,46
3,80.90%,80.89%,80.96%,80.90%,keras,,True,Sequential,4.0,640130.0,1.0,,,1.0,,,,,,1.0,1.0,,1.0,,,str,RMSprop,3111632.0,,chachagsedaro,26
4,80.57%,80.50%,81.06%,80.58%,keras,,True,Sequential,4.0,327266.0,1.0,1.0,,,,,,,1.0,,1.0,,1.0,,1.0,str,RMSprop,1309824.0,,mymstella,71
5,79.80%,79.63%,80.85%,79.81%,keras,,True,Sequential,5.0,193702.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,776112.0,,amsay99,43
6,80.13%,80.13%,80.16%,80.13%,keras,,True,Sequential,4.0,640130.0,1.0,,,1.0,,,,,,1.0,1.0,,1.0,,,str,RMSprop,3111632.0,,chachagsedaro,25
7,79.58%,79.46%,80.34%,79.59%,keras,,True,Sequential,4.0,206850.0,1.0,,,,1.0,1.0,,,,,1.0,,1.0,1.0,,str,RMSprop,828272.0,,jer2240,51
8,79.25%,79.06%,80.41%,79.26%,keras,,True,Sequential,5.0,174658.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,699936.0,7.0,lprockop,55
9,79.25%,79.21%,79.52%,79.26%,keras,,True,Sequential,5.0,287402.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,1150912.0,,amsay99,60


## Model 4: (Extra) Fine-tuning Bert

Although this aspect may not be critical to the project, I was interested in determining the upper bound of deep learning models for sentiment analysis. As such, I utilized a fine-tuned BERT model for the task, recognizing that it may be an overkill for a dataset of this size.

We obtained the model from Hugging Face's `transformers` library. Both the tokenizer and the model are loaded from the same checkpoint, `distilbert-base-uncased-finetuned-sst-2-english`.

The model is relatively large, and we realized that fine-tuning it by adding a small dense layer would take a long time to train and could even harm its performance. Therefore, we decided to remove the additional layer and used the pretrained model for prediction directly.

The results were a significant improvement: we not only took first place but also reached an accuracy of 92.32%, leading the leaderboard by a margin of more than 10 percentage points.

In [None]:
# Hugging Face transformers provides the DistilBERT tokenizer/model used below.
# NOTE(review): the version is not pinned — TODO pin it to the release this
# notebook was actually run with.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Upgrade TensorFlow for the transformers model.
# NOTE(review): replacing TensorFlow under a running kernel presumably needs a
# runtime restart before the new version is used — confirm. The next cell
# re-imports its libraries and reloads the data from disk.
!pip install --upgrade "tensorflow>=2.11"


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow>=2.11
  Downloading tensorflow-2.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (585.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m585.9/585.9 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-estimator<2.13,>=2.12.0
  Downloading tensorflow_estimator-2.12.0-py2.py3-none-any.whl (440 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m440.7/440.7 kB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Collecting flatbuffers>=2.0
  Downloading flatbuffers-23.3.3-py2.py3-none-any.whl (26 kB)
Collecting tensorboard<2.13,>=2.12
  Downloading tensorboard-2.12.1-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3
  Downloading prot

In [None]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Silence every warning category for the rest of the session.
warnings.simplefilter(action='ignore', category=Warning)

# Each file holds a single column. `read_csv(..., squeeze=True)` was deprecated
# in pandas 1.4 and removed in 2.0, so squeeze the one-column frame explicitly
# to obtain a Series.
X_train = pd.read_csv("sst2_competition_data/X_train.csv").squeeze("columns")
X_test = pd.read_csv("sst2_competition_data/X_test.csv").squeeze("columns")
y_train_labels = pd.read_csv("sst2_competition_data/y_train_labels.csv").squeeze("columns")

y_train = (y_train_labels == "Positive").astype(int)  # Convert labels to binary format

# Split the raw text (not padded sequences): the transformer pipeline does its
# own tokenization. Same 80/20 split and seed as for the earlier models.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)



In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, TextClassificationPipeline

# Load the tokenizer and the TensorFlow classification model from the same
# checkpoint. Note that this rebinds the notebook-level names `tokenizer` and
# `model`, which earlier referred to the Keras Tokenizer and the LSTM model.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model = TFAutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')


Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_57']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, TextClassificationPipeline

# Load the tokenizer and the TensorFlow classification model from the same
# checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model = TFAutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

# The prediction cells below call `pipeline(...)`, but it was never created, so
# a fresh kernel failed with NameError. Build it here from the objects above.
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer)


Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_77']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Classify the held-out sentences; the 'label' entry of each result is read
# below and mapped onto the competition's label spelling.
val_predictions = pipeline(X_val_split.tolist())
val_prediction_labels = [
    'Negative' if pred['label'] == 'NEGATIVE' else 'Positive'
    for pred in val_predictions
]

# Ground truth in the same string form: 0 -> 'Negative', otherwise 'Positive'.
y_val_labels = y_val_split.eq(0).map({True: 'Negative', False: 'Positive'})

val_accuracy = accuracy_score(y_val_labels, val_prediction_labels)
print(f"Validation accuracy: {val_accuracy}")


Validation accuracy: 0.9898843930635838


In [None]:
# Classify the test sentences and map the pipeline's 'NEGATIVE' label (and
# anything else) onto the competition's 'Negative'/'Positive' spelling.
test_predictions = pipeline(X_test.tolist())
test_prediction_labels = ['Negative' if pred['label'] == 'NEGATIVE' else 'Positive' for pred in test_predictions]


In [None]:
import os

# Create the output folder for ONNX files; a no-op if it already exists.
os.makedirs("onnx_models", exist_ok=True)

In [None]:
# `Path` was never imported anywhere in the notebook, so this cell raised
# NameError; import it here.
from pathlib import Path

# Intended location of the exported DistilBERT ONNX file.
onnx_model_path = Path("onnx_models/distilbert_sentiment.onnx")

In [None]:
# tf2onnx converts the TensorFlow SavedModel to ONNX (used two cells below).
# NOTE(review): the version is not pinned, and the install log shows it
# replacing flatbuffers/protobuf — TODO pin and check compatibility with the
# TensorFlow version installed above.
!pip install tf2onnx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flatbuffers<3.0,>=1.12
  Downloading flatbuffers-2.0.7-py2.py3-none-any.whl (26 kB)
Collecting protobuf<=3.20.1,>=3.12.2
  Downloading protobuf-3.20.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: flatbuffers, protobuf
  Attempting uninstall: flatbuffers
    Found existing installation: flatbuffers 23.3.3
    Uninstalling flatbuffers-23.3.3:
      Successfully uninstalled flatbuffers-23.3.3
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.22.3
    Uninstalling protobuf-4.22.3:
      Successfully uninstalled protobuf-4.22.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependenc

In [None]:
# Write the model to disk with saved_model=True; the tf2onnx command in the
# next cell reads from distilbert_tf_saved_model/saved_model/1.
model.save_pretrained("distilbert_tf_saved_model", saved_model=True)




In [None]:
# Convert the exported SavedModel to ONNX (opset 12).
# NOTE(review): the recorded run was interrupted (^C in the output), and the
# output file here is ./distilbert_sentiment.onnx rather than the
# onnx_models/ path stored in `onnx_model_path` — confirm which is intended.
!python -m tf2onnx.convert --saved-model distilbert_tf_saved_model/saved_model/1 --output distilbert_sentiment.onnx --opset 12

2023-04-13 08:24:53.301565: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-04-13 08:24:57.833687: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
2023-04-13 08:24:58,130 - INFO - Signatures found in model: [serving_default,int64_serving].
2023-04-13 08:24:58,131 - INFO - Output names: ['logits']
2023-04-13 08:25:09.821271: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
2023-04-13 08:25:09.821932: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
^C


In [None]:
# Write the tokenizer files to ./distilbert_tokenizer so they can be zipped and
# uploaded in place of a preprocessor.
tokenizer.save_pretrained("distilbert_tokenizer")

('distilbert_tokenizer/tokenizer_config.json',
 'distilbert_tokenizer/special_tokens_map.json',
 'distilbert_tokenizer/vocab.txt',
 'distilbert_tokenizer/added_tokens.json',
 'distilbert_tokenizer/tokenizer.json')

In [None]:
import shutil

# Produce distilbert_tokenizer.zip from the contents of ./distilbert_tokenizer;
# the submission cell passes this archive as `preprocessor_filepath`.
shutil.make_archive("distilbert_tokenizer", 'zip', 'distilbert_tokenizer')


'/content/distilbert_tokenizer.zip'

In [None]:
# Submit Model 4 (predictions only: no ONNX file is attached, so
# model_filepath is None).
# The predictions come from `test_prediction_labels`, computed above. The
# previous argument `lines` is not defined anywhere in the notebook and raised
# NameError on a fresh kernel.
mycompetition.submit_model(model_filepath=None,
                           preprocessor_filepath="distilbert_tokenizer.zip",
                           prediction_submission=test_prediction_labels)

  leaderboard = leaderboard.append(metadata, ignore_index=True, sort=False)
  leaderboard['username']=leaderboard.pop("username")
  leaderboard['timestamp'] = leaderboard.pop("timestamp")
  leaderboard['version'] = leaderboard.pop("version")
  leaderboard = leaderboard.append(metadata, ignore_index=True, sort=False)
  leaderboard['username']=leaderboard.pop("username")
  leaderboard['timestamp'] = leaderboard.pop("timestamp")
  leaderboard['version'] = leaderboard.pop("version")


Insert search tags to help users find your model (optional): distll-bert
Provide any useful notes about your model (optional): pretrained

Your model has been submitted as model version 78

To submit code used to create this model or to view current leaderboard navigate to Model Playground: 

 https://www.modelshare.org/detail/model:2763


In [None]:
 # Get leaderboard to explore current best model architectures

# Get raw data in pandas data frame
data = mycompetition.get_leaderboard()

# Stylize leaderboard data
mycompetition.stylize_leaderboard(data)

Unnamed: 0,accuracy,f1_score,precision,recall,ml_framework,transfer_learning,deep_learning,model_type,depth,num_params,embedding_layers,conv1d_layers,maxpooling1d_layers,dropout_layers,flatten_layers,lstm_layers,inputlayer_layers,bidirectional_layers,globalmaxpooling1d_layers,globalaveragepooling1d_layers,dense_layers,sigmoid_act,softmax_act,tanh_act,relu_act,loss,optimizer,memory_size,team,username,version
0,92.32%,92.31%,92.40%,92.32%,unknown,,,unknown,,,,,,,,,,,,,,,,,,,,,,rian,78
1,81.78%,81.78%,81.79%,81.78%,keras,,True,Sequential,6.0,963856.0,1.0,1.0,1.0,,1.0,,,,,,2.0,,1.0,,2.0,str,Adam,3856424.0,,francesyang,66
2,81.78%,81.78%,81.78%,81.78%,keras,,True,Sequential,3.0,2168577.0,1.0,,,,,1.0,,,,,1.0,1.0,,1.0,,function,Adam,8675184.0,,rian,76
3,80.57%,80.35%,82.02%,80.58%,keras,,True,Sequential,4.0,201154.0,1.0,,,,1.0,,,,,,2.0,,2.0,,,str,RMSprop,805360.0,,1jiahe,46
4,80.90%,80.89%,80.96%,80.90%,keras,,True,Sequential,4.0,640130.0,1.0,,,1.0,,,,,,1.0,1.0,,1.0,,,str,RMSprop,3111632.0,,chachagsedaro,26
5,80.57%,80.50%,81.06%,80.58%,keras,,True,Sequential,4.0,327266.0,1.0,1.0,,,,,,,1.0,,1.0,,1.0,,1.0,str,RMSprop,1309824.0,,mymstella,71
6,79.80%,79.63%,80.85%,79.81%,keras,,True,Sequential,5.0,193702.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,776112.0,,amsay99,43
7,80.13%,80.13%,80.16%,80.13%,keras,,True,Sequential,4.0,640130.0,1.0,,,1.0,,,,,,1.0,1.0,,1.0,,,str,RMSprop,3111632.0,,chachagsedaro,25
8,79.58%,79.46%,80.34%,79.59%,keras,,True,Sequential,4.0,206850.0,1.0,,,,1.0,1.0,,,,,1.0,,1.0,1.0,,str,RMSprop,828272.0,,jer2240,51
9,79.25%,79.06%,80.41%,79.26%,keras,,True,Sequential,5.0,174658.0,1.0,,,,1.0,2.0,,,,,1.0,,1.0,2.0,,str,RMSprop,699936.0,7.0,lprockop,55
