In [108]:
import joblib
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline





In [152]:

# Read the pre-cleaned training set and discard every row that has a missing value.
full_train_df = pd.read_csv("data/cleaned_train.csv").dropna()

print("Train Data shape :", full_train_df.shape)
full_train_df.head()

Train Data shape : (7610, 3)


Unnamed: 0,id,text_cleaned,target
0,1,deeds reason may allah forgive us,1
1,4,forest fire near la ronge sask canada,1
2,5,residents asked shelter place notified officer...,1
3,6,people receive evacuation orders california,1
4,7,got sent photo ruby smoke pours school,1


In [153]:
# Show column dtypes and non-null counts for the frame loaded (and dropna-filtered) above.
full_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7610 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            7610 non-null   int64 
 1   text_cleaned  7610 non-null   object
 2   target        7610 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 237.8+ KB


### Splitting data

In [154]:
# Shuffle all rows once with a fixed seed so the positional split below is reproducible.
shuffled_train_df = full_train_df.sample(frac=1, random_state=42, replace=False)

# Positional cut points in the shuffled frame:
#   [0, TRAIN_END)        -> train
#   [TRAIN_END, VAL_END)  -> validation
#   [VAL_END, end)        -> test
TRAIN_END = 6000
VAL_END = 6800

train_data_split_df = shuffled_train_df.iloc[:TRAIN_END]
val_data_split_df = shuffled_train_df.iloc[TRAIN_END:VAL_END]
test_data_split_df = shuffled_train_df.iloc[VAL_END:]

# Keep the "id" column of each split for later reference.
train_split_ids = train_data_split_df["id"]
val_split_ids = val_data_split_df["id"]
# Fix: this used to read from the validation split, so the "test" ids were
# actually the validation ids.
test_split_ids = test_data_split_df["id"]

print("Train  split shape: ", train_data_split_df.shape)
print("Validation split shape: ", val_data_split_df.shape)
print("Test  split shape: ", test_data_split_df.shape)

Train  split shape:  (6000, 3)
Validation split shape:  (800, 3)
Test  split shape:  (810, 3)


### Establish Baseline: TF-IDF + Logistic Regression

In [155]:
# Settings for the TfidfVectorizer built in the next cell.
# TF_MAX_FEATURES is passed as max_features; it is also used as the input width
# of the neural-net baseline further down.
TF_MAX_FEATURES = 2000
# Passed as max_df. A float here is read by scikit-learn as a proportion of documents.
TF_MAX_DF=0.93
# Passed as min_df (also a proportion, since it is a float).
TF_MIN_DF=0.00001


In [156]:

# Unfitted TF-IDF vectorizer configured from the TF_* constants; it is fitted in the next cell.
tfidf_vectorizer = TfidfVectorizer(
                             max_features=TF_MAX_FEATURES,
                             max_df=TF_MAX_DF,
                             min_df=TF_MIN_DF,
                             )


In [157]:
# Fit on the training split only, so validation and test text do not influence the fitted vectorizer.
tfidf_vectorizer.fit(train_data_split_df["text_cleaned"])

In [158]:
# Vectorise each split with the fitted vectorizer and densify the result,
# in train -> validation -> test order.
train_tfidf_matrix, val_tfidf_matrix, test_tfidf_matrix = (
    tfidf_vectorizer.transform(split_df["text_cleaned"]).toarray()
    for split_df in (train_data_split_df, val_data_split_df, test_data_split_df)
)

In [159]:
# Baseline classifier: logistic regression trained on the dense TF-IDF matrix of
# the training split. LogisticRegression, f1_score and accuracy_score are
# imported in the notebook's first (imports) cell rather than mid-notebook.
logreg = LogisticRegression(random_state=42)

logreg.fit(train_tfidf_matrix, train_data_split_df["target"])

In [160]:

def evaluate_model(model, train_data, val_data, test_data):
    """Print accuracy and F1 of ``model`` on the train, validation and test splits.

    Each ``*_data`` argument is indexed as ``[0]`` for the features handed to
    ``model.predict`` and ``[1]`` for the true labels. Predictions go through
    ``np.round`` before scoring. Nothing is returned; the scores are printed.
    """
    splits = (train_data, val_data, test_data)

    # Predict all three splits first (train, validation, test), then score them.
    rounded_preds = [np.round(model.predict(split[0])) for split in splits]
    f1_values = [f1_score(split[1], preds) for split, preds in zip(splits, rounded_preds)]
    acc_values = [accuracy_score(split[1], preds) for split, preds in zip(splits, rounded_preds)]

    acc_labels = (
        "Train Accuracy Score: ",
        "Validation Accuracy Score: ",
        "Test Accuracy  Score: ",
    )
    f1_labels = (
        "Train F1 Score: ",
        "Validation F1 Score: ",
        "Test F1 Score: ",
    )

    print("MODEL: ", str(model))

    print("\n======ACCURACY==========\n")
    for label, value in zip(acc_labels, acc_values):
        print(label, value)

    print("\n======F1-Score==========\n")
    for label, value in zip(f1_labels, f1_values):
        print(label, value)


# Score the logistic-regression baseline on all three splits.
evaluate_model(
    logreg,
    train_data=(train_tfidf_matrix, train_data_split_df["target"]),
    val_data=(val_tfidf_matrix, val_data_split_df["target"]),
    test_data=(test_tfidf_matrix, test_data_split_df["target"]),
)


MODEL:  LogisticRegression(random_state=42)


Train Accuracy Score:  0.8558333333333333
Validation Accuracy Score:  0.79125
Test Accuracy  Score:  0.7864197530864198


Train F1 Score:  0.814735489398158
Validation F1 Score:  0.7344992050874405
Test F1 Score:  0.7390648567119157


### Saving baseline pipeline

In [161]:
# Bundle the already-fitted vectorizer and classifier into one pipeline object
# and write it to disk with joblib.
baseline_pipeline = Pipeline(
    steps=[
        ('TF-IDF', tfidf_vectorizer),
        ('Logistic Regression', logreg),
    ]
)

joblib.dump(baseline_pipeline, "assets/tfidf_logreg_baseline.joblib")



['assets/tfidf_logreg_baseline.joblib']

### Baseline 2: TF-IDF + Simple Neural Net

In [162]:

# Seed Python, NumPy and TensorFlow before any weights are created, so that
# initialisation, dropout and batch shuffling are repeatable (as far as seeding
# controls them) when the notebook is run top to bottom. The split and the
# logistic-regression baseline already use seed 42.
tf.keras.utils.set_random_seed(42)

# Small feed-forward classifier over the TF-IDF features: dropout on the input,
# two ReLU hidden layers (32 and 16 units) with dropout between them, and a
# single sigmoid output unit.
baseline_nn_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(TF_MAX_FEATURES,)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
], name="baseline_shallow_neuralnet")


baseline_nn_model.summary()

Model: "baseline_shallow_neuralnet"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout_26 (Dropout)        (None, 2000)              0         
                                                                 
 dense_39 (Dense)            (None, 32)                64032     
                                                                 
 dropout_27 (Dropout)        (None, 32)                0         
                                                                 
 dense_40 (Dense)            (None, 16)                528       
                                                                 
 dense_41 (Dense)            (None, 1)                 17        
                                                                 
Total params: 64577 (252.25 KB)
Trainable params: 64577 (252.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [163]:



# Labels as (n, 1) column arrays for training and validation.
nn_train_labels = train_data_split_df["target"].values.reshape(-1, 1)
nn_val_labels = val_data_split_df["target"].values.reshape(-1, 1)

baseline_nn_model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003),
    metrics=["accuracy"],
)

# Train on the TF-IDF matrix of the training split, validating on the validation split each epoch.
baseline_nn_history = baseline_nn_model.fit(
    x=train_tfidf_matrix,
    y=nn_train_labels,
    batch_size=256,
    epochs=30,
    validation_data=(val_tfidf_matrix, nn_val_labels),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [164]:

# Score the neural-net baseline on all three splits.
evaluate_model(
    baseline_nn_model,
    train_data=(train_tfidf_matrix, train_data_split_df["target"]),
    val_data=(val_tfidf_matrix, val_data_split_df["target"]),
    test_data=(test_tfidf_matrix, test_data_split_df["target"]),
)


MODEL:  <keras.src.engine.sequential.Sequential object at 0x7f24a81742e0>


Train Accuracy Score:  0.8536666666666667
Validation Accuracy Score:  0.785
Test Accuracy  Score:  0.7851851851851852


Train F1 Score:  0.8098744045041144
Validation F1 Score:  0.7234726688102894
Test F1 Score:  0.7347560975609757


In [165]:
# Write the trained network to the assets directory as a .keras file.
baseline_nn_model.save("assets/baseline_nn_model.keras")

### Training LSTM 

#### TODO:
     1) Train LSTM/GRU models for a while to achieve slightly better performance
     2) Train distilBERT a few times and try different versions of BERT
     3) Save results
     4) Merge it all in one notebook!
