In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

from lightgbm import LGBMClassifier

import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# Load the train and test CSV files and print their (rows, columns) shapes
# as a quick sanity check that both files were read.
df = pd.read_csv('../input/disaster-tweets-cleaned/df.csv')
test_df = pd.read_csv('../input/disaster-tweets-cleaned/test_df.csv')
print(df.shape, test_df.shape)

In [None]:
def plot_sent_len(dataf, col, title):
    """Add a word-count column for a text column and plot its histogram.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame holding the text column. It is modified in place: a new
        column named ``'len_' + col`` is added (or overwritten).
    col : str
        Name of the text column whose whitespace-separated tokens are counted.
    title : str
        Title shown above the histogram.

    Returns
    -------
    pandas.DataFrame
        The same ``dataf`` object, now carrying the length column.
    """
    len_col = 'len_' + col
    # Non-string cells (e.g. missing values read back from CSV) count as
    # zero words instead of raising AttributeError on `.split()`.
    dataf[len_col] = dataf[col].apply(
        lambda txt: len(txt.split()) if isinstance(txt, str) else 0
    )
    plt.hist(dataf[len_col], bins = 100)
    # Use the caller-supplied title; it was previously ignored in favour of
    # a hard-coded string.
    plt.title(title)
    plt.show()
    return dataf

# `col` names the text column used by the rest of the notebook (split and
# embedding cells read it). The call below also stores the per-row word
# count back on `df`.
col = 'ctext'
df = plot_sent_len(df, col, 'sentence lengths')

In [None]:
# Hold out 15% of the labelled data for validation, stratified on the target
# so both parts keep the same class balance. A fixed `random_state` makes the
# shuffled split (and therefore every downstream metric) repeatable across
# kernel restarts.
train_txts, val_txts, y_train, y_val = train_test_split(
    df[col].values, df['target'].values,
    shuffle = True, test_size = 0.15,
    stratify = df['target'].values,
    random_state = 42,
)
# The test set comes from its own file and is used as-is.
test_txts = test_df[col].values
y_test = test_df['target'].values
print('Train size:', train_txts.shape)
print('Validation size:', val_txts.shape)
print('Test size:', test_txts.shape)

In [None]:
# TF-Hub handle of the sentence encoder; used both for the standalone feature
# extractor and inside the Keras model built by `create_model`.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"

In [None]:
# Load the encoder as a frozen (non-trainable) Keras layer so it can be used
# as a fixed feature extractor.
feature_ext = hub.KerasLayer(module_url, trainable = False)
print ("module %s loaded" % module_url)

In [None]:
# Embed the raw texts of each split with the frozen encoder; the results feed
# the PCA plots and the LightGBM classifier further down.
# NOTE(review): each split is embedded in one call — confirm this fits in
# memory if the dataset grows.
train_X = feature_ext(train_txts)
val_X = feature_ext(val_txts)
test_X = feature_ext(test_txts)

In [None]:
# Sanity check: display the dimensions of the train embeddings.
train_X.shape

In [None]:
def plot_embedding(X, y):
    """Scatter-plot a 2-D PCA projection of `X`, coloured by class.

    The PCA is fitted on `X` itself, so the axes of two separate calls are
    not directly comparable. Drawing happens on the current matplotlib axes;
    the caller is responsible for creating the figure and calling `plt.show()`.

    Parameters
    ----------
    X : array-like, one row per sample
        Feature matrix to project (anything `np.asarray` can convert).
    y : array-like, one entry per sample
        Class ids. Each distinct value is used as an index into the
        two-element label/colour lists, so only 0 and 1 are supported.
    """
    colors = ['green', 'red']
    labels = ['NoDisaster', 'Disaster']
    # Convert up front so boolean masking works for lists and tensors alike.
    X = np.asarray(X)
    y = np.asarray(y)
    proj_X = PCA(2).fit_transform(X)
    for cls in np.unique(y):
        mask = y == cls
        # Float- or bool-typed targets cannot index a list directly.
        idx = int(cls)
        plt.scatter(
            x = proj_X[mask, 0],
            y = proj_X[mask, 1],
            s = 4,
            label = labels[idx],
            c = colors[idx],
            alpha = 0.4
        )
    # Axis decorations only need to be applied once, after all classes
    # have been drawn.
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()

def plot_embeddings(train_X, y_train, val_X, y_val, test_X ,y_test):
    """Draw the train, validation and test embedding plots side by side.

    Opens one 12x4 figure with three panels in a single row, delegates each
    panel to `plot_embedding`, titles it, and finally shows the figure.
    """
    panels = (
        ('Train dataset', train_X, y_train),
        ('Val dataset', val_X, y_val),
        ('Test dataset', test_X, y_test),
    )
    plt.figure(figsize = (12, 4))
    for position, (caption, features, targets) in enumerate(panels, start = 1):
        plt.subplot(1, len(panels), position)
        plot_embedding(features, targets)
        plt.title(caption)
    plt.show()

In [None]:
# Visualise 2-D PCA projections of the precomputed embeddings for all three splits.
plot_embeddings(train_X, y_train, val_X, y_val, test_X ,y_test)

In [None]:
def create_model():
    """Build the end-to-end text classifier.

    Pipeline: scalar string input -> frozen TF-Hub encoder loaded from the
    notebook-level `module_url` -> dropout (rate 0.1) -> single sigmoid unit.

    Returns
    -------
    tf.keras.Model
        Uncompiled model mapping a batch of strings to one probability each.
    """
    layers = tf.keras.layers
    text_input = layers.Input(shape = (), dtype = tf.string)
    # A fresh encoder layer is created here; it is independent of the
    # notebook-level `feature_ext` object.
    encoder = hub.KerasLayer(module_url, trainable = False, name = 'feat_ext')
    embedded = encoder(text_input)
    regularised = layers.Dropout(0.1)(embedded)
    probability = layers.Dense(1, activation = 'sigmoid')(regularised)
    return tf.keras.Model(text_input, probability)

In [None]:
# Fix TensorFlow's global random seed before the model is built so that
# repeated runs start from the same random state (weight initialisation,
# dropout masks). Bit-exact reproducibility on GPU is still not guaranteed.
tf.random.set_seed(42)

# Only the dropout + dense head is trainable (the encoder is frozen), which
# is why a handful of epochs is enough.
optimizer = tf.keras.optimizers.Adam(0.005)
loss_objective = tf.keras.losses.BinaryCrossentropy()
model = create_model()
model.compile(loss = loss_objective, optimizer = optimizer, metrics = ['acc'])
model.fit(
    x = train_txts, y = y_train,
    validation_data = (val_txts, y_val),
    epochs = 5,
    batch_size = 32
)

In [None]:
# Score the test texts, turn the sigmoid outputs (first column of the
# prediction matrix) into hard 0/1 labels at a 0.5 threshold, and report
# per-class metrics.
y_test_hat = model.predict(test_txts)
y_model_hat = (y_test_hat[:, 0] > 0.5).astype(int)
print(classification_report(y_test, y_model_hat))

In [None]:
def train_gbm_cls(X_tr, y_tr, X_val, y_val, X_test, y_test):
    """Fit a LightGBM binary classifier and print a report for every split.

    Parameters
    ----------
    X_tr, y_tr
        Training features and labels.
    X_val, y_val
        Validation features and labels; also used as the early-stopping
        evaluation set.
    X_test, y_test
        Test features and labels, used only for the final report.

    Returns
    -------
    tuple
        ``(gbm_cls, gbm_y_hat)``: the fitted classifier and its predictions
        for ``X_test``.
    """
    gbm_cls = LGBMClassifier(
        objective = 'binary',
    )
    # NOTE(review): passing `early_stopping_rounds` / `verbose` directly to
    # fit() is the older LightGBM API; newer releases expect callbacks
    # instead — confirm against the installed version.
    gbm_cls.fit(
        X_tr, y_tr,
        eval_set = (X_val, y_val),
        early_stopping_rounds = 20,
        verbose = 0,
    )
    print('Train')
    # Compare against the labels that were passed in (`y_tr`), not the
    # notebook-level `y_train` global that happened to share the same data.
    print(classification_report(y_tr, gbm_cls.predict(X_tr)))
    print('Validation')
    print(classification_report(y_val, gbm_cls.predict(X_val)))
    print('Test')
    gbm_y_hat = gbm_cls.predict(X_test)
    print(classification_report(y_test, gbm_y_hat))
    return gbm_cls, gbm_y_hat

# Train LightGBM on the precomputed encoder embeddings rather than on raw text.
gbm_cls, gbm_y_hat = train_gbm_cls(train_X, y_train, val_X, y_val, test_X ,y_test)