In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
import cupy
import spacy
from sklearn.preprocessing import StandardScaler
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_file = os.path.join(dirname, filename)
        print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the data

In [None]:
# Read the competition CSV files, using each file's `id` column as the row index
main_path = '/kaggle/input/nlp-getting-started/'
train = pd.read_csv(f'{main_path}train.csv', index_col='id')
test = pd.read_csv(f'{main_path}test.csv', index_col='id')

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Preview the first rows of the test data
test.head()

In [None]:
# Number of missing `keyword` values in the train and test sets
train['keyword'].isna().sum(), test['keyword'].isna().sum()

In [None]:
# How often each keyword occurs in the training data
train['keyword'].value_counts()

In [None]:
# As a test, fill the missing keywords with the placeholder 'neutral'.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# the selected column: that chained in-place form emits a FutureWarning on
# recent pandas and, under Copy-on-Write (the default from pandas 3.0), no
# longer updates the parent DataFrame.
train['keyword'] = train['keyword'].fillna('neutral')
test['keyword'] = test['keyword'].fillna('neutral')

In [None]:
# The location column is not used, so drop it.
# Rebinding the result (instead of `inplace=True`) together with
# `errors='ignore'` keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed column.
train = train.drop(columns=['location'], errors='ignore')
test = test.drop(columns=['location'], errors='ignore')

In [None]:
# Preview the test data again
test.head()

In [None]:
# Preview the training data after dropping the unused column
train.head()

## Train-Test Split

In [None]:
# Features are the keyword and text columns; the label is the `target` column
X = train[['keyword', 'text']]
y = train['target'].values

# Hold out 10% of the labelled rows, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=11,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Keras model with word embeddings

In [None]:
# Load the large English spaCy model (`en_core_web_lg`); the large variant is
# needed to get the word vectors used for the embeddings
nlp = spacy.load('en_core_web_lg')

### Function for making the embeddings

In [None]:
def make_embedding(texts):
    """Embed each text as its spaCy document vector.

    Args:
        texts: Iterable of strings to embed.

    Returns:
        ``np.ndarray`` with one row per input text, each row being the
        ``Doc.vector`` of that text converted to a NumPy array.
    """
    # `nlp.disable_pipes()` called without names disables nothing, so every
    # pipeline component would still run on each text. Disable all components
    # by name instead; this assumes the loaded model provides static word
    # vectors (as the `en_core_web_lg` model loaded in this notebook is
    # expected to), so `Doc.vector` only needs the tokenizer.
    with nlp.disable_pipes(*nlp.pipe_names):
        # `nlp.pipe` processes the texts as a stream rather than one call per
        # text; `cupy.asnumpy` converts each vector to a NumPy array.
        embeddings = [cupy.asnumpy(doc.vector) for doc in nlp.pipe(texts)]

    return np.array(embeddings)

In [None]:
def process_train(train):
    """Embed the training texts, then centre and standardise the vectors.

    Args:
        train: Texts to embed; passed straight to ``make_embedding``.

    Returns:
        Tuple ``(scaler, mean, train)``: the ``StandardScaler`` fitted on the
        centred embeddings, the column-wise mean that was subtracted, and the
        centred, scaled embeddings.
    """
    vectors = make_embedding(train)

    # Remove the column-wise mean of the training embeddings
    mean = vectors.mean(axis=0)
    centred = vectors - mean

    # Fit the scaler on the centred vectors, then apply it to them
    scaler = StandardScaler().fit(centred)
    scaled = scaler.transform(centred)

    return scaler, mean, scaled

In [None]:
# Embed and scale the texts of the training split
txt_scaler, txt_mean, train_txt = process_train(X_train['text'].values)
# Embed and scale the keywords of the training split
kw_scaler, kw_mean, train_kw = process_train(X_train['keyword'].values)

# Pair every text vector with its keyword vector: each element is [text, keyword]
train_data = np.array(
    [(txt_vec, kw_vec) for txt_vec, kw_vec in zip(train_txt, train_kw)]
)
train_data.shape

In [None]:
def process_texts(scaler, texts, mean):
    """Embed texts and apply an already-fitted centring and scaling.

    Args:
        scaler: Fitted scaler whose ``transform`` is applied to the centred
            embeddings.
        texts: Texts to embed; passed straight to ``make_embedding``.
        mean: Value subtracted from the embeddings before scaling.

    Returns:
        The embeddings after subtracting ``mean`` and applying
        ``scaler.transform``.
    """
    centred = make_embedding(texts) - mean
    return scaler.transform(centred)

In [None]:
# Preview the held-out split produced by train_test_split
X_test.head()

In [None]:
# Apply the scalers and means fitted on the training split to the held-out split
val_txt = process_texts(txt_scaler, X_test['text'].values, txt_mean)
val_kw = process_texts(kw_scaler, X_test['keyword'].values, kw_mean)

# Pair every text vector with its keyword vector, as for the training data
val_data = np.array(
    [(txt_vec, kw_vec) for txt_vec, kw_vec in zip(val_txt, val_kw)]
)
val_data.shape

In [None]:
# Seed TensorFlow *before* the layers are created. Because the first layer is
# given an input shape, the model's weights are initialised during construction;
# seeding only afterwards leaves that initialisation unseeded, so results would
# differ between otherwise identical runs.
tf.random.set_seed(14)

# Making the classifier keras model.
# Each sample is a (2, 300) array that is flattened and fed through two hidden
# dense layers with dropout; the two-unit softmax output gives class
# probabilities.
keras_clf = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(2, 300)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.6),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(2, activation="softmax")
])

# The output layer already applies softmax, so the loss is told it receives
# probabilities rather than logits (from_logits=False).
keras_clf.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
# Fix TensorFlow's global random seed before training
tf.random.set_seed(14)

# Train for 29 epochs, letting Keras hold out 10% of the training data for validation
keras_clf.fit(
    x=train_data,
    y=y_train,
    epochs=29,
    validation_split=0.1,
)

## Function for predicting with the Keras model and returning the predicted labels

In [None]:
def keras_model_predict(texts):
    """Predict a class label for each input sample with ``keras_clf``.

    Args:
        texts: Model inputs, forwarded unchanged to ``keras_clf.predict``.

    Returns:
        List holding, for each row of the prediction output, the index of its
        largest value.
    """
    labels = []
    for scores in keras_clf.predict(texts):
        labels.append(scores.argmax())

    return labels

## Evaluating the keras model on train data

In [None]:
# Score the model on the data it was trained on
keras_train_preds = keras_model_predict(train_data)
print(
    classification_report(
        y_true=y_train,
        y_pred=keras_train_preds,
        target_names=['not_disaster', 'disaster'],
    )
)

## Evaluating the keras model on validation data

In [None]:
# Score the model on the held-out validation split
keras_val_preds = keras_model_predict(val_data)
print(
    classification_report(
        y_true=y_test,
        y_pred=keras_val_preds,
        target_names=['not_disaster', 'disaster'],
    )
)

In [None]:
# Build the model inputs for the competition test set, reusing the scalers and
# means fitted on the training split
test_txt = process_texts(txt_scaler, test['text'].values, txt_mean)
test_kw = process_texts(kw_scaler, test['keyword'].values, kw_mean)

# Pair every text vector with its keyword vector, as for the training data
test_data = np.array(
    [(txt_vec, kw_vec) for txt_vec, kw_vec in zip(test_txt, test_kw)]
)
test_data.shape

In [None]:
# Predict labels for the test data and show the first 20 predictions
keras_test_preds = keras_model_predict(test_data)
keras_test_preds[:20]

In [None]:
# Build the submission table: the test index as `id` next to the predicted `target`
keras_submission = pd.DataFrame(
    data={'id': test.index, 'target': keras_test_preds},
)
keras_submission

In [None]:
# Write the submission to `submission.csv`, leaving out the DataFrame index
keras_submission.to_csv('submission.csv', index = False)