# Solutions to 03SimpleNLPModels

## Preamble: Execute this if checking any answer!

In [None]:
# Reload edited modules automatically before each cell is executed, so changes
# to the helper scripts imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.model_selection
import sklearn.preprocessing
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
import keras.preprocessing.text
from keras.preprocessing import sequence
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
import xarray

%matplotlib inline

import sys

sys.path.append("../scripts")
import normalize_text_bootcamp
import utils_bootcamp
import plotting

In [None]:
# Root folder that holds the tweet datasets. Only one location may be active:
# the previous double assignment made the first value a dead store that was
# immediately overwritten. The alternative location is kept for reference.
# FOLDER_DATA = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap2/data/tweets/"
FOLDER_DATA = "/p/project/training2223/a2/data/tweets/"

In [None]:
# Name of the total-precipitation variable inside the dataset.
key_tp = "tp"
# Despite the "FOLDER" prefix this is the full path of the data file itself.
FOLDER_TWEET = FOLDER_DATA + "tweets_2017_normalized_filtered.nc"
# Load the tweets (texts plus precipitation values) as an xarray Dataset.
ds = xarray.load_dataset(FOLDER_TWEET)

In [None]:
# NOTE(review): this normalization step is disabled. The file loaded earlier is
# named "..._normalized_filtered", so the step was presumably already applied
# when that file was produced -- confirm before re-enabling.
# ds_norm = normalize_text_bootcamp.normalize_filter_dataset(
#     ds,
#     keywords=None,
#     reset_index=True,
#     key_text_original="text_original",
#     key_text_normalized="text_normalized",
#     key_text_backup=None,
#     ignore_non_ascii=True,
#     replace_keyword_emojis=True,
#     remove_punctuations="keep_basic_punctuations",
#     reduce_punctuations=True,
#     use_lower_case=True,
#     do_split_punctutation_text=False,
#     remove_sun_confusing_terms=True,
#     only_text_containing_keywords=True,
#     maximum_bounding_box_area=100,
# )

In [None]:
# A tweet counts as "raining" when its precipitation value exceeds a tiny
# positive threshold (1e-8); the boolean label lives on the "index" dimension.
precipitation = ds[key_tp].values
ds["raining"] = (["index"], precipitation > 1e-8)

In [None]:
# Inputs are the normalized tweet texts, targets are the rain labels.
X, Y = ds.text_normalized.values, ds.raining.values

# Task 2:

In [None]:
# Hold out 20 % of the tweets for testing. Stratifying on the rain label keeps
# the class ratio the same in both parts. The fixed random_state makes the
# split -- and every metric computed from it -- reproducible across kernel
# restarts; without it each run evaluates on a different test set.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X, Y, test_size=0.2, stratify=ds.raining.values, random_state=42
)

### Encode our labels 
This step is required if our labels are in text format (e.g., "cat", "dog"). This is not the case here, but it's included for completeness. The expected format of the encoded labels depends on the model, so you should check what your model requires as well.

In [None]:
# Map the labels to integer class ids and shape them as a column vector.
# NOTE(review): the train/test split earlier in the notebook was made from the
# un-encoded labels, so Y_train / Y_test are not changed by this cell.
le = sklearn.preprocessing.LabelEncoder()
Y = le.fit_transform(Y).reshape(-1, 1)

In [None]:
# Inspect the input texts.
X

In [None]:
# Inspect the labels.
Y

In [None]:
# Vocabulary cap passed to the tokenizer; RNN() also uses it as the input
# dimension of the embedding layer.
max_words = 1000
# Fixed sequence length: every tweet is padded or truncated to this many tokens,
# and it is also the input shape of the model.
max_len = 150
tok = keras.preprocessing.text.Tokenizer(num_words=max_words)
# Build the word index from the training texts only.
tok.fit_on_texts(X_train)
# Convert each training text to a list of word indices, then to one 2-D array.
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = pad_sequences(sequences, maxlen=max_len)

In [None]:
def RNN(
    vocab_size=None,
    sequence_length=None,
    embedding_dim=50,
    lstm_units=64,
    dense_units=256,
    dropout_rate=0.5,
):
    """Build the (uncompiled) LSTM model used for binary tweet classification.

    Layer stack: Embedding -> LSTM -> Dense "FC1" -> ReLU -> Dropout ->
    Dense "out_layer" (1 unit) -> sigmoid.

    Calling ``RNN()`` without arguments builds exactly the architecture that
    was previously hard-coded.

    Parameters
    ----------
    vocab_size : int, optional
        Input dimension of the embedding layer. Defaults to the notebook-level
        ``max_words``.
    sequence_length : int, optional
        Length of the (padded) input sequences. Defaults to the notebook-level
        ``max_len``.
    embedding_dim : int
        Size of each embedding vector.
    lstm_units : int
        Number of units in the LSTM layer.
    dense_units : int
        Number of units in the fully connected layer "FC1".
    dropout_rate : float
        Dropout rate applied after the ReLU activation.

    Returns
    -------
    keras.models.Model
        Model with input "inputs" of shape ``[sequence_length]`` and a single
        sigmoid output per sample.
    """
    # Resolve the defaults at call time so the function keeps working with the
    # notebook globals while no longer depending on them implicitly.
    if vocab_size is None:
        vocab_size = max_words
    if sequence_length is None:
        sequence_length = max_len

    inputs = Input(name="inputs", shape=[sequence_length])
    layer = Embedding(vocab_size, embedding_dim, input_length=sequence_length)(inputs)
    layer = LSTM(lstm_units)(layer)
    layer = Dense(dense_units, name="FC1")(layer)
    layer = Activation("relu")(layer)
    layer = Dropout(dropout_rate)(layer)
    # One unit + sigmoid: the output is a single value in [0, 1] per sample.
    layer = Dense(1, name="out_layer")(layer)
    layer = Activation("sigmoid")(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [None]:
# Instantiate the network and print its layer overview.
model = RNN()
model.summary()
# Binary target -> binary cross-entropy loss; accuracy is tracked during training.
model.compile(
    optimizer=RMSprop(),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Stop training once the validation loss no longer improves by at least 1e-4
# (all other EarlyStopping settings are left at their defaults).
early_stopping = EarlyStopping(monitor="val_loss", min_delta=0.0001)

# Train for at most 10 epochs; 20 % of the training data is set aside for validation.
model.fit(
    x=sequences_matrix,
    y=Y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [None]:
# build test dataset: encode the held-out texts with the tokenizer that was
# fitted on the training texts and pad them to the same length (max_len)
sequences_test = tok.texts_to_sequences(X_test)
sequences_matrix_test = pad_sequences(sequences_test, maxlen=max_len)

In [None]:
# Model outputs for the test sequences (one sigmoid value per sample).
y_predict = model.predict(sequences_matrix_test)

In [None]:
# The model has a single sigmoid output, so argmax over the last axis is always
# 0 and every sample would be reported as class 0. Threshold the output at 0.5
# instead and match the shape/dtype of Y_test.
# NOTE(review): argument order (prediction, truth) is kept from the original
# call -- confirm against plotting.analysis.classification_report's signature.
y_predict_labels = (y_predict.ravel() > 0.5).astype(Y_test.dtype)
plotting.analysis.classification_report(y_predict_labels, Y_test)

In [None]:
# Pass the true labels and the raw sigmoid outputs to the project's ROC plotting helper.
plotting.analysis.plot_roc(Y_test, y_predict)

In [None]:
# NOTE(review): repeats the prediction that an earlier cell already stored in
# y_predict (same model, same test sequences).
Y_predict = model.predict(sequences_matrix_test)

In [None]:
# Inspect the raw model outputs.
Y_predict

In [None]:
# The model has a single sigmoid output, so argmax over the last axis is always
# 0 and every sample would be reported as class 0. Threshold the output at 0.5
# instead and match the shape/dtype of Y_test.
# NOTE(review): argument order (prediction, truth) is kept from the original
# call -- confirm against plotting.analysis.classification_report's signature.
Y_predict_labels = (Y_predict.ravel() > 0.5).astype(Y_test.dtype)
plotting.analysis.classification_report(Y_predict_labels, Y_test)

In [None]:
# Pass the true labels and the raw sigmoid outputs to the project's ROC plotting helper.
plotting.analysis.plot_roc(Y_test, Y_predict)