# Neural Network - multiclass classification

## Read the data

In [None]:
import pandas as pd

# Load the training split from a records-oriented JSON file, then inspect it:
# column summary, histograms of the numeric columns, and the first rows.
df_train = pd.read_json("../data/news_train.json", orient="records")
df_train.info()
df_train.hist()
df_train.head()

In [None]:
# Load the validation split from a records-oriented JSON file, then inspect
# it: column summary, histograms of the numeric columns, and the first rows.
df_val = pd.read_json("../data/news_val.json", orient="records")
df_val.info()
df_val.hist()
df_val.head()

In [None]:
# Load the test split from a records-oriented JSON file, then inspect it:
# column summary, histograms of the numeric columns, and the first rows.
df_test = pd.read_json("../data/news_test.json", orient="records")
df_test.info()
df_test.hist()
df_test.head()

## Preprocessing data

In [None]:
import nltk
# Fetch the NLTK resources used below: the stopword lists read by
# `stopwords.words(...)` and the 'punkt' tokenizer data.
# NOTE(review): presumably 'punkt' is what `word_tokenize` needs; newer NLTK
# releases may require 'punkt_tab' instead -- verify against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# English stopwords, stored as a frozenset so the many `word in STOPWORDS`
# checks performed per token are O(1) instead of a linear scan over a list.
STOPWORDS = frozenset(stopwords.words("english"))

def is_html_tag(word):
    """Return True when *word* looks like a fragment of an HTML tag.

    A token counts as a tag fragment when, after removing newlines, it starts
    with ``<``, ``>`` or ``/`` (e.g. ``<br``, ``/>``, ``</p>``), or when it is
    exactly a leftover line-break tag name such as ``br`` or ``br/>``.

    Only exact ``br`` residue is matched: testing just the first two
    characters would also discard ordinary words such as "break" or "bring".
    """
    w = word.replace("\n", "")
    if w.startswith(("<", ">", "/")):
        return True
    return w.strip() in ("br", "br/", "br>", "br/>")

def remove_html_tags(sentence):
    """Collapse whitespace in *sentence* and drop tokens flagged by is_html_tag.

    The text is split on any whitespace, tag-like tokens are filtered out and
    the remaining tokens are re-joined with single spaces.
    """
    kept = []
    for piece in sentence.split():
        if is_html_tag(piece):
            continue
        kept.append(piece)
    return " ".join(kept)

def tokenize_sentence(sentence):
    """Lower-case *sentence* and return the result of NLTK's word_tokenize on it."""
    lowered = sentence.lower()
    return word_tokenize(lowered)

def remove_stopwords(sentence):
    """Return *sentence* with every stopword removed.

    The sentence is split on single spaces and re-joined with single spaces.
    Each word is lower-cased before the lookup because this step runs before
    the text is lower-cased in `preprocess`; a case-sensitive test would let
    capitalised stopwords such as "The" or "And" slip through.
    """
    return " ".join(
        word for word in sentence.split(" ") if word.lower() not in STOPWORDS
    )

def remove_punctuation(tokenized_sentence):
    """Return only the purely alphabetic tokens of *tokenized_sentence*.

    Tokens containing digits, punctuation or other non-letter characters
    (and empty tokens) are dropped; order is preserved.
    """
    alphabetic = []
    for token in tokenized_sentence:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

def preprocess(sentence):
    """Turn a raw text into a list of lower-case alphabetic tokens.

    Steps, in order: cast to ``str``, strip HTML tag fragments, remove
    stopwords, tokenize (lower-casing the text), keep alphabetic tokens only.
    """
    text = str(sentence)
    without_tags = remove_html_tags(text)
    without_stopwords = remove_stopwords(without_tags)
    tokens = tokenize_sentence(without_stopwords)
    return remove_punctuation(tokens)

# Sanity check: print the token list produced for the training text at index 1.
print(preprocess(df_train["text"].values[1]))


In [None]:
# Replace the raw training texts with their token lists (overwrites the
# "text" column of df_train), then show the first rows.
df_train["text"] = df_train["text"].apply(preprocess)
df_train.head()

In [None]:
def build_vocab(tokenized_input, vocab_size):
    """Return the set of the *vocab_size* most frequent tokens.

    Args:
        tokenized_input: iterable of token lists (one list per document).
        vocab_size: maximum number of tokens to keep.

    Returns:
        A set with at most *vocab_size* tokens, chosen by descending
        frequency; ties keep the order in which tokens were first seen.

    Stopwords and non-alphabetic tokens are filtered again here as a safety
    net, and the HTML residue token "br" is always excluded.
    """
    from collections import Counter

    counts = Counter(
        token
        for tokens in tokenized_input
        for token in tokens
        # double check
        if token not in STOPWORDS and token.isalpha()
    )

    # `pop` with a default instead of `del`: "br" may legitimately be absent
    # (e.g. empty input), which must not raise KeyError.
    counts.pop("br", None)

    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return {token for token, _ in ranked[:vocab_size]}

In [None]:
# Upper bound on the number of distinct tokens kept in the vocabulary.
VOCAB_SIZE = 10000
# Set of the most frequent tokens found in the tokenized training texts.
VOCAB = build_vocab(df_train["text"].values, VOCAB_SIZE) # (!) Always build the vocabulary on the TRAIN dataset

In [None]:
# Number of tokens actually kept; at most VOCAB_SIZE, fewer when the training
# data has fewer distinct tokens.
len(VOCAB)

In [None]:
# Distinct label values present in the training data, in order of first
# appearance; shown as the cell output.
LABELS = df_train["label"].unique()
LABELS

In [None]:
# Map every label value to the "label_name" found on the first training row
# carrying that label.
LABELS_TO_NAMES = {label: df_train[df_train["label"] == label].iloc[0]["label_name"] for label in LABELS}
LABELS_TO_NAMES

In [None]:
def get_frequencies_for_labels(df):
    """Count, per label, how often each vocabulary token occurs in *df*.

    The first column of *df* is read as the token list of a row and the
    second column as its label (both accessed by position). Tokens that are
    not in VOCAB are ignored.

    Returns:
        dict mapping each label in LABELS to a ``{token: count}`` dict.
    """
    counts_by_label = dict((label, {}) for label in LABELS)

    token_column = df.iloc[:, 0]
    label_column = df.iloc[:, 1]

    for row_tokens, row_label in zip(token_column, label_column):
        for tok in row_tokens:
            if tok not in VOCAB:
                continue
            bucket = counts_by_label[row_label]
            bucket[tok] = bucket.get(tok, 0) + 1

    return counts_by_label

In [None]:
# Per-label token counts computed from the tokenized training set.
frequency_table = get_frequencies_for_labels(df_train)
# Spot check: count of "money" under label 0 (raises KeyError if that token
# was never counted for this label).
frequency_table[0]["money"]

In [None]:
import numpy as np

def extract_features(frequency_table, tweet_tokens):
    """Build one feature per label for a tokenized text.

    For every label in LABELS the feature is the sum, over all tokens in
    *tweet_tokens*, of that token's count in ``frequency_table[label]``
    (tokens missing from the table contribute 0).

    Returns:
        pandas Series indexed by label with the summed frequencies.
    """
    tokens = list(tweet_tokens)
    totals = {
        label: sum(frequency_table[label].get(tok, 0) for tok in tokens)
        for label in LABELS
    }
    return pd.Series(totals)

In [None]:
# Turn every tokenized training text into its per-label frequency features;
# because extract_features returns a Series, the result has one column per label.
X_train = df_train["text"].apply(lambda tokens: extract_features(frequency_table, tokens))
X_train

In [None]:
# Training targets: the label column as a NumPy array.
y_train = df_train["label"].values
y_train

In [None]:
# Tokenize the validation texts, then compute the per-label frequency
# features from those tokens. The features must be derived from the
# preprocessed token lists: applying extract_features to the raw strings
# would iterate over single characters instead of words.
X_val = df_val["text"].apply(preprocess)
X_val = X_val.apply(lambda tokens: extract_features(frequency_table, tokens))
y_val = df_val["label"].values
X_val

## Create model and train / validate

In [None]:
import tensorflow as tf

# Feed-forward classifier over the per-label frequency features.
# The hidden layers use ReLU: Dense layers without an activation are purely
# linear, so stacking them would be no more expressive than a single linear
# layer followed by softmax.
# NOTE(review): the input width and the number of output units are both
# hard-coded to 4 -- assumes LABELS has exactly 4 entries; confirm.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(4,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(units=4, activation='softmax')
])

# Sparse categorical cross-entropy matches integer class labels (y_* arrays)
# combined with a softmax output.
model.compile(optimizer='adam',
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])

model.summary()

In [None]:
# Train for 10 epochs with mini-batches of 32 samples, evaluating on the
# validation set after every epoch. `h` is the returned history object whose
# `.history` dict feeds the plots below.
h = model.fit(X_train,
              y_train,
              epochs=10,
              batch_size=32,
              validation_data=(X_val, y_val))

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch.
plt.plot(h.history['loss'], label='(training data)',color='blue')
plt.plot(h.history['val_loss'], label='(validation data)',color='green')
plt.title('Neural Network training loss')
plt.ylabel('value')
plt.xlabel('No. epoch')
# Without a legend the `label=` values above are never displayed.
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch.
plt.plot(h.history['accuracy'], label='(training data)',color='blue')
plt.plot(h.history['val_accuracy'], label='(validation data)',color='green')
plt.title('Neural Network training accuracy')
plt.ylabel('value')
plt.xlabel('No. epoch')
# Without a legend the `label=` values above are never displayed.
plt.legend()
plt.show()

## Evaluate the trained model

In [None]:
# Tokenize the test texts, then compute the per-label frequency features from
# those tokens. The features must be derived from the preprocessed token
# lists: applying extract_features to the raw strings would iterate over
# single characters instead of words.
X_test = df_test["text"].apply(preprocess)
X_test = X_test.apply(lambda tokens: extract_features(frequency_table, tokens))
y_test = df_test["label"].values
X_test

In [None]:
# Evaluate on the held-out test set in batches of 50; the result holds the
# loss followed by the metrics configured in `compile` (accuracy).
results = model.evaluate(X_test, y_test, batch_size=50)
print("test loss, test acc:", results)

In [None]:
# Reload the test split so the original, unprocessed texts are available for
# display next to the predictions.
df_test_raw = pd.read_json("../data/news_test.json", orient="records")

In [None]:
# The model outputs one softmax probability per class; the predicted label is
# the index of the largest probability. Rounding the probabilities would give
# one-hot-like vectors (or all zeros when no class exceeds 0.5) that cannot be
# compared with the integer labels in y_test.
probabilities = model.predict(X_test[:100])
predictions = np.argmax(probabilities, axis=1)
for text, predicted, actual in zip(df_test_raw["text"].values, predictions, y_test):
    print(text[:100],"...")
    print("Pred: ", predicted, "Real: ", actual)