# About this Notebook
This notebook creates a Keras Sequential ANN that accompanies the other notebook (49 Years of Music - Collection and Analysis). Run this first. You'll need to collect the data from the link in the other notebook. 
This is a standard Bag-of-Words (BoW) modelling approach.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import HTML, display
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
from datetime import datetime
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
import tabulate
import spacy
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.models import model_from_json
import json
import pickle

# Functions

In [None]:
def add_spacy_data(dataset, feature_column, nlp=None):
    '''
    Enrich a dataset with Part of Speech (PoS) columns built from a text column.

    Every text in ``dataset[feature_column]`` is run through a spaCy
    pipeline and four space-joined lemma strings are derived from it:
    verbs, nouns, adverbs, and a cleaned corpus (all non-stop-word lemmas
    with every run of non-alphanumeric characters collapsed to one space).

    Args:
        dataset: pandas DataFrame holding the texts. It is modified in
            place: the columns 'Verbs', 'Nouns', 'Adverbs' and 'Corpus'
            are added (or overwritten).
        feature_column: name of the column that contains the raw text.
        nlp: optional callable that turns a string into an iterable of
            tokens exposing ``text``, ``lemma_``, ``pos_`` and ``is_stop``
            (e.g. an already loaded spaCy pipeline). When omitted, the
            'en_core_web_sm' model is loaded.

    Returns:
        The same ``dataset`` object, with the four columns added.
    '''
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')
    # Compiled once because the same pattern is applied to every record.
    non_alphanumeric = re.compile(r'[^A-Za-z0-9]+')

    verbs = []
    nouns = []
    adverbs = []
    corpus = []
    total = len(dataset)
    for position, song in enumerate(dataset[feature_column], start=1):
        print("Extracting verbs and topics from record {} of {}".format(position, total), end = "\r")
        # Plain lists instead of a per-token DataFrame: a text without any
        # tokens then simply yields empty strings instead of a KeyError.
        lemmas_by_pos = {"VERB": [], "NOUN": [], "ADV": []}
        content_lemmas = []
        for token in nlp(song):
            # spaCy v2 lemmatizes pronouns to the placeholder "-PRON-";
            # keep the word as written in that case.
            lemma = token.text if token.lemma_ == "-PRON-" else token.lemma_
            if token.pos_ in lemmas_by_pos:
                lemmas_by_pos[token.pos_].append(lemma)
            if not token.is_stop:
                content_lemmas.append(lemma)
        verbs.append(" ".join(lemmas_by_pos["VERB"]))
        nouns.append(" ".join(lemmas_by_pos["NOUN"]))
        adverbs.append(" ".join(lemmas_by_pos["ADV"]))
        corpus.append(non_alphanumeric.sub(' ', " ".join(content_lemmas)))
    dataset['Verbs'] = verbs
    dataset['Nouns'] = nouns
    dataset['Adverbs'] = adverbs
    dataset['Corpus'] = corpus
    return dataset

In [None]:
def prep_corpus(raw_string, nlp=None):
    '''
    Clean a single raw string the same way the training corpus is cleaned.

    The string is tokenized, stop words are dropped, the remaining tokens
    are replaced by their lemmas, and every run of non-alphanumeric
    characters is collapsed to one space, so the result can be fed
    straight into a fitted vectorizer for prediction.

    Args:
        raw_string: the text to clean.
        nlp: optional callable that turns a string into an iterable of
            tokens exposing ``text``, ``lemma_`` and ``is_stop`` (e.g. an
            already loaded spaCy pipeline). Pass one when calling this
            repeatedly to avoid reloading the model on every call. When
            omitted, the 'en_core_web_sm' model is loaded.

    Returns:
        The cleaned, space-separated lemma string; '' when no token
        survives (including for an empty input).
    '''
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    content_lemmas = [
        # spaCy v2 lemmatizes pronouns to the placeholder "-PRON-";
        # keep the word as written in that case.
        token.text if token.lemma_ == "-PRON-" else token.lemma_
        for token in nlp(raw_string)
        if not token.is_stop
    ]
    return re.sub(r'[^A-Za-z0-9]+', ' ', " ".join(content_lemmas))

# Loading and prepping the training data

In [None]:
# Load the raw dataset; lines=True reads the file as one JSON record per line.
training_data = pd.read_json(
    "Dataset for Detection of Cyber-Trolls.json",
    lines=True,
)

In [None]:
# Flatten each record's annotation['label'] entry into a single string.
# Iterating the column directly avoids building a row Series per record,
# which is what .iloc[i] inside a range(len(...)) loop does.
labels = [
    "".join(annotation['label'])
    for annotation in training_data['annotation']
]

In [None]:
# Attach the flattened labels to the frame as the 'label' column.
training_data['label'] = labels

In [None]:
# Preview the first rows to check the load and the new 'label' column.
training_data.head()

In [None]:
# Add the 'Verbs', 'Nouns', 'Adverbs' and 'Corpus' columns derived from 'content'.
# add_spacy_data assigns the columns on the frame it is given and returns that
# same frame, so training_data is enriched as well.
prepped_training_data = add_spacy_data(training_data, 'content')

In [None]:
# Write the enriched frame to disk as JSON.
prepped_training_data.to_json('prepped_aggression_data.json')

In [None]:
# Read the enriched frame back from the JSON file written above
# (presumably so the spaCy step can be skipped on later runs).
prepped_training_data = pd.read_json('prepped_aggression_data.json')

In [None]:
# Preview the enriched frame.
prepped_training_data.head()

## Vectorizing and Splitting the Data

In [None]:
# Bag of Words: keep at most 250 vocabulary terms and count their
# occurrences in each cleaned document.
cv = CountVectorizer(max_features=250)
X = cv.fit_transform(
    prepped_training_data['Corpus']
).toarray()  # dense array, one row per document

# Target vector taken from the 'label' column.
y = prepped_training_data['label'].values
print("Done!")

In [None]:
# Hold out 15% of the rows as a test set; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 0)

In [None]:
# Pad/truncate every row to length 250, the same number used for
# CountVectorizer(max_features=250) and for the model's input_shape.
# NOTE(review): X_train / X_test are bag-of-words count rows rather than
# token-index sequences, so this presumably leaves the values unchanged
# as long as the vectorizer produced exactly 250 columns — confirm.
# NOTE(review): the vector built for the single-text prediction further down
# is passed to the model without going through this step.
train_data = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=250)

## Building the Keras Model

In [None]:
# Fully connected binary classifier over the 250-dimensional bag-of-words
# vectors: a wide-to-narrow stack of Dense layers, with Dropout after the
# wider layers for regularisation.
model = keras.Sequential([
    # First layer has no activation argument, i.e. it is a linear projection.
    Dense(128, input_shape=(250,)),
    Dropout(0.3),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(250, activation='relu'),
    Dropout(0.2),
    Dense(250, activation='relu'),
    Dropout(0.2),
    Dense(250, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    # The model is trained with binary_crossentropy, which expects a
    # probability in [0, 1]. 'relu' is unbounded above and has zero gradient
    # for negative inputs, so the output unit must be a sigmoid.
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Binary classification set-up: Adam optimizer, cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Training

In [None]:
# Train for 200 epochs in batches of 512, holding out 10% of the training
# rows for validation; verbose=2 prints one line per epoch.
history = model.fit(
    train_data,
    y_train,
    validation_split=0.1,
    batch_size=512,
    epochs=200,
    verbose=2,
)

In [None]:
# Score the trained model on the held-out test set; with the compile settings
# above this yields the loss followed by the accuracy.
results = model.evaluate(x=test_data, y=y_test, verbose=1)
results

## Testing

In [None]:
# Clean a sample sentence with the same preprocessing used for the training corpus.
test_text = prep_corpus("I hate you")

In [None]:
# Encode the cleaned sentence with the already fitted vectorizer
# (transform, not fit_transform, so the training vocabulary is reused).
test_cv = cv.transform([test_text]).toarray()

In [None]:
# Run the model once and reuse the output for both the raw score and the class.
probabilities = model.predict(test_cv)
display(probabilities)
# Sequential.predict_classes was removed from Keras (TensorFlow 2.6); for a
# single-output model it was equivalent to thresholding the prediction at 0.5.
(probabilities > 0.5).astype("int32")

## Saving the Model

In [None]:
# Persist the model in two parts: the weights as HDF5 and the architecture as JSON.
model.save_weights("82pct_aggression_detectioon_keras_weights.h5")
model_json = model.to_json()
with open("82pct_aggression_detectioon_keras_model.json", "w") as json_file:
    json_file.write(model_json)

## Saving the fitted Vectorizer

In [None]:
# Persist the fitted vectorizer so the same vocabulary can be reused at
# prediction time. The context manager guarantees the file is flushed and
# closed, which the bare open() call did not.
with open("cv.pickle", "wb") as cv_file:
    pickle.dump(cv, cv_file)