In [None]:
# update sklearn


from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding, BatchNormalization, Activation, Flatten
from keras.optimizers import SGD
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.regularizers import l1, l2

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.multiclass import unique_labels


from nltk import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

import seaborn as sn
import nltk
import numpy as np
import pandas as pd
import os
import collections
import re
import string
import joblib
try: 
    import nlp
except Exception:
    from . import nlp
import codecs
import matplotlib.pyplot as plt

from numpy import argmax

import pickle

In [None]:
#data = pd.read_csv('MBTI_Clean_test.csv')
#data.head()

#nltk.download('averaged_perceptron_tagger')

In [None]:
def preprocess_data(data, tokenizer=None):
    """Add token, length and lemma columns to ``data`` and ensure a tokenizer exists.

    The frame is modified in place: the columns ``tokens``,
    ``sentence_lengths`` and ``lemmatized_posts`` are (re)written from the
    ``posts`` column using the project's ``nlp`` helpers.

    Args:
        data: DataFrame exposing a ``posts`` column.
        tokenizer: An already fitted tokenizer to reuse. When ``None`` a new
            one is built with ``nlp.create_tokenizer(data)``.

    Returns:
        Tuple ``(data, tokenizer)`` - the same frame object that was passed in
        and the tokenizer that should be used with it.
    """
    print("Now preproccessing")
    print("Renaming columns")
    print(data.head())
    print("Data Dropped")

    tokens = nlp.split_words(data.posts)
    print("Data are tokens now")

    data['tokens'] = tokens

    print(data.head())
    sentence_lengths = [len(post_tokens) for post_tokens in data["tokens"]]
    # default=0 keeps an empty frame from raising ValueError on max([]).
    print("Max sentence length is %s" % max(sentence_lengths, default=0))
    data['sentence_lengths'] = sentence_lengths
    print("about to lemmatize")
    data['lemmatized_posts'] = nlp.lemmatize(tokens)
    print(data.head())
    # Identity check: `== None` would be answered by the tokenizer's own
    # __eq__, which may not mean "no tokenizer was supplied".
    if tokenizer is None:
        tokenizer = nlp.create_tokenizer(data)
    return data, tokenizer

In [None]:
def split_data(data):
    """Split ``data`` into stratified train and test parts (90% / 10%).

    The labels are taken from the ``binarized_target`` column and are also
    used for stratification. The split uses a fixed ``random_state`` of 500.

    Args:
        data: DataFrame with a ``binarized_target`` column.

    Returns:
        Tuple ``(data_train, data_test, y_train, y_test)`` as produced by
        ``train_test_split``.

    Raises:
        Whatever ``train_test_split`` raises (for example ``ValueError`` when a
        class has too few members to stratify). The error is propagated
        unchanged instead of being printed and then hidden behind an
        ``UnboundLocalError`` at the ``return`` statement.
    """
    print("In spilt Data")
    y = data["binarized_target"].values
    print(y)
    data_train, data_test, y_train, y_test = train_test_split(
        data, y, test_size=0.10, random_state=500, stratify=y
    )
    print("about to return")
    return data_train, data_test, y_train, y_test

In [None]:
def prepare_cnn_data(data, data_train, data_test, tokenizer):
    """Turn the train and test frames into model inputs via ``nlp.tokenize``.

    Both frames are passed to ``nlp.tokenize`` with the same tokenizer and a
    fixed maximum sequence length of 1200. ``data`` is accepted for interface
    compatibility but is not used.

    Returns:
        Tuple ``(train_cnn_data, test_cnn_data, word_index)`` where
        ``word_index`` is ``tokenizer.word_index``.
    """
    sequence_length = 1200
    encoded_train, encoded_test = [
        nlp.tokenize(subset, tokenizer, sequence_length)
        for subset in (data_train, data_test)
    ]
    return encoded_train, encoded_test, tokenizer.word_index

In [None]:
def CNNModel(embedding_matrix, max_sequence_length, num_words, embedding_dim):
    """Build and compile the multi-kernel 1D-CNN binary text classifier.

    Architecture: a trainable ``Embedding`` initialised from
    ``embedding_matrix`` feeds four parallel ``Conv1D`` branches (256 filters,
    kernel sizes 1, 2, 3 and 5), each followed by global max pooling. The
    pooled outputs are concatenated and passed through ``Dense(128, relu)``,
    ``Dropout(0.1)`` and a single sigmoid output unit.

    Args:
        embedding_matrix: Initial embedding weights; passed as the only entry
            of the ``weights`` list of the ``Embedding`` layer.
        max_sequence_length: Length of the input token-id sequences.
        num_words: Vocabulary size given to the ``Embedding`` layer.
        embedding_dim: Width of each embedding vector.

    Returns:
        The model, compiled with binary cross-entropy loss, the ``adam``
        optimizer and the ``acc`` metric. The summary is printed as a side
        effect.
    """
    inputs = Input(shape=(max_sequence_length, ))
    embedding_layer = Embedding(num_words, embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=True)(inputs)

    filter_sizes = [1, 2, 3, 5]
    convs = []
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=256, kernel_size=filter_size, activation='relu')(embedding_layer)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)

    l_merge = concatenate(convs, axis=1)
    y = Dense(128, activation='relu')(l_merge)
    y = Dropout(0.1)(y)
    # The output layer must consume `y`; wiring it to `l_merge` left the
    # Dense(128) + Dropout head disconnected from the model.
    preds = Dense(1, activation='sigmoid')(y)
    model = Model(inputs, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [None]:
def prepareTrain(data, embeddings_path='./prediction/system/data/wiki-news-300d-1M.vec'):
    """Preprocess ``data`` and load pre-trained word vectors from a text file.

    Each line of the embeddings file is expected to hold a word followed by
    its space-separated coefficients. A leading header line made of exactly
    two integers (the ``<count> <dim>`` line of fastText ``.vec`` files) and
    lines without any coefficients are skipped, so they never end up in the
    index as bogus vectors.

    Args:
        data: DataFrame handed to ``preprocess_data``.
        embeddings_path: Location of the embeddings file. Defaults to the
            path that was previously hard-coded (relative to the working
            directory).

    Returns:
        Tuple ``(data, embeddings_index, tokenizer)`` where
        ``embeddings_index`` maps each word to a ``float32`` numpy array.

    Raises:
        OSError: If the embeddings file cannot be opened (for example
            ``FileNotFoundError``). Previously the error was printed and the
            function then failed with a ``NameError`` on the unopened handle.
    """
    data, tokenizer = preprocess_data(data)
    embeddings_index = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with codecs.open(embeddings_path, encoding='utf-8') as f:
        print("opened the file")
        for line_number, line in enumerate(tqdm(f)):
            values = line.rstrip().rsplit(' ')
            if line_number == 0 and len(values) == 2 and all(v.isdigit() for v in values):
                # "<vocab size> <dimension>" header, not a word vector.
                continue
            if len(values) < 2:
                # Blank line or a word without coefficients.
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('found %s word vectors' % len(embeddings_index))
    return data, embeddings_index, tokenizer

In [None]:
def trainModel(data, tokenizer, embeddings_index, character):
    """Train the CNN classifier for one target ``character`` and save it.

    Steps: derive and binarize the target via the ``nlp`` helpers,
    undersample, split into train/test, tokenize, build the embedding matrix
    from ``embeddings_index``, train ``CNNModel`` for 4 epochs while
    checkpointing the best ``val_acc``, then dump the final model.

    Files written (relative to the working directory):
        ./prediction/system/data/weights_<character>.best.hdf5
        ./prediction/system/data/model_<character>.sav

    Args:
        data: Preprocessed DataFrame (see ``preprocess_data``).
        tokenizer: Tokenizer whose ``word_index`` defines the vocabulary.
        embeddings_index: Mapping from word to its embedding vector.
        character: Identifier of the target to train for; it is passed to
            ``nlp.split_target_variable`` and used in the output file names.

    Returns:
        None.
    """
    MAX_SEQUENCE_LENGTH = 1200
    EMBEDDING_DIM = 300
    data = nlp.split_target_variable(data, character)
    print("split of targets done")
    data = nlp.binarize_target_variable(data)
    print("binarizetion of targets ")
    undersampled_data = nlp.undersample(data)
    print("undersampled data is done")
    print(undersampled_data.head())
    X_train, X_test, y_train, y_test = split_data(undersampled_data)
    print("dataset Splitting is done")
    cnn_train, cnn_test, word_index = prepare_cnn_data(data, X_train, X_test, tokenizer)

    print('preparing embedding matrix...')
    # The matrix needs one row per index in word_index (plus row 0) and the
    # same size is given to the model below. The former 999999 cap applied
    # only to the matrix, so a larger vocabulary produced a weight-shape
    # mismatch.
    nb_words = len(word_index) + 1
    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    words_not_found = []
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        # Require the exact width: a shorter vector (e.g. a stray header
        # entry) would otherwise be broadcast across the whole row.
        if embedding_vector is not None and len(embedding_vector) == EMBEDDING_DIM:
            embedding_matrix[i] = embedding_vector
        else:
            # Words without a usable vector keep their all-zero row.
            words_not_found.append(word)
    print('number of null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

    filepath = './prediction/system/data/weights' + '_' + str(character) + '.best.hdf5'
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    model = CNNModel(embedding_matrix, MAX_SEQUENCE_LENGTH, nb_words, EMBEDDING_DIM)
    model.fit(cnn_train, y_train, epochs=4, validation_data=(cnn_test, y_test),
              shuffle=True, callbacks=callbacks_list)
    filename = './prediction/system/data/model' + '_' + str(character) + '.sav'
    # NOTE(review): joblib pickles the Keras model object; whether that
    # round-trips depends on the installed Keras version. Kept because the
    # loading side presumably expects this file - confirm before switching
    # to model.save().
    joblib.dump(model, filename)

In [None]:
def trainAllModel(data):
    """Prepare ``data`` once, then train one model per character 0 through 3.

    The frame, tokenizer and embedding index produced by ``prepareTrain`` are
    shared by all four ``trainModel`` runs.
    """
    prepared_data, embeddings_index, tokenizer = prepareTrain(data)
    print("Data has been prepaired")
    for character in range(4):
        trainModel(prepared_data, tokenizer, embeddings_index, character)

In [None]:
#trainAllModel(data)

In [None]:
#data.head()

In [None]:
def evaluateNewModel(data, character, model, tokenizer):
    """Score ``model`` on ``data`` for one target ``character``.

    The frame is preprocessed with the supplied tokenizer, the target is
    derived and binarized through the ``nlp`` helpers, and the model's
    outputs are thresholded at 0.5 before being compared with the
    ``binarized_target`` column.

    Returns:
        The value returned by ``classification_report`` for the true labels
        and the thresholded predictions.
    """
    sequence_length = 1200

    frame, tokenizer = preprocess_data(data, tokenizer)
    frame = nlp.split_target_variable(frame, character)
    print("split of targets done")
    frame = nlp.binarize_target_variable(frame)
    print("binarizetion of targets ")

    expected = frame["binarized_target"].values
    model_input = nlp.tokenize(frame, tokenizer, sequence_length)
    probabilities = model.predict(model_input, verbose=1)

    # Each prediction row carries the score in its first position.
    predicted = []
    for row in probabilities:
        predicted.append(1 * (row[0] >= 0.5))

    return classification_report(expected, predicted)

In [None]:
#evaluateNewModel(data, 0, model, tokenizer)  # needs a trained model and the tokenizer it was trained with

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        classes: Display names, indexed by label value. Any sequence is
            accepted; it is converted to a numpy array before indexing.
        normalize: When true, each row is divided by its sum. Rows that sum
            to zero (labels with no true samples) are left as zeros instead
            of becoming NaN.
        title: Figure title; a default depending on ``normalize`` is used
            when empty.
        cmap: Matplotlib colormap for the image.

    Returns:
        The matplotlib ``Axes`` holding the plot.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data. np.asarray lets callers
    # pass a plain list, which does not support indexing by an array.
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; other rows stay 0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
def prepare_tabular_data(data, data_train, data_test, y_train):
    """Run the tabular feature pipeline from the ``nlp`` helpers.

    The train/test pair goes through three calls in order:
    ``nlp.tabular_features``, ``nlp.tabular_scaler`` and
    ``nlp.chi2_features`` (the last one also receives ``y_train``).

    Returns:
        Tuple ``(train_features, test_features)`` as returned by
        ``nlp.chi2_features``.
    """
    raw_train, raw_test = nlp.tabular_features(data, data_train, data_test)
    scaled_train, scaled_test = nlp.tabular_scaler(raw_train, raw_test)
    selected_train, selected_test = nlp.chi2_features(scaled_train, scaled_test, y_train)
    return selected_train, selected_test

In [None]:
def hybrid_model(embedding_matrix, max_sequence_length, num_words, embedding_dim,
                 num_tabular_features=100):
    """Build and compile a two-input model combining tabular and text features.

    Tabular branch: ``Dense(2, relu)`` over a vector of
    ``num_tabular_features`` values.
    Text branch: a trainable ``Embedding`` initialised from
    ``embedding_matrix``, four ``Conv1D`` branches (256 filters, kernel sizes
    1, 2, 3 and 5) with global max pooling, concatenation, ``Dense(128,
    relu)``, ``Dropout(0.1)`` and ``Dense(2, relu)``.
    The two branch outputs are concatenated, batch-normalised and fed to a
    single sigmoid output unit.

    Args:
        embedding_matrix: Initial embedding weights.
        max_sequence_length: Length of the token-id sequences.
        num_words: Vocabulary size given to the ``Embedding`` layer.
        embedding_dim: Width of each embedding vector.
        num_tabular_features: Width of the tabular input. Defaults to 100,
            the value that was previously hard-coded.

    Returns:
        The model, taking ``[tabular_input, sequence_input]`` and compiled
        with binary cross-entropy loss, ``adam`` and the ``acc`` metric. The
        summary is printed as a side effect.
    """
    inputA = Input(shape=(num_tabular_features,))
    inputB = Input(shape=(max_sequence_length, ))

    # Tabular data branch
    x = Dense(2, activation="relu")(inputA)
    x = Model(inputs=inputA, outputs=x)

    # CNN data branch
    embedding_layer = Embedding(num_words, embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=True)(inputB)

    convs = []
    filter_sizes = [1, 2, 3, 5]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=256, kernel_size=filter_size, activation='relu')(embedding_layer)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)

    l_merge = concatenate(convs, axis=1)
    y = Dense(128, activation='relu')(l_merge)
    y = Dropout(0.1)(y)
    y = Dense(2, activation="relu")(y)
    y = Model(inputs=inputB, outputs=y)

    # Join the two branch outputs and normalise them before the output unit.
    combined = concatenate([x.output, y.output])
    z = BatchNormalization()(combined)

    preds = Dense(1, activation='sigmoid')(z)

    model = Model(inputs=[x.input, y.input], outputs=preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [None]:
def testModel():
    inputA = Input(shape=(100,))    
    preds = Dense(1, activation="sigmoid")(inputA)
    model = Model(inputA, preds)
    sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
    model.summary()
    return model