#### Description
Stratified data sampling and multi-layer perceptron classifier implementation.

All input files (train.csv, the embedding files, etc.) are read through relative paths, so just drop the competition files into the program's folder and they are picked up automatically.
Support for AMD Radeon GPUs is turned off by default; turn it on if necessary (for faster computation on Radeon machines).

In [1]:
# Support for AMD Radeon GPUs - uncomment the lines below if you run this on a machine with an AMD Radeon GPU

# import plaidml.keras
# plaidml.keras.install_backend()
# import os
# os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"


# keras layers, tokenizer, model, sequential etc.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, SimpleRNN, RNN, LSTM, GRU, Embedding, Dropout, Activation, Flatten, Conv1D, Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras import initializers, regularizers, constraints, optimizers, layers
import tensorflow as tf
# time: used to let the system idle after deleting models and embeddings, so several tests can run in one notebook
import time
# linear algebra
import numpy as np 
# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd 
# monitor loading time where it is supported
from tqdm import tqdm
import math

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import gc

# NOTE(review): presumably the seed meant for reproducible runs; it is not
# passed to any call in the cells shown here - confirm where it is used.
random_seed = 63445
# NOTE(review): presumably a layer width for a Keras model; not referenced in
# the cells shown here - confirm before changing or removing.
lsize = 128

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# The competition CSV files are looked up relative to the current working
# directory (train.csv first, then test.csv).
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Report the (rows, columns) shape of each frame.
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [None]:
# Hold out 10% of the labelled rows for validation with one stratified split
# on 'target'. Full cross-validation is skipped at this point because it is
# time consuming (a StratifiedKFold loop was considered as an alternative).
# No random_state is passed, so the split differs from run to run.
train_df, val_df = train_test_split(
    train_df,
    stratify=train_df['target'],
    test_size=0.1,
)

# Configuration values.
embed_size = 300      # the size of each word vector
max_features = 50000  # vocabulary cap handed to the tokenizer (num_words)
maxlen = 100          # number of tokens every question is padded/truncated to

# Raw question strings per split; missing entries become the "_na_" placeholder.
train_X, val_X, test_X = (
    frame["question_text"].fillna("_na_").values
    for frame in (train_df, val_df, test_df)
)

# The tokenizer vocabulary is built from the training questions only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

# Convert each split to integer sequences and pad them to a common length.
train_X, val_X, test_X = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (train_X, val_X, test_X)
)

# Target values matching the rows of train_X and val_X.
train_y, val_y = train_df['target'].values, val_df['target'].values

In [None]:
def my_metrics(true, preds):
    """
    Print accuracy, precision, recall and F1 score for a set of predictions.

    Parameters:
        true: ground-truth labels.
        preds: predicted labels, in the same order as ``true``.

    Returns nothing; the four scores are printed on a single line, each
    value next to its own label.
    """
    accuracy = accuracy_score(true, preds)
    precision = precision_score(true, preds)
    recall = recall_score(true, preds)
    f1score = f1_score(true, preds)
    # Keep the arguments in the same order as the labels in the template so
    # that precision and recall are not reported under each other's name.
    print('accuracy: {}, precision: {}, recall: {}, f1-score: {}'.format(accuracy, precision, recall, f1score))

In [None]:
from sklearn.neural_network import MLPClassifier

# Small multi-layer perceptron: two hidden layers (5 and 2 units), the L-BFGS
# solver, regularisation strength alpha=1e-5 and a fixed random_state.
clf = MLPClassifier(
    hidden_layer_sizes=(5, 2),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)

# Train on the padded token-id sequences and their targets.
clf.fit(train_X, train_y)

In [None]:
# Predictions for the training rows obtained through 5-fold cross-validation.
# TODO: confusion matrix?? - the metrics are computed in the next cell.
preds = cross_val_predict(estimator=clf, X=train_X, y=train_y, cv=5)

In [None]:
# Score the cross-validated predictions against the training targets.
my_metrics(true=train_y, preds=preds)