#### Description
Imbalanced and balanced data sampling and naive bayes classifier implementation.

All read file variables (like train.csv, embeddings files etc) are set to relative path, which means just drag the competition files in to the program's folder and it reads in them automatically. 
Furthermore, I turned off the support for AMD Radeon GPUs, turn it on if necessary (for quicker computing on Radeon machines).

In [1]:
# support for AMD Radeon GPU - if you run this on AMD Radeon GPU computer, then use it

# import plaidml.keras
# plaidml.keras.install_backend()
# import os
# os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"


# keras layers, tokenizer, model, sequential etc.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, SimpleRNN, RNN, LSTM, GRU, Embedding, Dropout, Activation, Flatten, Conv1D, Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras import initializers, regularizers, constraints, optimizers, layers
import tensorflow as tf
# time for idle the system after deleting models and embedding to test in one notebook
import time
# linear algebra
import numpy as np 
# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd 
# monitor loading time where it is supported
from tqdm import tqdm
import math

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import gc

random_seed = 63445
lsize = 128

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [26]:
# train_df = pd.read_csv("/Users/sneakysneak/Downloads/quora_dataset/train.csv")
# test_df = pd.read_csv("/Users/sneakysneak/Downloads/quora_dataset/test.csv")]
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 3 columns):
qid              1306122 non-null object
question_text    1306122 non-null object
target           1306122 non-null int64
dtypes: int64(1), object(2)
memory usage: 29.9+ MB


In [4]:
# Split the training dataset into train and val sample. 
# Cross validation is a time consuming process and so 
# let us do simple train val split.

## split to train and val
# train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)
# no random state
train_df, val_df = train_test_split(train_df, test_size=0.1)

## configuration values 
embed_size = 300 # the size of each word vector
max_features = 50000 # the size of unique words in use - the number of rows in the embedding vector
maxlen = 100 # the size of the number of words in each question

## first fill all missing values up
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values

## tokenize with Keras
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)

test_X = tokenizer.texts_to_sequences(test_X)

## sentence padding
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## save the target values as train_y and val_y
train_y = train_df['target'].values
val_y = val_df['target'].values

In [5]:
print(train_y), print(val_y), print(train_X), print(val_X)

[0 0 1 ... 0 0 0]
[0 0 0 ... 0 0 0]
[[    0     0     0 ...   181 22659 11952]
 [    0     0     0 ...   429     1   281]
 [    0     0     0 ...     1   597    94]
 ...
 [    0     0     0 ...     6    63   643]
 [    0     0     0 ...  3864   387    17]
 [    0     0     0 ... 21471   296  1032]]
[[    0     0     0 ...  2477     6   656]
 [    0     0     0 ...    29   486   167]
 [    0     0     0 ...   626     7 28257]
 ...
 [    0     0     0 ...    44   554   341]
 [    0     0     0 ...     6   312   829]
 [    0     0     0 ...     6 11215  1311]]


(None, None, None, None)

In [6]:
train_y.dtype, val_y.dtype, train_X.dtype, val_X.dtype

(dtype('int64'), dtype('int64'), dtype('int32'), dtype('int32'))

In [7]:
train_y.shape, train_X.shape, val_y.shape, val_X.shape

((1175509,), (1175509, 100), (130613,), (130613, 100))

In [8]:
def my_metrics(true, preds):
    """
    Function to calculate evaluation metrics 
    parameters: true values, predictions
    prints accuracy, recall, precision and f1 scores
    """
    accuracy = accuracy_score(true, preds)
    precision = precision_score(true, preds)
    recall = recall_score(true, preds)
    f1score = f1_score(true, preds)
    print ('accuracy: {}, precision: {}, recall: {}, f1-score: {}'.format(accuracy, recall, precision, f1score))

In [9]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB().fit(train_X, train_y)

In [10]:
# confusion matrix?? and after metrics
preds = cross_val_predict(gnb, train_X, train_y, cv=5)

In [11]:
from sklearn.metrics import confusion_matrix
confusion_matrix(train_y, preds)

array([[1087811,   15012],
       [  68637,    4049]])

In [12]:
cross_val_score(gnb, train_X, train_y, cv=3, scoring="accuracy")

array([0.92938135, 0.92612981, 0.93007789])

In [13]:
my_metrics(train_y, preds)

accuracy: 0.9288401875272754, precision: 0.05570536279338525, recall: 0.2124232726509627, f1-score: 0.08826446641307072


In [14]:
# mer ugye a sima metricsnel is defineolni kellett h 0 v 1 alapjan nyomja, volt hh naon jo ertek, 
# volt h naon dzar, hat ezert ilyen, 2 class kell

from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1']
print(classification_report(train_y, preds, labels=[0,1], target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.94      0.99      0.96   1102823
     class 1       0.21      0.06      0.09     72686

    accuracy                           0.93   1175509
   macro avg       0.58      0.52      0.53   1175509
weighted avg       0.90      0.93      0.91   1175509



In [15]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=random_seed)
X_, Y_ = sm.fit_sample(train_X, train_y, )
print ('Shape of oversampled data: {}'.format(X_.shape))
print ('Shape of Y: {}'.format(Y_.shape))

Shape of oversampled data: (2205646, 100)
Shape of Y: (2205646,)


In [16]:
X_converted = np.floor(X_).astype(int)

from sklearn.model_selection import cross_val_predict
y_train_pred_balanced = cross_val_predict(gnb, X_converted, Y_, cv=5)

In [17]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_, y_train_pred_balanced)

array([[1088235,   14588],
       [1056210,   46613]])

In [18]:
my_metrics(Y_,y_train_pred_balanced)

accuracy: 0.5145195557219971, precision: 0.04226698209957536, recall: 0.7616378817339586, f1-score: 0.08008941396397325


In [19]:
# mer ugye a sima metricsnel is defineolni kellett h 0 v 1 alapjan nyomja, volt hh naon jo ertek, 
# volt h naon dzar, hat ezert ilyen, 2 class kell

from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1']
print(classification_report(Y_, y_train_pred_balanced, labels=[0,1], target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.51      0.99      0.67   1102823
     class 1       0.76      0.04      0.08   1102823

    accuracy                           0.51   2205646
   macro avg       0.63      0.51      0.38   2205646
weighted avg       0.63      0.51      0.38   2205646



In [20]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(train_y, preds, average='macro')

(0.5765358532379582, 0.5210465121383439, 0.525619845803696, None)

In [21]:
precision_recall_fscore_support(train_y, preds, average='micro')

(0.9288401875272754, 0.9288401875272754, 0.9288401875272754, None)

In [22]:
precision_recall_fscore_support(train_y, preds, average='weighted')

(0.8956196215699282, 0.9288401875272754, 0.9088886752718844, None)

In [23]:
precision_recall_fscore_support(Y_, y_train_pred_balanced, average='macro')

(0.6345524243557141, 0.5145195557219971, 0.3751679552017209, None)

In [24]:
precision_recall_fscore_support(Y_, y_train_pred_balanced, average='micro')

(0.5145195557219971, 0.5145195557219971, 0.5145195557219971, None)

In [25]:
precision_recall_fscore_support(Y_, y_train_pred_balanced, average='weighted')

(0.6345524243557141, 0.5145195557219971, 0.37516795520172086, None)