In [None]:
import os
import csv

import numpy as np
import pandas as pd

# import xgboost
from sklearn.externals import joblib
from sklearn.utils import compute_class_weight
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from keras import Sequential
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dropout, Dense
from keras.models import load_model

Load cleaned data from pickle file

In [None]:
# Load the cleaned dataset from its joblib pickle (used as a pandas DataFrame
# further down in this notebook).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load pickles that come from a trusted source.
cleaned_df = joblib.load('data/cleaned_df.pkl')

Load the ranked top-50 feature table and keep the first 20 feature names, plus the `device_category` target column, in a list

In [None]:
# Read the ranked feature table from disk.
top_50_feats = pd.read_csv('top50rfinit.csv')

# Keep the first 20 names from the 'variable' column and put the target column
# at the end of the list so it survives the column filter below.
# NOTE(review): the variable names say "top 50" but only 20 names are kept --
# confirm which count is intended.
top_50_feats_list = top_50_feats['variable'].iloc[:20].tolist() + ['device_category']

In [None]:
# Restrict the cleaned data to the selected feature columns plus the target.
# `filter(items=...)` keeps only labels that exist in the frame, so a name in
# the list that is missing from `cleaned_df` is skipped rather than raising.
filtered_df = cleaned_df.filter(items=top_50_feats_list, axis='columns')

In [None]:
# Preview the first rows of the filtered frame as a quick sanity check.
filtered_df.head()

Encode x and y into numpy vectors for neural network

In [None]:
# Split the filtered frame into feature columns and the target column.
x = filtered_df.drop('device_category', axis=1)
y = filtered_df['device_category']

# Plain NumPy feature matrix for Keras.
encoded_x = np.array(x)

# One-hot encode the target labels. The fitted binarizer is kept so that
# predictions can be mapped back to labels via `label_bin.classes_`.
# NOTE: LabelBinarizer emits a single column when `y` has only two classes;
# the softmax head in `build_nn_model` presumably expects three or more
# categories -- TODO confirm for this dataset.
label_bin = LabelBinarizer()
encoded_y = label_bin.fit_transform(y)

# 'balanced' weights, returned in the same order as the `classes` array.
# `classes` and `y` are passed by keyword: newer scikit-learn releases reject
# them as positional arguments, while older releases accept the keyword form
# too, so this spelling works on both.
weights = compute_class_weight('balanced', classes=np.unique(y), y=y)

Neural Network Model Functions

In [None]:
def build_nn_model(x, y):
    """Build and compile a one-hidden-layer softmax classifier.

    Parameters
    ----------
    x : array with a ``shape`` attribute
        Feature matrix. Every axis after the first is used as the input shape
        of the first layer.
    y : array with a ``shape`` attribute
        Encoded label matrix. ``y.shape[1]`` sets the number of output units.

    Returns
    -------
    A compiled ``Sequential`` model: Dense(100, relu) -> Dropout(0.20) ->
    Dense(n_outputs, softmax), optimised with Adam (lr=0.005) on categorical
    cross-entropy and reporting categorical accuracy.
    """
    feature_shape = tuple(x.shape[1:])
    n_outputs = y.shape[1]

    # NOTE: an earlier variant stacked two more Dense(100, relu) +
    # Dropout(0.25) pairs between the hidden layer and the output layer.
    layer_stack = [
        Dense(100, activation='relu', input_shape=feature_shape),
        Dropout(0.20),
        Dense(n_outputs, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    # NOTE: SGD(lr=0.01, momentum=0.9, nesterov=True) was the alternative
    # optimiser that had been tried here.
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizers.Adam(lr=0.005),
        metrics=['categorical_accuracy'],
    )
    return model


def train_nn_model(x, y, model, model_name, weights,
                   epochs=100, batch_size=4000, patience=200):
    """Fit ``model`` on ``x``/``y`` with checkpointing and early stopping.

    Parameters
    ----------
    x, y : arrays
        Training features and encoded labels; the last 20% of the rows are
        held out by Keras as validation data (``validation_split=0.2``).
    model : compiled Keras model
        Trained in place.
    model_name : str
        File path the best-so-far model is written to by ``ModelCheckpoint``.
    weights : dict, sequence, or None
        Per-class weights. A dict is passed to Keras unchanged. Any other
        sequence (for example the array returned by ``compute_class_weight``)
        is converted to ``{class_index: weight}`` by position.
    epochs, batch_size : int, optional
        Forwarded to ``model.fit``. Defaults match the previous hard-coded
        values.
    patience : int, optional
        Epochs without a ``min_delta`` improvement before early stopping.
        With the defaults, ``patience`` (200) exceeds ``epochs`` (100), so
        early stopping cannot trigger; pass a smaller value to enable it.

    Returns
    -------
    None. Exceptions raised during fitting are printed, not re-raised, so the
    notebook keeps running (best-effort behaviour kept as is).
    """
    # Keras documents `class_weight` as a dict mapping class index -> weight,
    # so a bare array of weights has to be converted first.
    # Assumes weights[i] belongs to the class encoded in column i of `y`
    # (both derived from the sorted unique labels) -- TODO confirm.
    if weights is not None and not isinstance(weights, dict):
        weights = {idx: float(w) for idx, w in enumerate(weights)}

    save_checkpoint = ModelCheckpoint(model_name, save_best_only=True, verbose=1)
    early_stop = EarlyStopping(min_delta=0.01, patience=patience, verbose=1, mode='min')
    try:
        model.fit(x, y, batch_size=batch_size, epochs=epochs, verbose=2,
                  class_weight=weights,
                  callbacks=[save_checkpoint, early_stop],
                  validation_split=0.2, shuffle=True)
    except Exception as e:
        # Report the failure but do not abort the notebook run.
        print(e)

In [None]:
# Build the network sized from the feature matrix and the encoded label matrix.
nn_model = build_nn_model(encoded_x, encoded_y)

In [None]:
# Train the network; the best checkpoint is written to 'top50_nn_model.h5'.
train_nn_model(encoded_x, encoded_y, nn_model, model_name='top50_nn_model.h5', weights=weights)

In [None]:
# Rows for every device category except smoke detectors.
no_smoke = filtered_df[filtered_df['device_category'] != 'smoke_detector']

# Stratified sample of the non-smoke rows (features only; the target is the
# last column of `filtered_df`). `stratify` has to be passed by keyword --
# a positional argument after keyword arguments is a SyntaxError.
# NOTE(review): train_size=1 is an int, i.e. exactly one row. With `stratify`
# and more than one class scikit-learn is expected to reject it; the intended
# sample size or fraction cannot be recovered from this notebook -- TODO
# confirm and set.
no_smoke_strat = train_test_split(
    no_smoke.iloc[:, :-1],
    no_smoke.iloc[:, -1],
    train_size=1,
    random_state=1,
    stratify=no_smoke.iloc[:, -1],
)[0]

# Smoke-detector rows only. The filter must be `==`; with `!=` this frame
# would simply duplicate `no_smoke`.
smoke = filtered_df[filtered_df['device_category'] == 'smoke_detector']

# Stack three copies of the smoke-detector rows and drop the trailing target
# column (presumably to reach roughly 500 rows -- TODO confirm).
smoke_500 = pd.concat([smoke, smoke, smoke], ignore_index=True, axis=0)
smoke_500 = smoke_500.iloc[:, :-1]

In [None]:
# Check the size of the stratified non-smoke sample (rows, feature columns).
no_smoke_strat.shape

In [None]:
# Check the size of the triplicated smoke-detector set (rows, feature columns).
smoke_500.shape

In [None]:
# Cluster the feature columns (`x`, i.e. `filtered_df` without
# 'device_category') into 10 groups; random_state is fixed so that reruns
# give the same clustering.
kmeans = KMeans(n_clusters=10, random_state=1).fit(x)