In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from pandas.api.types import is_object_dtype

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics

from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

%matplotlib inline
pd.set_option("display.max_columns",300)
pd.set_option("display.max_rows",300)


ModuleNotFoundError: No module named 'sklearn.cross_validation'

In [None]:

def feature_rank(train,model,n):
    cols=train.columns
    col_indices = np.argsort(model.feature_importances_)[::-1]
    feature_ranking_gb = pd.DataFrame(columns=['indicie', 'variable', 'importance'])
    gb_top_col_list = []
    for f in range(n): 
        z = pd.DataFrame([col_indices[f],cols[col_indices[f]],model.feature_importances_[col_indices[f]]]).transpose()
        z.columns = ['indicie', 'variable', 'importance']
        gb_top_col_list.append(cols[col_indices[f]])
        feature_ranking_gb = feature_ranking_gb.append(z)
    return feature_ranking_gb

### Data Cleaning

The data included is relatively clean other than for null values, so we are going ot just remove the rows with null values as there are not many of them

In [None]:
## Copied JX's code to clean df
## Need only be run once to clean the data, the csv will be used later
df=pd.read_csv('/Users/randyjulian/Downloads/hackathon_IoT_training_set_based_on_01mar2017.csv')

# separating independent and dependent variables
ind = df.iloc[:, :-1]
dep = df.iloc[:, -1]

ind = ind.apply(lambda x: pd.to_numeric(x, errors='coerce'))
df2 = pd.concat([ind, dep], axis=1)
no_nans = df2.isnull().sum()
no_nans.to_csv('number of nans per column.csv')
cleaned_df = df2.dropna(axis=0, how='any')

In [None]:
# Dumping cleaned df as pickle files
joblib.dump(cleaned_df,'cleaned_df.pkl',compress=3)

In [None]:
# reading pickle files for re-use
cleaned_df=joblib.load('cleaned_df.pkl')

### Baseline Model (RandomForest)

In [None]:
# training data
train_init, test_init, train_init_y, test_init_y= train_test_split(cleaned_df.iloc[:,0:-1],cleaned_df.device_category, test_size=0.3,random_state=21374)

In [None]:
#building initial/baseline randomforest model
rf_init=RandomForestClassifier(n_estimators=50, verbose=10,class_weight='balanced',random_state=21374)
rf_init.fit(train_init,train_init_y)

In [None]:
# exporting top 50 var from RF to csv
feature_rank(train_init,rf_init,50).to_csv('top50rfinit.csv')

In [None]:
rf_init.score(test_init,test_init_y)

### Correlation Reduction for Highly Correlated Data

Since the code is meant to run for binary variable instead of multiclass problems, I ran the correlation reduction using one-vs-rest method. Meaning, I ran the code 10 times:
1. For each run, I relabel one device_category as 1 and the rest as 0 (eg. water sensor)
2. It gives me a set of reduced variables when water_sensor is classified as 1 and the rest as 0
3. The method is repeated for each device_category, find the intersection of all 10 set of reduced variables as the final variables used in the model

In [None]:
def correlation_reduction(corrMatrix, threshold=0.5): 
    corrmat = abs(corrMatrix.copy(deep=True))

    #Correlation Threshold
    for col in corrMatrix: 
        if col in corrMatrix.keys(): 
            thisCol=[]
            thisVars=[]
            for i in range(len(corrMatrix)): 
                if (abs(corrMatrix[col][i])==1.0) and (col != corrMatrix.keys()[i]): 
                    thisCorr=0
                else: 
                    #tag the highly corelated one as positive and the rest as negative
                    thisCorr = (1 if abs(corrMatrix[col][i])>threshold else -1) * abs(target[corrMatrix.keys()[i]].values[0])
                thisCol.append(thisCorr)
                thisVars.append(corrMatrix.keys()[i])
            mask = np.ones(len(thisCol), dtype=bool)
            ctDelCol = 0 
            for n, j in enumerate(thisCol): 
                mask[n]=not (j!=max(thisCol) and j >=0)
                 # keep the max, remove the rest of highly correlated ones
                if j !=max(thisCol) and j>=0: 
                    corrMatrix.pop('%s' %thisVars[n])
                    target.pop('%s' %thisVars[n])
                    ctDelCol +=1
            corrMatrix=corrMatrix[mask]
    corrmat = abs(corrMatrix.copy(deep=True))
    return corrmat

In [None]:
# Re-run from here
cleaned_copy=cleaned_df.copy(deep=True)

In [None]:
## change the x value accordingly for each class. 
## It is one-vs-rest problem so the value 1 will be assigned to one class for each model

# cleaned_copy.device_category = cleaned_copy.device_category.apply(lambda x: 0 if x != 'water_sensor' else 1) 
# cleaned_copy.device_category = cleaned_copy.device_category.apply(lambda x: 0 if x != 'thermostat' else 1) 
# cleaned_copy.device_category = cleaned_copy.device_category.apply(lambda x: 0 if x != 'socket' else 1) 
# cleaned_copy.device_category = cleaned_copy.device_category.apply(lambda x: 0 if x != 'lights' else 1) 
# cleaned_copy.device_category = cleaned_copy.device_category.apply(lambda x: 0 if x != 'TV' else 1) 
# cleaned_copy.device_category = cleaned_copy.device_category.apply(lambda x: 0 if x != 'motion_sensor' else 1) 
# cleaned_copy.device_category = cleaned_copy.device_category.apply(lambda x: 0 if x != 'security_camera' else 1) 
# cleaned_copy.device_category = cleaned_copy.device_category.apply(lambda x: 0 if x != 'watch' else 1) 
#cleaned_copy.device_category = cleaned_copy.device_category.apply(lambda x: 0 if x != 'smoke_detector' else 1) 
cleaned_copy.device_category = cleaned_copy.device_category.apply(lambda x: 0 if x != 'baby_monitor' else 1) 


In [None]:
%%time
npMatrix = pd.DataFrame(np.corrcoef(cleaned_copy,rowvar=0))
npMatrix.columns = cleaned_copy.columns
npMatrix.index = cleaned_copy.columns

In [None]:
## forming the new correlation matrix to be feed into the correlation reduction function
corrMatrix = npMatrix.copy(deep=True)
target=corrMatrix[['device_category']].T
corrMatrix.drop('device_category',axis=1,inplace=True)
corrMatrix.drop('device_category',axis=0,inplace=True)

corrMatrix.dropna(how='all',axis=[0,1], inplace=True)

redcorrmat=correlation_reduction(corrMatrix)

In [None]:
redcorrmat.shape

In [None]:
### here I changed the name of the list accordingly
# water_sensor_reduced=list(redcorrmat)
# thermostat_reduced=list(redcorrmat)
# socket_reduced=list(redcorrmat)
# lights_reduced=list(redcorrmat)
# TV_reduced=list(redcorrmat)
# motion_sensor_reduced=list(redcorrmat)
# security_camera_reduced=list(redcorrmat)
# watch_reduced=list(redcorrmat)
# smoke_detector_reduced=list(redcorrmat)
baby_monitor_reduced=list(redcorrmat)


In [None]:
### Finding the intersect of all the list of reduced variables from each class
list(set(thermostat_reduced).intersection(socket_reduced,lights_reduced,TV_reduced,baby_monitor_reduced,motion_sensor_reduced,security_camera_reduced,watch_reduced,water_sensor_reduced,smoke_detector_reduced))

### Random Forest Models

We built 3 models for this competition to compare the accuracy and effectiveness of these models:
1. rf_20 is built with the top 20 variables obtained from the baseline model
2. rf_37 is built with the 37 variables obtained from the correlation reduction section
3. rf_57 is built combining both the top 20 variables and the 37 correlation reduction variables

In [None]:
## RF model for the top 20 variable only
rf_20=RandomForestClassifier(n_estimators=50, verbose=10,class_weight='balanced')
rf_20.fit(train_init.loc[:,list(feature_rank(train_init,rf_init,20).variable)],train_init_y)

In [None]:
## RF model for the 37 variable from correlation reduction only
rf_37=RandomForestClassifier(n_estimators=50, verbose=10,class_weight='balanced')
rf_37.fit(train_init.loc[:,list(set(thermostat_reduced).intersection(socket_reduced,lights_reduced,TV_reduced,baby_monitor_reduced,motion_sensor_reduced,security_camera_reduced,watch_reduced,water_sensor_reduced,smoke_detector_reduced))],train_init_y)

In [None]:
### Combined list from top 20 variables (Random Forest) and 37 variables (Correlation Reduction)
combined_list=list(set(list(set(thermostat_reduced).intersection(socket_reduced,lights_reduced,TV_reduced,baby_monitor_reduced,motion_sensor_reduced,security_camera_reduced,watch_reduced,water_sensor_reduced,smoke_detector_reduced))).union(list(feature_rank(train_init,rf_init,20).variable)))

In [None]:
rf_57=RandomForestClassifier(n_estimators=50, verbose=10,class_weight='balanced')
rf_57.fit(train_init.loc[:,combined_list],train_init_y)

In [None]:
rf_57.score(test_init.loc[:,combined_list], test_init_y)

In [None]:
print(metrics.confusion_matrix(test_init_y,rf_57.predict(test_init.loc[:,combined_list])))

### Resampled Data (resampled data are labelled as xcv..)

The original half a million data have very small ratio of unknown devices and due to the time constraint, training a data on 500,000 rows are taking too much time. We resample the data to allow us to have a 80:20 ratio of unknown devices so the model can learn better and also to limit the learning to lesser amount of data. 

In [None]:
# Resample dataset 1
xcv1=joblib.load('/Users/randyjulian/Downloads/sample (1).pkl')

# Resample dataset 2
xcv2=joblib.load('/Users/randyjulian/Downloads/sample2.pkl')



### IsolationForest

IsolationForest is an algorithm used to detect anomalies by giving each observation a score depending if the algorithm consider the observation as an anomaly.

The IsolationForest algorithm is used in this problem to measure how well we recognize a device. The process is explained below:
1. We train the IsolationForest to learn each device. Hence we built 10 IsolationForest models, one for each device_category. For each model, we only feed in one particular device (eg: lights) in training.
2. So we have 10 models that know really well each of their own device, hence we can use this as a "recognition" model eg: feeding a motion_sensor data to a lights model will return us as unknown/0 and feeding a lights data will return us a known/1.
3. We ran the observation to all the 10 models to give us a vector of 0,1 (0 for unknown, 1 for known) and we sum this binary values across each observation.
4. By theory, for any observations, if they belong to any known device_category, the sum should be greater than 1. Hence, the devices whose sum is 0 will be defined as the unknown device_category.

For the code below, since we don't know how unknown devices are identified, we select one device_category as "unknown" (eg below is lights) in order for us to validate the results of this ensemble.

In [None]:
xcv21=xc2[xc2.device_category!= 'lights']
xcv22=xc2[xc2.device_category== 'lights']

In [None]:
## I removed lights here to be the unknown class
columns=['security_camera','TV','smoke_detector','thermostat','water_sensor','watch','baby_monitor','motion_sensor','socket']
top15rf=list(feature_rank(train_init,rf_init,15).variable)

def check_unknown(columns,dftrain,dftest,top15var):
    df_final=pd.DataFrame()
    model_list=[]
    n=0
    for device in columns:
        dfunknown=dftrain[dftrain.device_category== device]
        iso=IsolationForest(n_estimators=300, contamination=0.1)
        iso.fit(dfunknown.loc[:,top15var])
        df_results= pd.DataFrame(iso.predict(dftest.loc[:,top15var]))
        df_final[device] = iso.predict(dftest.loc[:,top15var])
    df_final['sum_iso']=df_final.apply(sum,axis=1)
    #### This line need to be removed for submission, but it need to be included for validation to measure accuracy
    #df_final['class']=dftest.device_category.reset_index().device_category
    identification_list = []
    for entry in range(len(df_final)):
        data= df_final.iloc[entry]
        if data['sum_iso'] == -len(columns):
            identity = "unknown"
        else:
            identity = "known"
        identification_list.append(identity)
    df_identification = pd.DataFrame(identification_list,columns=['identity'])
    df_final = pd.merge(df_final,df_identification,left_index=True, right_index=True)
    return df_final    
        
    

In [None]:
df_test=check_unknown(columns, xcv1, xcv2, top15rf)

In [None]:
## to create the validation data to check accuracy of the isolationForest
df_test['true_value']= df_test.apply((lambda x: 'unknown' if x[10]== 'lights' else 'known'), axis=1)

In [None]:
# confusion matrix to see the accuracy of the model
first_model_cm=metrics.confusion_matrix(df_test.true_value,df_test.identity)

In [None]:
df_torun=df_test[df_test.identity=='known']

In [None]:
print(rf_20.score(test_init.loc[:,list(feature_rank(train_init,rf_init,20).variable)],test_init_y))
print(rf_37.score(test_init.loc[:,list(set(thermostat_reduced).intersection(socket_reduced,lights_reduced,TV_reduced,baby_monitor_reduced,motion_sensor_reduced,security_camera_reduced,watch_reduced,water_sensor_reduced,smoke_detector_reduced))],test_init_y))
print(rf_57.score(test_init.loc[:,combined_list],test_init_y))

In [None]:
joblib.dump(rf_57,'rf_57.pkl')

In [None]:
len(df_torun.index.tolist())

In [None]:
known_list=df_torun.index.tolist()

In [None]:
xctrain1=xcv2.iloc[known_list,:]
xctrain2=xctrain1.loc[:,combined_list]

In [None]:
result_list=rf_57.predict(xctrain2)

In [None]:
metrics.confusion_matrix(df_torun['class'][df_torun.identity=='known'],result_list)

### Isolation Forest Iter 2

In [None]:
xcv3=joblib.load('sample3.pkl')
xcv4=joblib.load('sample4.pkl')
xcv5=joblib.load('sample5.pkl')

In [None]:
xc34=pd.concat([xcv3,xcv4], axis=0,ignore_index=True)

In [None]:
columns

In [None]:
df_test1=check_unknown(columns, xc34, xcv5, top10rf)

In [None]:
df_test1['true_value']= df_test1.apply((lambda x: 'unknown' if x[10]== 'lights' else 'known'), axis=1)

In [None]:
second_model_cm=metrics.confusion_matrix(df_test1.true_value,df_test1.identity)

In [None]:
second_model_cm

In [None]:
first_model_cm

In [None]:
print(metrics.classification_report(df_test1.true_value,df_test1.identity))

In [None]:
print(metrics.classification_report(df_test.true_value,df_test.identity))

### Validation Set

In [None]:
validation=pd.read_csv('/Users/randyjulian/Downloads/hackathon_IoT_validation_set_based_on_01mar2017_ANONYMIZED.csv')

In [None]:
submission_df.identity.value_counts()

In [None]:
submission_df=check_unknown(list(cleaned_df.device_category.unique()),xcv1,validation,top15rf)

In [None]:
submission_df2=check_unknown(list(cleaned_df.device_category.unique()),xc34,validation,top15rf)

In [None]:
def model_score(submission_df, validation_df,model,xgboost_indicator):
    known_df=submission_df[submission_df.identity=='known']
    known_list=known_df.index.tolist()
    
    train1=validation_df.iloc[known_list,:]
    train2=train1.loc[:,combined_list]
    if xgboost_indicator == False:
        result_list=model.predict(train2)
        return list(result_list),known_list
    else:
        dval = xgb.DMatrix(data=train2)
        xgb_pred = model.predict(dval)
        return list(xgb_pred), known_list

In [None]:
result_list,known_list=model_score(submission_df,validation,rf_57)

In [None]:
result_list_37,known_list_37=model_score(submission_df,validation,rf_37)

In [None]:
def cat_score(submission_df,result_list,known_list):
    identity_list = []
    position = 0 
    for entry in range(len(submission_df)):
        data=submission_df.iloc[entry]
        if entry in known_list:
            identity = result_list[position]
            position +=1
        else:
            identity = "unknown"
        identity_list.append(identity)
    
    df_identity = pd.DataFrame(identity_list,columns=["final_identity"])
    df_final = pd.merge(submission_df, df_identity,left_index=True, right_index=True)
    return df_final
            

In [None]:
df_final_submission = cat_score(submission_df,result_list,known_list)

In [None]:
df_alan1 = df_final_submission.final_identity.to_frame()
df_alan2 = validation.session_ind.to_frame()

In [None]:
df_submission_confirm = pd.merge(df_alan1,df_alan2, left_index=True, right_index=True)

In [None]:
df_submission_confirm.columns = ["device_category", "session_ind"]

In [None]:
df_submission_confirm.set_index('device_category',inplace=True)

In [None]:
df_submission_confirm.to_csv('submission_1.csv')

In [None]:
df_final_submission_37 = cat_score(submission_df,result_list_37,known_list_37)
df_alan1_37 = df_final_submission_37.final_identity.to_frame()
df_alan2_37 = validation.session_ind.to_frame()
df_submission_confirm_37 = pd.merge(df_alan1_37,df_alan2_37, left_index=True, right_index=True)
df_submission_confirm_37.columns = ["device_category", "session_ind"]
df_submission_confirm_37.set_index('device_category',inplace=True)
df_submission_confirm_37.to_csv('submission_2.csv')

### XGB

In [None]:
xgb_1=joblib.load('xgb_2.pkl')

In [None]:
train2=validation.loc[:,combined_list]
dval = xgb.DMatrix(data=train2)
xgb_pred = xgb_1.predict(dval)


In [None]:
xgb_pred

In [None]:
result_list_xgb,known_list_xgb=model_score(submission_df,validation,xgb_1,True)

In [None]:
encoded_id=joblib.load('encoded_id.pkl')

In [None]:
encoded_id

In [None]:
result_list_xgb_final = list(np.array(result_list_xgb) + 1)

In [None]:
identity_list_2 = []
for i in result_list_xgb_final:
    identity = encoded_id[i]
    identity_list_2.append(identity)


In [None]:
df_final_submission_xgb = cat_score(submission_df,identity_list_2,known_list_xgb)
df_alan1_xgb = df_final_submission_xgb.final_identity.to_frame()
df_alan2_xgb = validation.session_ind.to_frame()
df_submission_confirm_xgb = pd.merge(df_alan1_xgb,df_alan2_xgb, left_index=True, right_index=True)
df_submission_confirm_xgb.columns = ["device_category", "session_ind"]
df_submission_confirm_xgb.set_index('device_category',inplace=True)
df_submission_confirm_xgb.to_csv('submission_3.csv')

### XBG with Iso2 

In [None]:
result_list_xgb2,known_list_xgb2=model_score(submission_df2,validation,xgb_1,True)

In [None]:
result_list_xgb_final2 = list(np.array(result_list_xgb2) + 1)

In [None]:
identity_list_22 = []
for i in result_list_xgb_final2:
    identity = encoded_id[i]
    identity_list_22.append(identity)


In [None]:
df_final_submission_xgb2 = cat_score(submission_df2,identity_list_22,known_list_xgb2)
df_alan1_xgb2 = df_final_submission_xgb2.final_identity.to_frame()
df_alan2_xgb2 = validation.session_ind.to_frame()
df_submission_confirm_xgb2 = pd.merge(df_alan1_xgb2,df_alan2_xgb2, left_index=True, right_index=True)
df_submission_confirm_xgb2.columns = ["device_category", "session_ind"]
df_submission_confirm_xgb2.set_index('device_category',inplace=True)
df_submission_confirm_xgb2.to_csv('submission_4.csv')

In [None]:
submission_df2.identity.value_counts()