# Smart Building System


*Accuracy : 94.40137831403612*

<!-- *F1 Score : 0.3956073003400257* -->
F1 Score : 0.38970552797725183

*------------------------------------------*



In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from pathlib import Path
import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [None]:
# Every file below the dataset root. Each room is a sub-directory holding one
# CSV per sensor; the directory name is used as the room label further down.
paths = list(Path('/kaggle/input/smart-building-system/KETI/').rglob('*.*'))

## Features path
co2_paths = [path for path in paths if path.name == 'co2.csv']
humidity_paths = [path for path in paths if path.name == 'humidity.csv']
temperature_paths = [path for path in paths if path.name == 'temperature.csv']
light_paths = [path for path in paths if path.name == 'light.csv']


## Target path
pir_paths = [path for path in paths if path.name == 'pir.csv']


def _read_sensor(csv_path, column):
    """Read one two-column sensor CSV into a frame indexed by `unix_time`."""
    return pd.read_csv(csv_path, names=['unix_time', column], index_col='unix_time')


# Insertion order here is the column order of the combined frame. Keep it
# stable: later cells turn the frame into a bare array and rely on position.
sensor_paths = {
    'light': light_paths,
    'temperature': temperature_paths,
    'co2': co2_paths,
    'pir': pir_paths,
    'humidity': humidity_paths,
}

# sensor -> {room directory -> csv path}. Files are matched by the directory
# they live in rather than by list position, so a room that lacks one sensor
# file can no longer shift every following room out of alignment.
files_by_room = {
    sensor: {csv_path.parent: csv_path for csv_path in sensor_list}
    for sensor, sensor_list in sensor_paths.items()
}

# Room directories in the order the directory walk produced them.
sensor_files = {csv_path for sensor_list in sensor_paths.values() for csv_path in sensor_list}
room_dirs = list(dict.fromkeys(path.parent for path in paths if path in sensor_files))

frames = []
for room_dir in room_dirs:
    room_files = {sensor: lookup.get(room_dir) for sensor, lookup in files_by_room.items()}
    absent = [sensor for sensor, csv_path in room_files.items() if csv_path is None]
    if absent:
        # An incomplete room cannot contribute a full feature/target row set.
        print('Skipping room {}: missing {}'.format(room_dir.name, ', '.join(absent)))
        continue

    ## Combine the features and the target (pir) of this room side by side
    df = pd.concat(
        [_read_sensor(csv_path, sensor) for sensor, csv_path in room_files.items()],
        axis=1,
    )
    df['room'] = room_dir.name
    frames.append(df)
data = pd.concat(frames)

In [None]:
# Show the first rows of the combined sensor table, then its row count.
print(
    "------------------Data ------------------",
    data.head(),
    "---------------------------------------------\n",
    sep="\n",
)
print(f'No of rows:{len(data)}')

# Data Cleaning

In [None]:
#Define the function
def percent_missing(df):
    """Return the share of missing values per column, in percent.

    Parameters
    ----------
    df : anything accepted by ``pd.DataFrame`` (a DataFrame, a dict of
        columns, ...).

    Returns
    -------
    dict
        Maps each column label to the percentage of null entries in that
        column, rounded to two decimals, in column order.
    """
    frame = pd.DataFrame(df)
    return {
        column: round(frame[column].isnull().mean() * 100, 2)
        for column in frame
    }

# Rank the columns from the highest to the lowest share of missing values.
missing = percent_missing(data)
df_miss = list(missing.items())
df_miss.sort(key=lambda pair: pair[1], reverse=True)
print('Percent of missing data')
df_miss

In [None]:
# 'pir' becomes the prediction target further down, so a row without a PIR
# reading has no label; drop those rows and show the remaining shape.
data = data.dropna(axis='index', how='any', subset=['pir'])
data.shape

In [None]:
# Recompute the per-column missing percentages on the current `data`,
# highest first.
missing = percent_missing(data)
df_miss = list(missing.items())
df_miss.sort(key=lambda entry: entry[1], reverse=True)
print('Percent of missing data')
df_miss

In [None]:
# Target: take the PIR column out of `data` and binarise it --
# a reading of 0 stays 0, any other reading becomes 1.
y = data.pop("pir")
y = (y != 0).astype('int64')

# Features: whatever is left in `data` after the pop (same object, not a copy).
x = data

print(
    "-------------------------------",
    x.head(),
    "-------------------------------",
    y.head(),
    '----------------------',
    sep="\n",
)
print(f'No of rows:{len(x)}')

In [None]:
from sklearn import preprocessing

# Replace the room names with integer codes so the column is numeric.
le = preprocessing.LabelEncoder()
x['room'] = le.fit_transform(x['room'])

print(
    "-------------------------------",
    f'Room column dtype: {x.room.dtype}',
    "-------------------------------",
    x.head(),
    sep="\n",
)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set (scikit-learn's default test size applies).
# random_state pins the shuffle so the reported metrics can be reproduced
# between runs (0 matches the seed used for the imputer); stratify=y keeps the
# proportion of occupied / unoccupied samples the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    stratify=y,
)

In [None]:
# from sklearn.impute import SimpleImputer

# # Imputation
# my_imputer = SimpleImputer()
# X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
# X_test = pd.DataFrame(my_imputer.transform(X_test))

from sklearn.experimental import enable_iterative_imputer  #

from sklearn.impute import IterativeImputer


In [None]:
# Model-based imputation of the missing sensor readings.
imp = IterativeImputer(
    max_iter=10,                       # at most 10 imputation rounds
    random_state=0,                    # fixed seed
    n_nearest_features=2,              # estimate each feature from 2 others
    initial_strategy='most_frequent',  # first guess before the rounds start
    add_indicator=True,                # append missing-value indicator columns
    verbose=1,
)

# Fit on the training partition only, then apply the fitted imputer to the
# test partition. The tuple is evaluated left to right, so the fit happens
# before the transform.
# NOTE(review): wrapping the result in a new DataFrame gives it default
# integer row and column labels -- confirm nothing later needs the originals.
X_train, X_test = (
    pd.DataFrame(imp.fit_transform(X_train)),
    pd.DataFrame(imp.transform(X_test)),
)

In [None]:
# Inspect the training matrix after imputation. The imputer was created with
# add_indicator=True, so the column count may exceed the number of input features.
X_train.shape

In [None]:
##Accuracy Function
import matplotlib.pyplot as plt  
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix


def acc(clf, y_test, y_pred, X_test):
    """Print classification metrics and plot the confusion matrix.

    Parameters
    ----------
    clf : fitted estimator handed to ``plot_confusion_matrix``.
    y_test : true labels of the test set.
    y_pred : labels predicted for ``X_test``.
    X_test : test features; used only for the confusion-matrix plot.

    Returns
    -------
    None. Everything is reported through ``print`` and a matplotlib figure.
    """
    rule = '------------------------------------------'

    print(rule)
    print('Accuracy :', accuracy_score(y_test, y_pred) * 100)
    print(rule)
    print('Precision :', precision_score(y_test, y_pred))
    print(rule)
    print('Recall :', recall_score(y_test, y_pred))
    print(rule)
    # NOTE: computed from the hard class predictions, not from scores or
    # probabilities.
    print('Roc Auc :', roc_auc_score(y_test, y_pred))
    print(rule)
    print('\n')
    print(rule)
    print('F1 Score :', f1_score(y_test, y_pred))
    print(rule)

    # The third argument is the ground truth. Passing y_pred here would compare
    # the model's predictions with themselves and always draw a perfectly
    # diagonal matrix.
    plot_confusion_matrix(clf, X_test, y_test)
    plt.show()
    

In [None]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features, then classify with XGBoost defaults.
clf = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classification', XGBClassifier()),
    ]
)

# Train on the training partition and predict labels for the held-out one.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)



In [None]:
# Report the metrics of the fitted pipeline on the held-out data.
acc(clf=clf, y_test=y_test, y_pred=y_pred, X_test=X_test)

In [None]:
# from sklearn.linear_model import SGDClassifier

# model= SGDClassifier()

# model.fit(X_train,y_train)

# y_pred= model.predict(X_test)



In [None]:
# acc(model,y_test, y_pred,X_test)

In [None]:
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# model_one =QuadraticDiscriminantAnalysis()


# model_one.fit(X_train,y_train)

# y_pred= model_one.predict(X_test)

# acc(model_one,y_test, y_pred,X_test)

In [None]:
# from sklearn.svm import LinearSVC
# from sklearn.feature_selection import SelectFromModel
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
    
# model_two = Pipeline([
# #   ('feature_selection', SelectFromModel(LinearSVC(penalty="l1"))),
#     ('scaler', StandardScaler()),
#   ('classification',  XGBClassifier())
# ])



# model_two.fit(X_train,y_train)
# y_pred= model_two.predict(X_test)


# acc(model_two,y_test, y_pred,X_test)




In [None]:
# from sklearn.svm import LinearSVC
# from sklearn.feature_selection import SelectFromModel
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
    
# # model_two = Pipeline([
# #   ('feature_selection', SelectFromModel(LinearSVC(penalty="l1"))),
# # #     ('scaler', StandardScaler()),
# #   ('classification',  XGBClassifier())
# # ])

# linear = LinearSVC()
# sfm = SelectFromModel(linear)
# sfm.fit(X_train, y_train)

# X_test_transform = sfm.transform(X_test)

# X_train_transform = sfm.transform(X_train)


# # sfm.fit(X_train,y_train)
# # y_pred= sfm.predict(X_test)


# # acc(model_two,y_test, y_pred,X_test)

# X_test_transform.shape

In [None]:
# model_final = XGBClassifier()
# model_final.fit(X_train_transform,y_train)

# y_pred= model_final.predict(X_test_transform)

# acc(model_final,y_test, y_pred,X_test_transform)