# Import Libraries

In [None]:
import pandas as pd
import numpy as np 

import os 
import csv


import matplotlib.pyplot as plt
import seaborn as sns

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to the matplotlib plots created below.
sns.set()

# Download DataSet 

In [None]:
# Build one long DataFrame holding the sensor readings of every room.
#
# Layout assumed by the loop below (TODO confirm against the dataset):
#   /kaggle/input/.../<room_id>/<feature>.csv
# Each CSV is read with its first row as a header and two columns that are
# renamed to 'TS_<feature>' and '<feature>'.
room_frames = []

for dirname, _, filenames in os.walk('/kaggle/input'):

    # Only directories holding more than one file are treated as room folders.
    if len(filenames) <= 1:
        continue

    room_id = os.path.basename(dirname)
    print('Room ID: ', room_id)

    sensor_frames = []
    # Sorted so the resulting column order does not depend on filesystem order.
    for filename in sorted(filenames):
        feat_name = filename.split('.')[0]
        sensor_frames.append(
            pd.read_csv(
                os.path.join(dirname, filename),
                names=['TS_' + feat_name, feat_name],
                header=0,
            )
        )

    # NOTE(review): the per-sensor frames are placed side by side on their row
    # index, not matched on timestamp -- confirm that row i of every file
    # belongs to the same moment in time.
    room_df = pd.concat(sensor_frames, axis=1, join="outer")
    room_df['room_ID'] = room_id
    room_frames.append(room_df)

# Concatenate once at the end instead of growing `df` inside the loop, which
# copies all previously loaded rows on every iteration.
df = pd.concat(room_frames, axis=0) if room_frames else pd.DataFrame()
        

In [None]:
# Preview the first two rows of the combined dataset.
df.head(2)

# DataSet Preprocessing

In [None]:
# Share of rows, in percent, that have no PIR reading.
missing_pir_pct = df['pir'].isna().sum() / df.shape[0] * 100
print('Records without pir: {:.2f}%'.format(missing_pir_pct))

In [None]:
# Keep only the rows that have a PIR reading.
# .copy() yields an independent DataFrame, so the columns assigned to `df`
# in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['pir'].notna()].copy()

## Target Variable

Approximately 6% of the PIR data is non-zero, indicating an occupied status of the room. The remaining 94% of the PIR data is zero, indicating an empty room.

In [None]:
def map_target(x):
    """Convert a raw PIR reading into a binary occupancy label.

    Returns 0 (empty room) when ``x`` equals 0 or is otherwise falsy
    (for example ``None``), and 1 (occupied room) for every other value.
    NaN is neither equal to 0 nor falsy, so it maps to 1.
    """
    if x != 0 and x:
        return 1
    return 0
        
# Binary occupancy label derived element-wise from the raw PIR reading.
df['target_pir'] = df['pir'].map(map_target)

In [None]:
# Class balance of the target, in percent of all rows.
# `target_pir` is already assigned right after map_target is defined, so the
# row-wise mapping is not repeated here.
df['target_pir'].value_counts(normalize=True) * 100

# Imbalanced classes: class 0 is 93% of the entire dataset while class 1 is only 7%

## Check missing values for the other variables

In [None]:
# Columns to inspect: everything except the room identifier and the columns
# whose name contains 'TS' (the timestamp columns).
features = [col for col in df.columns if col != 'room_ID' and 'TS' not in col]

# Number of missing values per inspected column.
df[features].isna().sum()

In [None]:
# Preview the first two rows again, now including the target_pir column.
df.head(2)

In [None]:
# Calendar day of each PIR reading.
# NOTE(review): assumes TS_pir holds Unix timestamps in seconds (unit='s') --
# confirm against the data.
df['PIR_date'] = pd.to_datetime(df['TS_pir'], unit='s').dt.to_period('D')

# Percentage of occupied readings per room and day; unstack() turns the days
# into columns so each room becomes one row of the table.
occupancy_freq_df = (
    df.groupby(by=['room_ID', 'PIR_date'])
      .agg({'target_pir': lambda x: np.sum(x) / len(x) * 100})
      .unstack()
)

plt.figure(figsize=(20,5))
# Transposed for plotting: days on the y axis, rooms on the x axis.
sns.heatmap(occupancy_freq_df.T, cmap=sns.color_palette('Blues'), linewidths=0.1, linecolor='white')
# rotation must be a number (or 'vertical'/'horizontal'); recent matplotlib
# versions reject the string '90'.
plt.xticks(rotation=90)
plt.title('Rooms Occupancy per Day')
plt.show()

In [None]:

# Columns to correlate: drop the timestamp columns, the room id, the raw PIR
# signal and PIR_date. PIR_date has a Period dtype that cannot be converted
# to float, which makes DataFrame.corr() fail on pandas >= 2.0.
excluded = ['room_ID', 'pir', 'PIR_date']
features = [f for f in df.columns if 'TS' not in f and f not in excluded]
corr = df[features].corr()


plt.figure(figsize=(15,5))
sns.heatmap(corr, cmap=sns.color_palette('Blues'), linewidths=0.1, linecolor='white', annot=True)
# rotation must be a number (or 'vertical'/'horizontal'); recent matplotlib
# versions reject the string '90'.
plt.xticks(rotation=90)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Modelling columns: every column of df except the timestamp columns (name
# contains 'TS') and the helper/identifier columns listed below.
non_feature_cols = {'room_ID', 'pir', 'PIR_Year_Month', 'PIR_date'}
features = [col for col in df.columns if col not in non_feature_cols and 'TS' not in col]
features

In [None]:
# Box plot of every sensor feature, split by occupancy, on the raw data.
# The target is excluded by name rather than by position, and the grid grows
# with the number of features instead of being fixed at 2x2.
plot_features = [f for f in features if f != 'target_pir']
n_cols = 2
n_rows = (len(plot_features) + n_cols - 1) // n_cols  # ceiling division

plt.figure(figsize=(10,10))
for i, feature in enumerate(plot_features):
    plt.subplot(n_rows, n_cols, i+1)
    sns.boxplot(data=df, y=feature, x='target_pir')
plt.show()

In [None]:
# Outlier filter on 'light': rows at or above the limit are removed.
var = 'light'
var_lim = 1000
# Printed explicitly: Jupyter only displays a bare expression when it is the
# last statement of the cell.
print(df[df[var] >= var_lim].shape)

# Start the cleaned dataset from the raw data. Rows where the value is
# missing are dropped too, because NaN < var_lim evaluates to False.
clean_df = df[df[var] < var_lim]

In [None]:
# Outlier filter on 'temperature': rows at or above the limit are removed.
var = 'temperature'
var_lim = 100
# Printed explicitly: Jupyter only displays a bare expression when it is the
# last statement of the cell. The count is taken on the raw df.
print(df[df[var] >= var_lim].shape)

# Rows where the value is missing are dropped too (NaN < var_lim is False).
clean_df = clean_df[clean_df[var] < var_lim]

In [None]:
# Outlier filter on 'co2': rows at or above the limit are removed.
var = 'co2'
var_lim = 1000
# Printed explicitly: Jupyter only displays a bare expression when it is the
# last statement of the cell. The count is taken on the raw df.
print(df[df[var] >= var_lim].shape)

# Rows where the value is missing are dropped too (NaN < var_lim is False).
clean_df = clean_df[clean_df[var] < var_lim]

In [None]:
# Outlier filter on 'humidity': rows at or below the limit are removed.
var = 'humidity'
var_lim = 30
# Printed explicitly: Jupyter only displays a bare expression when it is the
# last statement of the cell. The count is taken on the raw df.
print(df[df[var] <= var_lim].shape)

# Rows where the value is missing are dropped too (NaN > var_lim is False).
clean_df = clean_df[clean_df[var] > var_lim]

In [None]:
# Box plot of every sensor feature, split by occupancy, after outlier removal.
# The target is excluded by name rather than by position, and the grid grows
# with the number of features instead of being fixed at 2x2.
plot_features = [f for f in features if f != 'target_pir']
n_cols = 2
n_rows = (len(plot_features) + n_cols - 1) // n_cols  # ceiling division

plt.figure(figsize=(10,10))
for i, feature in enumerate(plot_features):
    plt.subplot(n_rows, n_cols, i+1)
    sns.boxplot(data=clean_df, y=feature, x='target_pir')
plt.show()

## Predictive Model  

In [None]:
# Restrict df to the modelling columns. clean_df was derived from df before
# this point and is not affected by this reassignment.
df = df.loc[:, features]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

try:
    # Legacy plotting helpers: removed in scikit-learn 1.2, still imported
    # when an older version is installed.
    from sklearn.metrics import plot_roc_curve, plot_confusion_matrix
except ImportError:
    pass


In [None]:
# Feature matrix (every modelling column except the label) and label vector,
# both taken from the outlier-filtered data.
X = clean_df[[f for f in features if f != 'target_pir']]
y = clean_df['target_pir']

# 70/30 split, stratified on y so both parts keep the same class ratio.
# random_state pins the shuffle so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Show the (rows, columns) shape of the training and test feature matrices.
X_train.shape, X_test.shape

In [None]:

# Random forest with 100 trees, each limited to a depth of 25.
# random_state fixes the forest's internal randomness so repeated runs give
# the same model.
randomforest_clf = RandomForestClassifier(max_depth=25, n_estimators=100, random_state=42)

randomforest_clf.fit(X_train, y_train)

In [None]:
# Predicted occupancy labels for the held-out test set.
y_pred = randomforest_clf.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the predictions on the test set.
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve of the fitted classifier on the test set.
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# (available since 1.0) is its replacement.
RocCurveDisplay.from_estimator(randomforest_clf, X_test, y_test)

In [None]:
# Confusion matrix on the test set; normalize='true' scales each row by the
# number of true samples of that class.
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator (available since 1.0) is its replacement.
ConfusionMatrixDisplay.from_estimator(randomforest_clf, X_test, y_test, normalize='true', cmap='Blues')