In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Loading Data Set and Basic Data Exploration

In this part I load the dataset with the pandas library and do some basic exploration. The dataset has 405184 rows and 8 columns, plus the `ts` timestamp, which is used as the index. The fields are:
          
          1. ts (timestamp) ==> epoch
          2. device id ==> object
          3. CO (Carbon Monoxide) in ppm ==> float64
          4. humidity in percent ==> float64
          5. light ==> bool
          6. LPG (liquefied petroleum gas) in ppm ==> float64
          7. motion ==> bool
          8. smoke in ppm ==> float64
          9. Temperature in Fahrenheit ==> float64
          
There are no missing values in this dataset. 

In [None]:
# Location of the telemetry CSV inside the Kaggle input directory.
path = '../input/environmental-sensor-data-132k/iot_telemetry_data.csv'
# Load the CSV and use the 'ts' column as the DataFrame index.
dataIot = pd.read_csv(path, index_col='ts')
# Preview the first rows.
dataIot.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataIot.shape

In [None]:
# Summary statistics of the loaded frame.
dataIot.describe()

In [None]:
# Column names, dtypes and non-null counts.
dataIot.info()

In [None]:
# Number of missing values in each column.
dataIot.isnull().sum()

# Data Preprocessing

Before analyzing the data in more detail, I do some preprocessing to get it ready. I remove the **motion** column because it contains only False values, so I assume it cannot help predict the result we want. Then I label-encode two columns: **device** and **light**. The device column is encoded because it represents the environmental condition that we want to predict. The device values and the labels that represent them are:

        1. 00:0f:00:70:91:0a (stable conditions, cooler and more humid) ==> 0
        2. 1c:bf:ce:15:ec:4d (highly variable temperature and humidity) ==> 1
        3. b8:27:eb:bf:9d:51 (stable conditions, warmer and dryer) ==> 2

For the light column, I represent the absence of light with label 0 (False) and its presence with label 1 (True), because the light column has a boolean data type.

In [None]:
# Build the working frame from the raw data without the 'motion' column.
# drop() returns a new DataFrame, so dataIot itself is left untouched.
data = dataIot.drop(columns=['motion'])
data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode each device id as an integer label, stored in a one-column frame
# ('condition') that shares the index of `data`.
encLab = LabelEncoder()
deviceEnc = pd.DataFrame(
    {'condition': encLab.fit_transform(data['device'])},
    index=data.index,
)

# Append the encoded column to the right of the existing columns.
dataEnc = pd.concat([data, deviceEnc], axis=1)
dataEnc.head()

In [None]:
# Encode the 'light' column as integer labels in a one-column frame
# ('lights') that shares the index of dataEnc.
encLab2 = LabelEncoder()
lightEnc = pd.DataFrame(
    {'lights': encLab2.fit_transform(dataEnc['light'])},
    index=dataEnc.index,
)

# Append the encoded column to the right of the existing columns.
dataEnc2 = pd.concat([dataEnc, lightEnc], axis=1)
dataEnc2.head()

In [None]:
# The raw 'device' and 'light' columns are now represented by their encoded
# counterparts, so remove them from the processed frame.
dataProcessed = dataEnc2.drop(columns=['device', 'light'])
dataProcessed.head()

# Exploratory Data Analysis

## Univariate Analysis

In [None]:
# Draw a kernel density estimate for every column with more than 5 distinct
# values (the columns treated as continuous in this notebook).
continuousCols = [c for c in dataProcessed.columns if dataProcessed[c].nunique() > 5]
for col in continuousCols:
    plt.figure(figsize=(8, 6))
    sns.kdeplot(data=dataProcessed, x=col)
    plt.show()

In [None]:
def barplot(columnname):
    """Plot how often each distinct value of a column occurs.

    Parameters
    ----------
    columnname : str
        Name of a column in the notebook-level ``dataProcessed`` frame.
        The name is also used as the figure title.
    """
    valCount = dataProcessed[columnname].value_counts()

    plt.figure(figsize=(10, 6))
    # seaborn >= 0.12 accepts x and y only as keyword arguments; passing them
    # positionally raises a TypeError.
    sns.barplot(x=valCount.index, y=valCount.values)
    plt.title(columnname)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Bar-plot every low-cardinality column (at most 5 distinct values).
categoricalCols = [c for c in dataProcessed.columns if dataProcessed[c].nunique() <= 5]
for col in categoricalCols:
    barplot(col)

## Multivariate Analysis

In [None]:
# One box plot per continuous column (more than 5 distinct values), grouped
# by the encoded 'condition' label.
for feature in dataProcessed.columns:
    if dataProcessed[feature].nunique() <= 5:
        continue
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=dataProcessed, x='condition', y=feature)
    plt.show()
        

In [None]:
# Pairwise correlation of the remaining columns; the encoded 'lights' and
# 'condition' columns are left out.
dataCorr = dataProcessed.drop(columns=['lights', 'condition'])
corr = dataCorr.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, linewidths=.5)

## Data Cleaning

 I removed some columns that have a high correlation coefficient with each other. I also deleted outliers in the numerical data. Then I standardized the values of the cleaned data using the standard scaler from scikit-learn. 

In [None]:
# Columns to remove from the processed data before cleaning.
removedCol = ['lpg', 'smoke', 'co']
dataCleanInput = dataProcessed.copy()

dataCleaned = dataCleanInput.drop(columns=removedCol)
dataCleaned.head()

In [None]:
# Apply the 1.5 * IQR outlier rule to the continuous sensor readings only.
# The encoded 'condition' target and the binary 'lights' flag are excluded:
# they are labels, not measurements, and when at least 75% of the rows share
# one value in such a column its IQR is 0, so every row holding any other
# value would be discarded as an "outlier".
numericCols = dataCleaned.columns.difference(['condition', 'lights'])
Q1 = dataCleaned[numericCols].quantile(.25)
Q3 = dataCleaned[numericCols].quantile(.75)
IQR = Q3 - Q1
lowerBound = Q1 - 1.5 * IQR
upperBound = Q3 + 1.5 * IQR

# A row is an outlier if any of its continuous readings falls outside the bounds.
isOutlier = (
    (dataCleaned[numericCols] < lowerBound) | (dataCleaned[numericCols] > upperBound)
).any(axis=1)

# .copy() gives later cells an independent frame rather than a slice of dataCleaned.
dataClean = dataCleaned[~isOutlier].copy()
dataClean.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize 'humidity' and attach the result as a new 'humid' column,
# aligned on the index of dataClean.
scaler1 = StandardScaler()
humidScaled = pd.DataFrame(
    scaler1.fit_transform(dataClean[['humidity']]),
    columns=['humid'],
    index=dataClean.index,
)
dataClean1 = pd.concat([dataClean, humidScaled], axis=1)
dataClean1.head()

In [None]:
# Standardize 'temp' with its own scaler and attach the result as a new
# 'temperature' column. Re-using scaler1 here would refit it on temperature
# and overwrite the humidity statistics it learned, leaving scaler2 unused.
scaler2 = StandardScaler()
tempScaled = pd.DataFrame(
    scaler2.fit_transform(dataClean[['temp']]),
    columns=['temperature'],
    index=dataClean1.index,
)
dataClean2 = pd.concat([dataClean1, tempScaled], axis=1)
dataClean2.head()

In [None]:
# Target: the encoded device / environment label.
y = dataClean2['condition']
# Features: standardized temperature and humidity plus the encoded light flag.
# 'temp' is the raw reading; 'temperature' is its standardized counterpart,
# which matches the already-standardized 'humid' column.
X = dataClean2[['temperature', 'humid', 'lights']]
# Preview only the first rows instead of rendering the whole frame.
X.head()

## Data Splitting

First, I split the whole dataset into two parts: a training dataset (80%) and a test dataset (20%). After that, I split the training dataset into two parts: training data (80%) and validation data (20%). 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=0)


In [None]:
# Split the training portion again: 80% for fitting, 20% for validation.
xTrain, xVal, yTrain, yVal = train_test_split(x_train, y_train,test_size=0.2, random_state=0 )

## Machine Learning Modelling

In this section I build a baseline model to find out which model and hyperparameters are appropriate for classifying the environment condition. I try to solve the problem with a Random Forest classifier. The baseline model is trained on the training dataset and verified on the validation dataset. Then I evaluate the baseline model and the final model with a confusion matrix, precision, and recall.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Baseline: a depth-1 random forest fitted on the inner training split and
# scored on the validation split.
model = RandomForestClassifier(max_depth=1, random_state=0).fit(xTrain, yTrain)
predResult = model.predict(xVal)

report = classification_report(yVal, predResult)
confMat = confusion_matrix(yVal, predResult)
for summary in (report, confMat):
    print(summary)

In [None]:
# Final model: refit on the full training portion (training + validation rows)
# and evaluate on the held-out test set. Scoring on x_train would only measure
# how well the model reproduces the data it was fitted on.
model = RandomForestClassifier(random_state=0, max_depth=1)
model.fit(x_train, y_train)
prediction = model.predict(x_test)
reports = classification_report(y_test, prediction)
conf = confusion_matrix(y_test, prediction)
print(reports)
print(conf)

## Discussion 

From the analysis above we know that some features can be removed to obtain a better-performing classifier. The removed features are co, lpg and smoke, because they are highly correlated with each other. Hence, we can build a model with only the humidity, temperature and lights features. 