In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('smart_home_device_usage_data.csv')
df.head()

Unnamed: 0,UserID,DeviceType,UsageHoursPerDay,EnergyConsumption,UserPreferences,MalfunctionIncidents,DeviceAgeMonths,SmartHomeEfficiency
0,1,Smart Speaker,15.307188,1.961607,1,4,36,1
1,2,Camera,19.973343,8.610689,1,0,29,1
2,3,Security System,18.911535,2.651777,1,0,20,1
3,4,Camera,7.011127,2.341653,0,3,15,0
4,5,Camera,22.610684,4.859069,1,3,36,1


In [4]:
df.describe()

Unnamed: 0,UserID,UsageHoursPerDay,EnergyConsumption,UserPreferences,MalfunctionIncidents,DeviceAgeMonths,SmartHomeEfficiency
count,5403.0,5403.0,5403.0,5403.0,5403.0,5403.0,5403.0
mean,2702.0,12.052992,5.054302,0.511753,2.066445,30.312234,0.376643
std,1559.856083,6.714961,2.878941,0.499908,1.423291,16.990525,0.484589
min,1.0,0.501241,0.101562,0.0,0.0,1.0,0.0
25%,1351.5,6.297871,2.524968,0.0,1.0,15.0,0.0
50%,2702.0,11.903768,5.007047,1.0,2.0,30.0,0.0
75%,4052.5,17.791751,7.611912,1.0,3.0,45.0,1.0
max,5403.0,23.987326,9.998071,1.0,4.0,59.0,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5403 entries, 0 to 5402
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UserID                5403 non-null   int64  
 1   DeviceType            5403 non-null   object 
 2   UsageHoursPerDay      5403 non-null   float64
 3   EnergyConsumption     5403 non-null   float64
 4   UserPreferences       5403 non-null   int64  
 5   MalfunctionIncidents  5403 non-null   int64  
 6   DeviceAgeMonths       5403 non-null   int64  
 7   SmartHomeEfficiency   5403 non-null   int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 337.8+ KB


In [6]:
df.isnull().sum()

UserID                  0
DeviceType              0
UsageHoursPerDay        0
EnergyConsumption       0
UserPreferences         0
MalfunctionIncidents    0
DeviceAgeMonths         0
SmartHomeEfficiency     0
dtype: int64

In [7]:
df['DeviceType'].value_counts()

DeviceType
Smart Speaker      1108
Camera             1101
Lights             1087
Security System    1068
Thermostat         1039
Name: count, dtype: int64

In [8]:
# thermostat devices include HVAC,AC,Heaters etc
# dropping user_id column 

df = df.drop(columns =['UserID'])
df.head()

Unnamed: 0,DeviceType,UsageHoursPerDay,EnergyConsumption,UserPreferences,MalfunctionIncidents,DeviceAgeMonths,SmartHomeEfficiency
0,Smart Speaker,15.307188,1.961607,1,4,36,1
1,Camera,19.973343,8.610689,1,0,29,1
2,Security System,18.911535,2.651777,1,0,20,1
3,Camera,7.011127,2.341653,0,3,15,0
4,Camera,22.610684,4.859069,1,3,36,1


In [10]:
df.shape

(5403, 7)

In [15]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [11]:
X = df.drop('SmartHomeEfficiency', axis=1)
y = df['SmartHomeEfficiency']

In [13]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

In [16]:
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer([('OneHotEncoder',cat_transformer,cat_features),('StandardScalar',num_transformer,num_features),])
X = preprocessor.fit_transform(X)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train.shape, X_test.shape

((3620, 10), (1783, 10))

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

model = LogisticRegression()
model.fit(X_train,y_train)

In [20]:
y_pred_train_lr = model.predict(X_train)

y_pred_test_lr = model.predict(X_test)

accuracy_train = accuracy_score(y_pred_train_lr,y_train)
print(accuracy_train)

0.8773480662983425


In [22]:
confusion_matrix = confusion_matrix(y_pred_train_lr,y_train)
print(confusion_matrix)

[[2042  223]
 [ 221 1134]]


In [23]:
print(classification_report(y_pred_train_lr,y_train))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90      2265
           1       0.84      0.84      0.84      1355

    accuracy                           0.88      3620
   macro avg       0.87      0.87      0.87      3620
weighted avg       0.88      0.88      0.88      3620



In [25]:
# Logistic regression on Test set
print(classification_report(y_pred_test_lr,y_test))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1095
           1       0.83      0.82      0.82       688

    accuracy                           0.86      1783
   macro avg       0.86      0.85      0.86      1783
weighted avg       0.86      0.86      0.86      1783



In [26]:
model_rf = RandomForestClassifier()

model_rf.fit(X_train,y_train)

y_pred_train = model_rf.predict(X_train)

y_pred_test = model_rf.predict(X_test)

In [27]:
print(classification_report(y_pred_train,y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2264
           1       1.00      1.00      1.00      1356

    accuracy                           1.00      3620
   macro avg       1.00      1.00      1.00      3620
weighted avg       1.00      1.00      1.00      3620



In [28]:
print(classification_report(y_pred_test,y_test))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1126
           1       0.92      0.95      0.94       657

    accuracy                           0.95      1783
   macro avg       0.95      0.95      0.95      1783
weighted avg       0.95      0.95      0.95      1783



<font face='Arial' color='black' size='4'>
    The randomforest model is doing really well with the train and the test sets. So finalising the Random Forest model
    </font>