ML Project

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [74]:
# Load dataset
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk, which avoids mixed-type columns
# (cloudCover holds both numbers and the literal text "cloudCover").
df = pd.read_csv("dataset/HomeC.csv", low_memory=False)  # Change filename if needed

  df = pd.read_csv("dataset/HomeC.csv")  # Change filename if needed


In [75]:
# Display the freshly loaded frame (pandas abbreviates the rows and columns shown).
df

Unnamed: 0,time,use [kW],gen [kW],House overall [kW],Dishwasher [kW],Furnace 1 [kW],Furnace 2 [kW],Home office [kW],Fridge [kW],Wine cellar [kW],...,visibility,summary,apparentTemperature,pressure,windSpeed,cloudCover,windBearing,precipIntensity,dewPoint,precipProbability
0,1451624400,0.932833,0.003483,0.932833,0.000033,0.020700,0.061917,0.442633,0.124150,0.006983,...,10.00,Clear,29.26,1016.91,9.18,cloudCover,282.0,0.0000,24.40,0.00
1,1451624401,0.934333,0.003467,0.934333,0.000000,0.020717,0.063817,0.444067,0.124000,0.006983,...,10.00,Clear,29.26,1016.91,9.18,cloudCover,282.0,0.0000,24.40,0.00
2,1451624402,0.931817,0.003467,0.931817,0.000017,0.020700,0.062317,0.446067,0.123533,0.006983,...,10.00,Clear,29.26,1016.91,9.18,cloudCover,282.0,0.0000,24.40,0.00
3,1451624403,1.022050,0.003483,1.022050,0.000017,0.106900,0.068517,0.446583,0.123133,0.006983,...,10.00,Clear,29.26,1016.91,9.18,cloudCover,282.0,0.0000,24.40,0.00
4,1451624404,1.139400,0.003467,1.139400,0.000133,0.236933,0.063983,0.446533,0.122850,0.006850,...,10.00,Clear,29.26,1016.91,9.18,cloudCover,282.0,0.0000,24.40,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503906,1452128306,1.599333,0.003233,1.599333,0.000050,0.104017,0.625033,0.041750,0.005233,0.008433,...,8.74,Light Rain,29.45,1011.49,6.72,0.31,186.0,0.0101,31.27,0.51
503907,1452128307,1.924267,0.003217,1.924267,0.000033,0.422383,0.637733,0.042033,0.004983,0.008467,...,8.74,Light Rain,29.45,1011.49,6.72,0.31,186.0,0.0101,31.27,0.51
503908,1452128308,1.978200,0.003217,1.978200,0.000050,0.495667,0.620367,0.042100,0.005333,0.008233,...,8.74,Light Rain,29.45,1011.49,6.72,0.31,186.0,0.0101,31.27,0.51
503909,1452128309,1.990950,0.003233,1.990950,0.000050,0.494700,0.634133,0.042100,0.004917,0.008133,...,8.74,Light Rain,29.45,1011.49,6.72,0.31,186.0,0.0101,31.27,0.51


In [76]:
# Rolling statistics of the whole-house load over a trailing 60-row window
# (60 seconds if the data is sampled once per second -- TODO confirm).
house_load_window = df['House overall [kW]'].rolling(window=60)
df['moving_average'] = house_load_window.mean()
df['std_dev'] = house_load_window.std()

# Tolerance band: rolling mean +/- two rolling standard deviations
band_half_width = 2 * df['std_dev']
df['upper_threshold'] = df['moving_average'] + band_half_width
df['lower_threshold'] = df['moving_average'] - band_half_width

# Every row starts out as "not an anomaly" (0)
df['anomaly_flag'] = 0

In [77]:
# Mark a row as anomalous (1) when its load falls outside the threshold band.
# Rows whose thresholds are NaN compare as False and therefore stay at 0.
house_load = df['House overall [kW]']
outside_band = (house_load > df['upper_threshold']) | (house_load < df['lower_threshold'])
df.loc[outside_band, 'anomaly_flag'] = 1

In [78]:
# Clean the cloudCover column: some rows carry the literal text "cloudCover"
# instead of a number, so drop those rows and then cast the column to float.
is_placeholder = df['cloudCover'] == "cloudCover"
cloud_cover_index = df.index[is_placeholder]
df.drop(index=cloud_cover_index, inplace=True)
df['cloudCover'] = df['cloudCover'].astype(float)
df

Unnamed: 0,time,use [kW],gen [kW],House overall [kW],Dishwasher [kW],Furnace 1 [kW],Furnace 2 [kW],Home office [kW],Fridge [kW],Wine cellar [kW],...,cloudCover,windBearing,precipIntensity,dewPoint,precipProbability,moving_average,std_dev,upper_threshold,lower_threshold,anomaly_flag
58,1451624458,0.714200,0.003417,0.714200,0.000033,0.021083,0.309983,0.043067,0.005167,0.123317,...,0.75,285.0,0.0000,23.90,0.00,,,,,0
59,1451624459,0.497067,0.003417,0.497067,0.000017,0.096983,0.062867,0.043283,0.005000,0.123283,...,0.75,285.0,0.0000,23.90,0.00,1.044130,0.443877,1.931884,0.156375,0
60,1451624460,0.465133,0.003450,0.465133,0.000017,0.064500,0.062633,0.043250,0.005017,0.123350,...,0.75,285.0,0.0000,23.90,0.00,1.036335,0.449930,1.936195,0.136474,0
61,1451624461,0.512933,0.003417,0.512933,0.000017,0.111333,0.063883,0.043300,0.004967,0.123867,...,0.75,285.0,0.0000,23.90,0.00,1.029311,0.454812,1.938936,0.119687,0
62,1451624462,0.651283,0.003417,0.651283,0.000017,0.114583,0.063200,0.043283,0.114167,0.124267,...,0.75,285.0,0.0000,23.90,0.00,1.024636,0.457267,1.939169,0.110103,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503906,1452128306,1.599333,0.003233,1.599333,0.000050,0.104017,0.625033,0.041750,0.005233,0.008433,...,0.31,186.0,0.0101,31.27,0.51,1.439551,0.518371,2.476293,0.402810,0
503907,1452128307,1.924267,0.003217,1.924267,0.000033,0.422383,0.637733,0.042033,0.004983,0.008467,...,0.31,186.0,0.0101,31.27,0.51,1.446714,0.522099,2.490911,0.402516,0
503908,1452128308,1.978200,0.003217,1.978200,0.000050,0.495667,0.620367,0.042100,0.005333,0.008233,...,0.31,186.0,0.0101,31.27,0.51,1.454722,0.526560,2.507842,0.401602,0
503909,1452128309,1.990950,0.003233,1.990950,0.000050,0.494700,0.634133,0.042100,0.004917,0.008133,...,0.31,186.0,0.0101,31.27,0.51,1.467884,0.529975,2.527835,0.407934,0


In [79]:
# Drop rows with NaN values
# (this includes the leading rows whose 60-row rolling statistics are not yet defined)
df.dropna(inplace=True)

In [80]:
# Inspect the frame again after the NaN rows have been removed.
df

Unnamed: 0,time,use [kW],gen [kW],House overall [kW],Dishwasher [kW],Furnace 1 [kW],Furnace 2 [kW],Home office [kW],Fridge [kW],Wine cellar [kW],...,cloudCover,windBearing,precipIntensity,dewPoint,precipProbability,moving_average,std_dev,upper_threshold,lower_threshold,anomaly_flag
59,1451624459,0.497067,0.003417,0.497067,0.000017,0.096983,0.062867,0.043283,0.005000,0.123283,...,0.75,285.0,0.0000,23.90,0.00,1.044130,0.443877,1.931884,0.156375,0
60,1451624460,0.465133,0.003450,0.465133,0.000017,0.064500,0.062633,0.043250,0.005017,0.123350,...,0.75,285.0,0.0000,23.90,0.00,1.036335,0.449930,1.936195,0.136474,0
61,1451624461,0.512933,0.003417,0.512933,0.000017,0.111333,0.063883,0.043300,0.004967,0.123867,...,0.75,285.0,0.0000,23.90,0.00,1.029311,0.454812,1.938936,0.119687,0
62,1451624462,0.651283,0.003417,0.651283,0.000017,0.114583,0.063200,0.043283,0.114167,0.124267,...,0.75,285.0,0.0000,23.90,0.00,1.024636,0.457267,1.939169,0.110103,0
63,1451624463,0.710450,0.003433,0.710450,0.000050,0.142050,0.062783,0.043283,0.140700,0.124317,...,0.75,285.0,0.0000,23.90,0.00,1.019443,0.459062,1.937567,0.101318,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503905,1452128305,1.601233,0.003183,1.601233,0.000050,0.085267,0.642417,0.041783,0.005267,0.008667,...,0.31,186.0,0.0101,31.27,0.51,1.438614,0.518128,2.474869,0.402359,0
503906,1452128306,1.599333,0.003233,1.599333,0.000050,0.104017,0.625033,0.041750,0.005233,0.008433,...,0.31,186.0,0.0101,31.27,0.51,1.439551,0.518371,2.476293,0.402810,0
503907,1452128307,1.924267,0.003217,1.924267,0.000033,0.422383,0.637733,0.042033,0.004983,0.008467,...,0.31,186.0,0.0101,31.27,0.51,1.446714,0.522099,2.490911,0.402516,0
503908,1452128308,1.978200,0.003217,1.978200,0.000050,0.495667,0.620367,0.042100,0.005333,0.008233,...,0.31,186.0,0.0101,31.27,0.51,1.454722,0.526560,2.507842,0.401602,0


In [81]:
# Features and target.
# Only total usage plus the weather readings are fed to the models. The wider
# candidate set ('gen [kW]', 'Dishwasher [kW]', 'Furnace 1 [kW]', 'Furnace 2 [kW]',
# 'Home office [kW]', 'Fridge [kW]', 'Wine cellar [kW]', 'Garage door [kW]',
# 'Kitchen 12 [kW]', 'Kitchen 14 [kW]', 'Kitchen 38 [kW]', 'Barn [kW]',
# 'Well [kW]', 'Microwave [kW]', 'Living room [kW]', 'Solar [kW]') is left out
# of this run.
feature_columns = [
    'use [kW]',
    'temperature',
    'humidity',
    'pressure',
    'windSpeed',
    'cloudCover',
]
X = df[feature_columns]
y = df['anomaly_flag']

In [82]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
_split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = _split_parts

In [83]:
# Standardize the features: learn the scaling from the training split only,
# then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [84]:
# 1. Logistic Regression
# Fit on the training split, predict the held-out split, then report
# per-class precision / recall / F1.
log_model = LogisticRegression().fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
log_report = classification_report(y_test, y_pred_log)
print("Logistic Regression Report:\n", log_report)

Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     92657
           1       0.24      0.01      0.02      8114

    accuracy                           0.92    100771
   macro avg       0.58      0.50      0.49    100771
weighted avg       0.86      0.92      0.88    100771



In [85]:
# 2. Decision Tree
# random_state is pinned (same seed as the train/test split) so that the
# fitted tree, and therefore the report, is reproducible across reruns.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)
print("Decision Tree Report:\n", classification_report(y_test, y_pred_tree))

Decision Tree Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97     92657
           1       0.64      0.63      0.64      8114

    accuracy                           0.94    100771
   macro avg       0.81      0.80      0.80    100771
weighted avg       0.94      0.94      0.94    100771



In [86]:
# 3. Random Forest
# random_state is pinned (same seed as the train/test split) so that the
# bootstrap samples and feature draws, and therefore the report, are
# reproducible across reruns.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.97     92657
           1       0.73      0.65      0.69      8114

    accuracy                           0.95    100771
   macro avg       0.85      0.82      0.83    100771
weighted avg       0.95      0.95      0.95    100771



In [None]:
# 4. Support Vector Machine
# Kernel SVC training time grows at least quadratically with the number of
# samples, so fitting on the full training split is impractical. Train on a
# fixed-size, seeded random subsample instead; evaluation still uses the
# whole test split.
SVM_MAX_TRAIN_SAMPLES = 20000
svm_rng = np.random.RandomState(42)
svm_train_idx = svm_rng.choice(
    len(X_train), size=min(SVM_MAX_TRAIN_SAMPLES, len(X_train)), replace=False)

svm_model = SVC()
# np.asarray makes positional indexing work whether the splits are arrays
# or pandas objects.
svm_model.fit(np.asarray(X_train)[svm_train_idx],
              np.asarray(y_train)[svm_train_idx])
y_pred_svm = svm_model.predict(X_test)
print("Support Vector Machine Report:\n",
      classification_report(y_test, y_pred_svm))

In [None]:
def plot_variance_ratio(pca):
    """Plot the cumulative explained-variance ratio of a fitted PCA.

    Parameters
    ----------
    pca : object
        A fitted decomposition exposing ``explained_variance_ratio_``
        (for example ``sklearn.decomposition.PCA``).

    The x-axis range and ticks are derived from the number of components in
    ``pca`` itself, so the function does not depend on any notebook-level
    variable and works for any number of components.
    """
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    n_components = len(cumulative_variance)

    plt.plot(range(1, n_components + 1), cumulative_variance,
             linestyle='-', marker='o')
    plt.grid(linestyle='-', linewidth=1)
    # One unit of padding on the right so the last marker is not on the border.
    plt.xlim([0, n_components + 1])
    # Slight headroom above 1.0 so the final point (cumulative ratio of 1)
    # is not clipped by the top edge.
    plt.ylim([0, 1.05])

    plt.yticks(np.arange(0, 1.1, 0.1))
    plt.xticks(np.arange(0, n_components + 1, 1))

    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.show()

In [None]:
# 5. Dimensionality Reduction using PCA for Random Forest
# n_components was previously left blank, which is a syntax error. A float in
# (0, 1) keeps the smallest number of components whose cumulative explained
# variance reaches that fraction; the full SVD solver is requested explicitly
# because that selection mode relies on it.
pca = PCA(n_components=0.95, svd_solver='full')
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
plot_variance_ratio(pca)

In [None]:
# Fit a PCA that keeps every component, purely to inspect how the cumulative
# explained variance builds up across components.
pca = PCA()
pca.fit(X_train)
plot_variance_ratio(pca)

In [None]:
# Random Forest trained on the PCA-reduced features.
# random_state is pinned (same seed as the train/test split) so the report is
# reproducible across reruns.
rf_model_pca = RandomForestClassifier(random_state=42)
rf_model_pca.fit(X_train_pca, y_train)
y_pred_rf_pca = rf_model_pca.predict(X_test_pca)
print("Random Forest with PCA Report:\n",
      classification_report(y_test, y_pred_rf_pca))

In [None]:
# 6. Dimensionality Reduction using PCA for SVM
# Kernel SVC training time grows at least quadratically with the number of
# samples, so fitting on the full training split is impractical. Train on a
# fixed-size, seeded random subsample of the PCA-reduced training data;
# evaluation still uses the whole PCA-reduced test split.
SVM_PCA_MAX_TRAIN_SAMPLES = 20000
svm_pca_rng = np.random.RandomState(42)
svm_pca_train_idx = svm_pca_rng.choice(
    len(X_train_pca), size=min(SVM_PCA_MAX_TRAIN_SAMPLES, len(X_train_pca)),
    replace=False)

svm_model_pca = SVC()
# np.asarray makes positional indexing work whether the inputs are arrays
# or pandas objects.
svm_model_pca.fit(np.asarray(X_train_pca)[svm_pca_train_idx],
                  np.asarray(y_train)[svm_pca_train_idx])
y_pred_svm_pca = svm_model_pca.predict(X_test_pca)
print("Support Vector Machine with PCA Report:\n",
      classification_report(y_test, y_pred_svm_pca))

In [None]:
# 7. K-Means Clustering (Unsupervised)
# K-Means is distance based, so the features are standardized first; otherwise
# columns measured on larger scales dominate the cluster assignment.
X_kmeans = StandardScaler().fit_transform(X)

# Assuming 2 clusters for anomalies and normal.
# n_init and random_state are pinned so the clustering is reproducible and
# does not depend on the library version's default number of restarts.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
df['kmeans_cluster'] = kmeans.fit_predict(X_kmeans)

# Plot K-Means clustering results
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['temperature'], y=df['House overall [kW]'],
                hue=df['kmeans_cluster'], palette='deep')
plt.title('K-Means Clustering Results')
plt.show()

In [None]:
# 8. Agglomerative Clustering (Unsupervised)
# Hierarchical clustering needs memory that grows quadratically with the
# number of rows, so it cannot be fitted on the full frame. Cluster a seeded
# random sample instead.
AGGLO_SAMPLE_SIZE = 5000
agglo_sample = X.sample(n=min(AGGLO_SAMPLE_SIZE, len(X)), random_state=42)

# Standardize so that no single large-scale feature dominates the distances.
agglo = AgglomerativeClustering(n_clusters=2)
agglo_labels = agglo.fit_predict(StandardScaler().fit_transform(agglo_sample))

# Index-aligned assignment: only sampled rows receive a label, all other rows
# are NaN in 'agglo_cluster'.
df['agglo_cluster'] = pd.Series(agglo_labels, index=agglo_sample.index)

# Plot Agglomerative Clustering results (sampled rows only)
agglo_plot = df.loc[agglo_sample.index]
plt.figure(figsize=(10, 6))
sns.scatterplot(x=agglo_plot['temperature'], y=agglo_plot['House overall [kW]'],
                hue=agglo_plot['agglo_cluster'].astype(int), palette='deep')
plt.title('Agglomerative Clustering Results')
plt.show()

In [None]:
# 9. Multi-Layer Perceptron
# MLPClassifier is already imported in the notebook's import cell, so it is
# not imported again here. random_state is pinned (same seed as the
# train/test split) so weight initialization, and therefore the report, is
# reproducible across reruns.
mlp_model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000,
                          random_state=42)
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)
print("Multi-Layer Perceptron Report:\n",
      classification_report(y_test, y_pred_mlp))