1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from mpl_toolkits.mplot3d import Axes3D


2. Load the main dataset


In [None]:
# Read the Excel workbook into a DataFrame and preview its first five rows
data_path = 'data.xls'
data = pd.read_excel(io=data_path)
print(data.head(n=5))



3. Drop the 'Date' and 'Time' columns if they are present


In [None]:

# Remove the 'Date' / 'Time' columns (matched by name, wherever they appear);
# the frame is left untouched when neither column exists.
datetime_cols = [col for col in ('Date', 'Time') if col in data.columns]
if datetime_cols:
    data = data.drop(columns=datetime_cols)

4. Select only numeric columns for PCA


In [None]:
# Keep only the numeric columns; these are the inputs to scaling and PCA below
data_numaric = data.select_dtypes(include='number')

5. Standardize the data


In [None]:
# Fit the scaler on the numeric features, then transform those same features
# so each column is standardized before PCA / KMeans.
scaler = StandardScaler().fit(data_numaric)
X_scaled = scaler.transform(data_numaric)

6. Perform PCA with a large number of components to analyze variance


In [None]:
# Analyze how much variance is explained as principal components are added.
# A single full PCA fit gives the per-component variance ratios; their running
# sum is the cumulative explained variance for every possible component count,
# so there is no need to refit one model per count. Fitting without
# n_components also keeps the analysis valid when there are fewer samples than
# features (PCA then caps the component count itself).
pca_full = PCA()
pca_full.fit(X_scaled)
explained_variance_ratios = list(np.cumsum(pca_full.explained_variance_ratio_))
for i, cumulative_ratio in enumerate(explained_variance_ratios, start=1):
    print(f"#Number of components: {i}, Explained variance: {cumulative_ratio}")

# x-axis is derived from the number of ratios actually computed so the two
# sequences always have the same length.
component_counts = range(1, len(explained_variance_ratios) + 1)
plt.plot(component_counts, explained_variance_ratios, marker='o')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance ratio')
plt.title('Cumulative Variance Explained by PCA Components')
plt.show()

# Choose number of components based on variance explained (e.g., 18 components for ~88% variance).
# The target is clamped to what the data can support, because PCA raises when
# n_components exceeds min(n_samples, n_features).
n_components = min(18, *X_scaled.shape)
# random_state only has an effect if scikit-learn selects a randomized solver;
# it is set to the same seed the KMeans cells use so reruns are reproducible.
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Create a DataFrame for PCA results (one column per retained component: PC1, PC2, ...)
pca_columns = [f'PC{i+1}' for i in range(n_components)]
pca_df = pd.DataFrame(data=X_pca, columns=pca_columns)



7. Elbow method to determine the optimal number of clusters


In [None]:
# Elbow method: fit KMeans on the standardized features for k = 1..10 and
# record the inertia of each fit.
K = range(1, 11)
inertia = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot inertia against the number of clusters to look for the elbow
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(K, inertia, marker='o')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method For Optimal Number of Clusters')
plt.show()

8. Clustering using K-MEANS

In [None]:
# Fit KMeans with 7 clusters on the standardized features (not on the PCA
# projection) and take the label assigned to each row.
kmeans = KMeans(n_clusters=7, random_state=42).fit(X_scaled)
clusters = kmeans.labels_

# Attach the labels to the PCA scores so the plots below can colour by cluster
pca_df['Cluster'] = clusters

# 2D view: first two principal components, coloured by cluster
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Cluster', palette='viridis', ax=scatter_ax)
scatter_ax.set_title('PCA - Heat Pump Data with KMeans Clusters')
scatter_ax.set_xlabel('Principal Component 1')
scatter_ax.set_ylabel('Principal Component 2')
scatter_ax.legend(title='Cluster')
plt.show()

# 3D view: first three principal components, coloured by cluster
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
pc1, pc2, pc3 = pca_df['PC1'], pca_df['PC2'], pca_df['PC3']
ax.scatter(pc1, pc2, pc3, c=pca_df['Cluster'], cmap='viridis')
ax.set_title('3D PCA - Heat Pump Data with KMeans Clusters')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
plt.show()

# Silhouette score of the clustering, computed in the standardized feature space
silhouette_avg = silhouette_score(X_scaled, clusters)
print(f'Silhouette Score: {silhouette_avg:.2f}')



9. Plot heatmap of feature importance in PCA


In [None]:
# Loadings: each component's direction scaled by the square root of its
# explained variance, arranged as features (rows) x components (columns).
component_scale = np.sqrt(pca.explained_variance_)
loadings = pca.components_.T * component_scale
loadings_df = pd.DataFrame(loadings, index=data_numaric.columns, columns=pca_columns)

# Annotated heatmap of the loadings
heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(loadings_df, annot=True, cmap='viridis', ax=heat_ax)
heat_ax.set_title('PCA Loadings Heatmap')
heat_ax.set_xlabel('Principal Components')
heat_ax.set_ylabel('Features')
plt.show()


10. Save the principal components to a CSV file if needed


In [None]:
# Write the PCA result table to disk without the row index.
# Note: pca_df also carries the 'Cluster' column once the clustering cell has run.
output_path = 'pca_results.csv'
pca_df.to_csv(output_path, index=False)


11. Scree plot and cumulative variance plot


In [None]:
# Per-component explained variance of the retained components and its running total
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Bars show each component's own share; the step line shows the cumulative share
component_numbers = range(1, n_components + 1)
scree_fig, scree_ax = plt.subplots(figsize=(10, 6))
scree_ax.bar(component_numbers, explained_variance, alpha=0.5, align='center',
             label='Individual explained variance')
scree_ax.step(component_numbers, cumulative_variance, where='mid',
              label='Cumulative explained variance')
scree_ax.set_ylabel('Explained variance ratio')
scree_ax.set_xlabel('Principal components')
scree_ax.set_title('Scree Plot and Cumulative Explained Variance')
scree_ax.legend(loc='best')
plt.show()


12. Correlation circle for the first two principal components


In [None]:
# Correlation circle: draw every feature as an arrow from the origin to its
# (PC1, PC2) loading, with the unit circle for reference.
circle_fig, circle_ax = plt.subplots(figsize=(10, 10))
origin = np.zeros(len(loadings))
circle_ax.quiver(origin, origin, loadings[:, 0], loadings[:, 1],
                 angles='xy', scale_units='xy', scale=1)

# Label each arrow tip with the name of the feature it represents
for feature_name, (pc1_loading, pc2_loading) in zip(data_numaric.columns, loadings[:, :2]):
    circle_ax.text(pc1_loading, pc2_loading, feature_name, color='r')

circle_ax.set_xlim(-1, 1)
circle_ax.set_ylim(-1, 1)
circle_ax.set_xlabel('Principal Component 1')
circle_ax.set_ylabel('Principal Component 2')
circle_ax.set_title('Correlation Circle for PCA')
circle_ax.grid()
circle_ax.axhline(0, color='grey', linestyle='--')
circle_ax.axvline(0, color='grey', linestyle='--')

# Unit circle outline
circle = plt.Circle((0, 0), 1, color='b', fill=False)
circle_ax.add_artist(circle)
plt.show()