In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the raw dataset into the working DataFrame used by every later cell.
DATA_PATH = 'data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Coerce every monthly consumption column to a numeric dtype.
#
# `months` runs from September back to March and is reused by later cells.
# Any column that is not already numeric is treated as text: thousands
# separators (",") are stripped before parsing. Testing for "not numeric"
# rather than `dtype == 'object'` also covers pandas' dedicated string dtypes,
# which would otherwise skip the comma stripping and turn values such as
# "1,234" into NaN. Values that still cannot be parsed become NaN
# (errors='coerce').
months = ['September', 'August', 'July', 'June', 'May', 'April', 'March']
for month in months:
    column = df[month]
    if not pd.api.types.is_numeric_dtype(column):
        # Literal (non-regex) removal of the thousands separator.
        column = column.astype(str).str.replace(',', '', regex=False)
    df[month] = pd.to_numeric(column, errors='coerce')

In [None]:
# Create average monthly consumption (row-wise mean over the monthly columns;
# NaN months are skipped by pandas' default skipna behaviour).
df['avg_consumption'] = df[months].mean(axis=1)

# Extract latitude and longitude from the combined "lat,long" text column.
# The source column name is used exactly as this notebook references it
# (including the "Longitue" spelling) -- presumably it matches the CSV header.
# The pattern accepts an optional leading sign on each number and optional
# whitespace around the comma, so negative (southern / western) coordinates
# are parsed instead of being dropped or losing their sign.
# Rows that do not match the pattern end up as NaN in both columns.
coords = df['Latitude,Longitue'].str.extract(r'([-+]?[\d.]+)\s*,\s*([-+]?[\d.]+)')
coords.columns = ['Latitude', 'Longitude']
df[['Latitude', 'Longitude']] = coords.astype(float)

print("Data Overview:")
print(df.describe())

In [None]:
# Restrict to numeric columns and compute their pairwise correlations.
df_numeric = df.select_dtypes(include=[np.number])
correlation_matrix = df_numeric.corr()

# Render the correlation matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

In [6]:
# Recompute the per-row average consumption across the monthly columns.
monthly_cols = [
    'September', 'August', 'July', 'June', 'May', 'April', 'March',
]
df['avg_consumption'] = df[monthly_cols].mean(axis='columns')

In [None]:
# Monthly consumption patterns: mean and spread of each monthly column,
# plotted in the order of `months` (September first, March last).
monthly_avg = df[months].mean()
monthly_std = df[months].std()

plt.figure(figsize=(15, 6))
positions = range(len(months))
plt.errorbar(positions, monthly_avg, yerr=monthly_std, fmt='o-')
plt.xticks(positions, months, rotation=45)
plt.title('Monthly Consumption Pattern with Standard Deviation')
plt.ylabel('Average Consumption (units)')
plt.grid(True)
plt.show()

# Calculate month-to-month changes.
# `monthly_avg` is indexed by month name, so positions must be read with
# `.iloc`; plain `monthly_avg[i]` relies on a deprecated integer fallback that
# newer pandas versions treat as a label lookup (and therefore a KeyError).
# Because `months` is listed latest-first, element i+1 is the calendar month
# immediately before element i.
print("\nMonth-to-month average consumption changes:")
for i in range(len(months) - 1):
    later_avg = monthly_avg.iloc[i]
    earlier_avg = monthly_avg.iloc[i + 1]
    change = later_avg - earlier_avg
    percent_change = (change / earlier_avg) * 100
    print(f"{months[i+1]} to {months[i]}: {change:.2f} units ({percent_change:.1f}%)")

In [None]:
# Carpet area versus average consumption: scatter plot, Pearson correlation,
# and a one-variable least-squares fit drawn over the points.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Carpet-Area', y='avg_consumption', ax=ax)
ax.set_title('Carpet Area vs Average Consumption')
ax.set_xlabel('Carpet Area (sq ft)')
ax.set_ylabel('Average Monthly Consumption (units)')

# Pearson correlation coefficient (the p-value is not used).
correlation_bua, _p_value = stats.pearsonr(df['Carpet-Area'], df['avg_consumption'])
print(f"\nCorrelation between Carpet Area and consumption: {correlation_bua:.3f}")

# scikit-learn expects a 2-D feature matrix, hence the single-column reshape.
X_bua = df['Carpet-Area'].to_numpy().reshape(-1, 1)
y_bua = df['avg_consumption'].to_numpy()
reg_bua = LinearRegression()
reg_bua.fit(X_bua, y_bua)

# Overlay the fitted line on the scatter plot.
fitted = reg_bua.predict(X_bua)
ax.plot(X_bua, fitted, color='red', linestyle='--')
plt.show()

# Report the fitted slope and intercept.
slope = reg_bua.coef_[0]
intercept = reg_bua.intercept_
print(f"Linear equation: Consumption = {slope:.2f} × Carpet-Area + {intercept:.2f}")

In [None]:
# Household size versus average consumption: one box per occupant count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='#Person', y='avg_consumption', ax=ax)
ax.set_title('Household Size vs Average Consumption')
ax.set_xlabel('Number of Persons')
ax.set_ylabel('Average Monthly Consumption (units)')
plt.show()

# Mean, standard deviation and number of homes for each household size.
household_consumption = (
    df.groupby('#Person')['avg_consumption']
    .agg(['mean', 'std', 'count'])
)
print("\nConsumption by household size:")
print(household_consumption)

# Pearson correlation coefficient (the p-value is not used).
correlation_size, _p_value = stats.pearsonr(df['#Person'], df['avg_consumption'])
print(f"\nCorrelation between household size and consumption: {correlation_size:.3f}")

In [None]:
# Create derived features from the free-text appliance list and the home size.
appliances = df['Types-of-appliances']

# Number of comma-separated entries in the appliance list.
df['appliance_count'] = appliances.str.count(',') + 1

# Binary flags for specific appliances.
#   na=False    -> a missing appliance list counts as "not present" instead of
#                  producing NaN, which would make .astype(int) raise.
#   case=False  -> tolerate inconsistent capitalisation in the text.
#   regex=False -> match the appliance name literally, not as a pattern.
df['has_ac'] = appliances.str.contains(
    'Air conditioner', case=False, regex=False, na=False
).astype(int)
df['has_heater'] = appliances.str.contains(
    'Water Heater', case=False, regex=False, na=False
).astype(int)

df['area_per_person'] = df['Carpet-Area'] / df['#Person']

# Prepare enhanced feature set
X_enhanced = df[[
    'Carpet-Area', '#Person', 'appliance_count', 'has_ac',
    'has_heater', 'area_per_person'
]]
y = df['avg_consumption']

# Split and train: 80/20 split with a fixed seed so the result is reproducible.
X_train_enh, X_test_enh, y_train_enh, y_test_enh = train_test_split(
    X_enhanced, y, test_size=0.2, random_state=42
)

# Fit enhanced model
model_enhanced = LinearRegression()
model_enhanced.fit(X_train_enh, y_train_enh)

# Evaluate on the held-out 20%.
y_pred_enh = model_enhanced.predict(X_test_enh)
r2_enh = r2_score(y_test_enh, y_pred_enh)
rmse_enh = np.sqrt(mean_squared_error(y_test_enh, y_pred_enh))

print("\nEnhanced Model Results:")
print(f"R-squared score: {r2_enh:.3f}")
print(f"RMSE: {rmse_enh:.3f}")

# NOTE: these are raw regression coefficients on unscaled features, so their
# magnitudes depend on each feature's units and are not directly comparable.
print("\nFeature importance:")
for feature, coef in zip(X_enhanced.columns, model_enhanced.coef_):
    print(f"{feature}: {coef:.3f}")

In [None]:
# Frequency of each appliance across all homes.
# Split each comma-separated list, flatten to one appliance per row, trim
# surrounding whitespace, then count occurrences.
appliance_lists = df['Types-of-appliances'].str.split(',')
appliance_names = appliance_lists.explode().str.strip()
appliance_counts = appliance_names.value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=appliance_counts.values, y=appliance_counts.index, ax=ax)
ax.set_title('Distribution of Appliances')
ax.set_xlabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Headline consumption metrics, computed first and printed afterwards.
per_person_mean = (df['avg_consumption'] / df['#Person']).mean()
area_per_person_mean = df['area_per_person'].mean()
typical_household = df['#Person'].mode().values[0]
ac_share_pct = df['has_ac'].mean() * 100
ac_mean = df.loc[df['has_ac'] == 1, 'avg_consumption'].mean()
non_ac_mean = df.loc[df['has_ac'] == 0, 'avg_consumption'].mean()

print("\nKey Consumption Metrics:")
print(f"Average consumption per person: {per_person_mean:.2f} units")
print(f"Average area per person: {area_per_person_mean:.2f} sq ft")
print(f"Most common household size: {typical_household} persons")
print(f"Percentage of homes with AC: {ac_share_pct:.1f}%")
print(f"Average consumption by AC homes: {ac_mean:.2f} units")
print(f"Average consumption by non-AC homes: {non_ac_mean:.2f} units")

In [None]:
# Efficiency: average monthly consumption divided by carpet area.
df['consumption_per_area'] = df['avg_consumption'].div(df['Carpet-Area'])
mean_per_area = df['consumption_per_area'].mean()
print(f"\nAverage consumption per square foot: {mean_per_area:.3f} units")

# Columns shown for the three best and three worst homes by this ratio.
report_cols = ['Name', 'Carpet-Area', 'avg_consumption', 'consumption_per_area']

print("\nMost efficient homes (lowest consumption per area):")
most_efficient = df.nsmallest(3, 'consumption_per_area')
print(most_efficient[report_cols])

print("\nLeast efficient homes (highest consumption per area):")
least_efficient = df.nlargest(3, 'consumption_per_area')
print(least_efficient[report_cols])

## Consumption Distribution, Outliers and Summary

In [None]:
# Histogram (20 bins) of average monthly consumption with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['avg_consumption'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Average Monthly Consumption')
ax.set_xlabel('Average Monthly Consumption (units)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Horizontal boxplot of average monthly consumption, used to spot outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='avg_consumption', ax=ax)
ax.set_title('Boxplot of Average Monthly Consumption')
ax.set_xlabel('Average Monthly Consumption (units)')
ax.grid(True)
plt.show()


In [None]:
# Summary statistics of the per-home average monthly consumption.
consumption_series = df['avg_consumption']
total_homes = len(df)
avg_consumption = consumption_series.mean()
median_consumption = consumption_series.median()
max_consumption = consumption_series.max()
min_consumption = consumption_series.min()

# Build the report lines first, then print them in order.
summary_lines = [
    "\nSummary of Key Metrics:",
    f"Total number of homes: {total_homes}",
    f"Average consumption: {avg_consumption:.2f} units",
    f"Median consumption: {median_consumption:.2f} units",
    f"Maximum consumption: {max_consumption:.2f} units",
    f"Minimum consumption: {min_consumption:.2f} units",
]
for line in summary_lines:
    print(line)