# City Happiness Index - Composite Indicator Project

In [None]:
import pandas as pd

# Load the dataset. The path is relative, so it is resolved against the
# current working directory; the raw string keeps it free of escape handling.
data_path = r"Expanded_Happiness_Index_Data.csv"
df = pd.read_csv(data_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()


## Step 1: Data Cleaning and Imputation

In [None]:

# Count the missing entries in every column (isna is the same check as isnull).
missing_counts = df.isna().sum()
missing_counts


## Step 2: Multivariate Analysis

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations, restricted to the numeric columns of the frame.
corr_matrix = df.corr(numeric_only=True)

# Render the matrix as a heatmap with the coefficient printed in each cell.
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix of Numeric Features")
plt.show()



## Step 7: Link to Existing Indices

To provide context and benchmark our custom Composite Happiness Index (CHI), we can compare it qualitatively with several globally recognized indices:

### 1. [UN Human Development Index (HDI)](http://hdr.undp.org/en/statistics/hdi/)
- Measures life expectancy, education level, and standard of living.
- Similar to our CHI, it reflects multidimensional well-being but on a national scale.
- Unlike HDI, our CHI adds urban environment metrics like **green space**, **air quality**, and **traffic density**, focusing specifically on **city-level quality of life**.

### 2. [Siemens Green City Index](https://assets.new.siemens.com/siemens/assets/api/uuid:fddc99e7-5907-49aa-92c4-610c0801659e/european-green-city-index.pdf)
- Assesses cities on sustainability: CO₂ emissions, energy use, buildings, and transport.
- Our **Environmental_Quality_Index** is inspired by this but includes **noise levels** and **AQI**, creating a more health-oriented indicator.

### 3. [World Happiness Report](https://worldhappiness.report/)
- Uses survey-based life evaluations and combines GDP, social support, healthy life expectancy, freedom, trust, and generosity.
- Our index replaces survey-based measures with **objective city data**, focusing on measurable urban attributes that influence well-being.

### Key Differences:
- Our CHI is designed to be **reproducible and data-driven**, using public urban indicators.
- It emphasizes **environmental and infrastructural quality**, which are often overlooked in national happiness scores.

These comparisons help validate the scope and relevance of our Composite Happiness Index in the global context.


## Step 3: Normalisation

In [None]:

from sklearn.preprocessing import MinMaxScaler

# Raw indicator columns to rescale, and the names of the columns that will
# hold their min-max scaled counterparts (original name + '_Norm').
normalized_cols = ['Air_Quality_Index', 'Decibel_Level', 'Green_Space_Area', 'Cost_of_Living_Index', 'Healthcare_Index']
norm_names = [f'{name}_Norm' for name in normalized_cols]

# Fit the scaler on the raw columns and store the scaled values alongside them;
# the original columns are left untouched.
scaler = MinMaxScaler()
df[norm_names] = scaler.fit_transform(df[normalized_cols])

df.head()


## Step 4: Weighting and Aggregation

In [None]:

# Build the composite index as a weighted sum of the three sub-indices that
# are already present in the dataset (weights 0.4 + 0.35 + 0.25 = 1).
environment_part = 0.4 * df['Environmental_Quality_Index']
health_part = 0.35 * df['Health_Wellness_Index']
mobility_part = 0.25 * df['Mobility_Comfort_Index']
df['Composite_Happiness_Index'] = environment_part + health_part + mobility_part

# Show the ten highest-scoring rows.
ranking = df[['City', 'Composite_Happiness_Index']]
ranking.sort_values(by='Composite_Happiness_Index', ascending=False).head(10)


## Step 5: Cluster Analysis

In [None]:

from sklearn.cluster import KMeans

# Sub-indices used as clustering inputs.
features = ['Environmental_Quality_Index', 'Health_Wellness_Index', 'Mobility_Comfort_Index']

# Partition the rows into three clusters; the fixed random_state keeps the
# assignment repeatable between runs.
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(df[features])
df['Cluster'] = cluster_labels

# Visualize clusters on two of the three sub-indices, coloured by cluster label.
sns.scatterplot(
    data=df,
    x='Environmental_Quality_Index',
    y='Health_Wellness_Index',
    hue='Cluster',
    palette='Set2',
)
plt.title("City Clusters Based on Sub-Indices")
plt.show()


## Step 6: Visualisation of Composite Index

In [None]:

# Bar plot of top 10 happiest cities.
# Average the composite index per city before ranking: other steps in this
# notebook treat the data as several rows per city (one per Month/Year), so
# ranking raw rows could repeat a city and leave fewer than ten distinct bars.
# With one row per city the per-city mean equals the row value, so the result
# is unchanged in that case.
city_scores = (
    df.groupby('City', as_index=False)['Composite_Happiness_Index']
    .mean()
    .sort_values(by='Composite_Happiness_Index', ascending=False)
)
top_cities = city_scores.head(10)

plt.figure(figsize=(12, 6))
sns.barplot(data=top_cities, x='City', y='Composite_Happiness_Index', palette='viridis')
plt.title("Top 10 Happiest Cities (Composite Index)")
plt.xticks(rotation=45)  # tilt city names so they do not overlap
plt.show()


## Step 7: Link to Existing Indices


Refer to external indices like:
- [UN Human Development Index](http://hdr.undp.org/en/statistics/hdi/)
- [Siemens Green City Index](https://assets.new.siemens.com/siemens/assets/api/uuid:fddc99e7-5907-49aa-92c4-610c0801659e/european-green-city-index.pdf)

Discuss the similarities and differences between these indices and our Composite Happiness Index qualitatively in your report.


## Step 8: Distribution of Sub-Indices

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# One histogram (10 bins, with a KDE curve overlaid) per sub-index,
# each drawn in its own figure.
sub_indices = ['Environmental_Quality_Index', 'Health_Wellness_Index', 'Mobility_Comfort_Index']
for sub_index in sub_indices:
    plt.figure(figsize=(8, 4))
    ax = sns.histplot(df[sub_index], kde=True, bins=10)
    ax.set_title(f'Distribution of {sub_index}')
    ax.set_xlabel(sub_index)
    ax.set_ylabel('Frequency')
    plt.show()


## Step 9: Composite Index by Region (if Region Column Available)

In [None]:

# The regional breakdown is optional: it only runs when the dataset
# actually carries a 'Region' column.
if 'Region' not in df.columns:
    print("Region column not available in dataset.")
else:
    # Mean composite index per region, highest first.
    region_avg = (
        df.groupby('Region')['Composite_Happiness_Index']
        .mean()
        .sort_values(ascending=False)
    )
    ax = region_avg.plot(kind='bar', figsize=(10, 6), title='Average Composite Index by Region')
    ax.set_ylabel('Composite Happiness Index')
    plt.show()


## Step 10: Pairplot for Sub-Indices

In [None]:

# Pairwise scatter plots of the three sub-indices and the composite index,
# with a KDE curve on the diagonal instead of a histogram.
pairplot_cols = [
    'Environmental_Quality_Index',
    'Health_Wellness_Index',
    'Mobility_Comfort_Index',
    'Composite_Happiness_Index',
]
sns.pairplot(df[pairplot_cols], diag_kind='kde')

# y=1.02 lifts the title slightly above the top row of panels.
plt.suptitle('Pairplot of Sub-Indices and Composite Index', y=1.02)
plt.show()


## Step 11: Save Cleaned and Processed Dataset

In [None]:
import os
# Write the processed frame into the output folder, creating the folder
# if it does not exist yet.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, 'Final_Happiness_Index_Data_Processed.csv')
df.to_csv(output_path, index=False)  # index=False: do not write the row index
print('Dataset saved to output/Final_Happiness_Index_Data_Processed.csv')

## Step 12: Composite Index Over Time for Selected Cities

In [None]:

# Line plot of Composite Index over time for a few cities
selected_cities = ['Berlin', 'Tokyo', 'New York', 'Paris']

# Combine the Month and Year columns into a timestamp for the x-axis.
# NOTE(review): the string concatenation assumes Month holds text (for example
# a month name) - confirm against the CSV.
df['Date'] = pd.to_datetime(df['Month'] + ' ' + df['Year'].astype(str))

plt.figure(figsize=(12, 6))
for city in selected_cities:
    # Sort chronologically: plt.plot joins points in row order, so rows that
    # are not stored in date order would make the line double back on itself.
    city_data = df[df['City'] == city].sort_values('Date')
    plt.plot(city_data['Date'], city_data['Composite_Happiness_Index'], label=city)

plt.title("Composite Happiness Index Over Time (Selected Cities)")
plt.xlabel("Month")
plt.ylabel("Composite Happiness Index")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


## Step 13: Heatmap of City vs Month (Composite Index)

In [None]:

# Pivot table and heatmap: mean composite index per city (rows) and month (columns)
pivot = df.pivot_table(index='City', columns='Month', values='Composite_Happiness_Index')

# pivot_table sorts its column labels, which puts text month labels in
# alphabetical order. Reorder them by calendar month instead, parsing the
# labels from Month + ' ' + Year.
# NOTE(review): assumes every Month/Year pair parses as a date - confirm
# against the CSV.
month_numbers = pd.to_datetime(df['Month'] + ' ' + df['Year'].astype(str)).dt.month
month_rank = dict(zip(df['Month'], month_numbers))
pivot = pivot[sorted(pivot.columns, key=month_rank.get)]

plt.figure(figsize=(14, 18))
sns.heatmap(pivot, cmap='YlGnBu', annot=False)
plt.title("Heatmap of Composite Index by City and Month")
plt.xlabel("Month")
plt.ylabel("City")
plt.show()


## Step 14: Boxplot Comparison of Sub-Indices

In [None]:

# Side-by-side box plots of the three sub-indices on a common value axis.
boxplot_cols = ['Environmental_Quality_Index', 'Health_Wellness_Index', 'Mobility_Comfort_Index']

plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df[boxplot_cols])
ax.set_title("Boxplot Comparison of Sub-Indices")
ax.set_ylabel("Index Value")
ax.grid(True)
plt.show()


## Step 15: Scatterplot of Composite Index vs Air Quality

In [None]:

# Scatter of the composite index against the raw air quality index.
# Points are coloured per city; the legend is suppressed.
plt.figure(figsize=(8, 6))
ax = sns.scatterplot(
    data=df,
    x='Air_Quality_Index',
    y='Composite_Happiness_Index',
    hue='City',
    legend=False,
)
ax.set_title("Composite Happiness Index vs Air Quality Index")
ax.set_xlabel("Air Quality Index (Lower is Better)")
ax.set_ylabel("Composite Happiness Index")
ax.grid(True)
plt.show()


## Step 16: Comparison of Top and Bottom Cities by Composite Index

In [None]:

# Average composite index per city, computed once and reused for both rankings
city_means = df.groupby('City')['Composite_Happiness_Index'].mean()

# Top 10 cities (highest average first)
top_10 = city_means.sort_values(ascending=False).head(10)

# Bottom 10 cities (lowest average first)
bottom_10 = city_means.sort_values().head(10)

# Plot side by side.
# Share the x-axis (the index value) so bar lengths are comparable across the
# two panels. The y-axes must stay independent: each panel lists a different
# set of cities, and with sharey=True matplotlib draws y tick labels only on
# the first column, so the bottom-10 panel could not show its own city names.
fig, axes = plt.subplots(1, 2, figsize=(18, 6), sharex=True)

sns.barplot(x=top_10.values, y=top_10.index, ax=axes[0], palette='Greens_r')
axes[0].set_title("Top 10 Cities by Average Composite Happiness Index")
axes[0].set_xlabel("Composite Happiness Index")

sns.barplot(x=bottom_10.values, y=bottom_10.index, ax=axes[1], palette='Reds_r')
axes[1].set_title("Bottom 10 Cities by Average Composite Happiness Index")
axes[1].set_xlabel("Composite Happiness Index")

plt.tight_layout()
plt.show()


## Step 17: Correlation of Sub-Indices with Composite Index

In [None]:

# Correlation between each sub-index and the composite index built from them.
index_cols = [
    'Environmental_Quality_Index',
    'Health_Wellness_Index',
    'Mobility_Comfort_Index',
    'Composite_Happiness_Index',
]
correlation = df[index_cols].corr()

# Annotated heatmap of the 4x4 correlation matrix.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(correlation, annot=True, cmap='coolwarm')
ax.set_title("Correlation of Sub-Indices with Composite Index")
plt.show()


## Step 18: Average Sub-Indices for Top vs Bottom Cities

In [None]:

# City names of the two groups, taken from the rankings computed in the
# top/bottom comparison step (top_10 and bottom_10 must already exist).
top_cities = top_10.index.tolist()
bottom_cities = bottom_10.index.tolist()

sub_index_cols = ['Environmental_Quality_Index', 'Health_Wellness_Index', 'Mobility_Comfort_Index']

# Mean of each sub-index over all rows belonging to each group of cities.
in_top = df['City'].isin(top_cities)
in_bottom = df['City'].isin(bottom_cities)
avg_top = df.loc[in_top, sub_index_cols].mean()
avg_bottom = df.loc[in_bottom, sub_index_cols].mean()

# One row per sub-index, one column per group, drawn as grouped bars.
comparison_df = pd.DataFrame({'Top Cities': avg_top, 'Bottom Cities': avg_bottom})
ax = comparison_df.plot(kind='bar', figsize=(10, 6), title='Average Sub-Indices: Top vs Bottom Cities')
ax.set_ylabel("Average Index Value")
ax.grid(True)
plt.tight_layout()
plt.show()
