# 🌍 Global Pollution Analysis and Energy Recovery
## Machine Learning Assignment

**Objective:** Analyze global pollution data and build linear and logistic regression models to study energy recovery and pollution severity.

## Phase 1: Data Collection and Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    r2_score, mean_squared_error, mean_absolute_error,
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# Default size (width, height in inches) for every figure drawn in this notebook.
plt.rcParams.update({'figure.figsize': (8, 5)})

### Step 1: Load Dataset

In [None]:
# Read the pollution dataset from the working directory, then preview the
# first five rows to sanity-check the columns that were parsed.
df = pd.read_csv(filepath_or_buffer='Global_Pollution_Analysis.csv')
df.head(5)

### Step 2: Handle Missing Values

In [None]:
# Count the missing entries in each column before any imputation.
df.isna().sum()

In [None]:
# Impute gaps in numeric columns with that column's mean. The fill values are
# indexed by numeric column name only, so non-numeric columns are not filled.
column_means = df.mean(numeric_only=True)
df.fillna(column_means, inplace=True)

# Re-count missing values to confirm what is left after imputation.
df.isna().sum()

### Step 3: Data Transformation

In [None]:
# Standardise the three pollution indices to zero mean and unit variance.
# NOTE(review): the scaler is fit on the full dataset, i.e. before any
# train/test split performed later in the notebook.
pollution_cols = ['Air_Pollution_Index', 'Water_Pollution_Index', 'Soil_Pollution_Index']
scaler = StandardScaler()
scaler.fit(df[pollution_cols])
df[pollution_cols] = scaler.transform(df[pollution_cols])

In [None]:
# Encode Country with its own encoder so the code -> country-name mapping
# survives (country_encoder.classes_ / country_encoder.inverse_transform).
# Reusing a single encoder for both columns would overwrite that mapping
# when it is refit on Year.
country_encoder = LabelEncoder()
df['Country'] = country_encoder.fit_transform(df['Country'])

# `le` is kept as the Year encoder, so it ends up fit on Year exactly as before.
# Years are replaced by their rank among the sorted distinct years.
le = LabelEncoder()
df['Year'] = le.fit_transform(df['Year'])

## Phase 1: Exploratory Data Analysis (EDA)

### Descriptive Statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

### Correlation Analysis

In [None]:
# Correlate numeric columns only. A plain df.corr() raises on pandas >= 2.0 as
# soon as the frame holds a non-numeric column, e.g. if this cell is re-run
# after the categorical 'Pollution_Level' column has been added.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

### Visualizations

In [None]:
# Box plot of the air pollution index to show its spread and any outliers.
ax = sns.boxplot(x=df['Air_Pollution_Index'])
ax.set_title('Air Pollution Distribution')
plt.show()

## Phase 1: Feature Engineering

In [None]:
# Derived feature: energy consumption divided element-wise by population.
# NOTE(review): rows with a Population of 0 would produce inf/NaN here - confirm
# the dataset has none.
df['Energy_Consumption_per_Capita'] = df['Energy_Consumption'].div(df['Population'])

## Phase 2: Linear Regression Model

In [None]:
# Predictors and target for the energy-recovery regression.
regression_features = ['Air_Pollution_Index', 'CO2_Emissions', 'Industrial_Waste_in_tons']
X = df[regression_features]
y = df['Energy_Recovery_GWh']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ordinary least squares fit on the training rows, then predict the held-out rows.
model_lr = LinearRegression().fit(X_train, y_train)
y_pred = model_lr.predict(X_test)

# Report goodness of fit and error magnitudes on the held-out rows.
for label, metric in (
    ('R2 Score:', r2_score),
    ('MSE:', mean_squared_error),
    ('MAE:', mean_absolute_error),
):
    print(label, metric(y_test, y_pred))

## Phase 2: Logistic Regression Model

In [None]:
# Bin the air pollution index into terciles to get a three-class severity label.
df['Pollution_Level'] = pd.qcut(df['Air_Pollution_Index'], 3, labels=['Low','Medium','High'])

# The label is a direct function of Air_Pollution_Index, so that column must not
# be a predictor: with it the classifier only re-learns the tercile cut points
# (target leakage) and the reported scores say nothing about the other
# variables. Severity is predicted from the remaining pollution measurements.
X = df[['CO2_Emissions', 'Water_Pollution_Index', 'Soil_Pollution_Index']]
y = df['Pollution_Level']

# Stratify so the train and test sets keep the same Low/Medium/High proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# max_iter is raised above the default to give the solver room to converge.
model_log = LogisticRegression(max_iter=1000)
model_log.fit(X_train, y_train)
y_pred = model_log.predict(X_test)

# Weighted averages account for the per-class support in the test set.
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Recall:', recall_score(y_test, y_pred, average='weighted'))
print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))

In [None]:
# Fix the class order explicitly and show it on both axes. Without tick labels
# the matrix is easy to misread, because the classes are ordered alphabetically
# (High, Low, Medium) rather than by severity.
class_labels = model_log.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Rows are the true classes, columns are the predicted classes.
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_labels, yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

## Phase 3: Insights and Recommendations

- Higher pollution levels significantly impact energy recovery.
- Countries with high industrial waste can benefit from pollution-to-energy strategies.
- Machine Learning models support data-driven environmental planning.