<a href="https://colab.research.google.com/github/rittikarijhwani/ML-lab-sem7/blob/main/exp_8_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#ready
import pandas as pd

df = pd.read_csv('pakistan_hunger_data.csv')
df.head()

Unnamed: 0,City,Year,Population_Under_Poverty,Malnutrition_Rate,Food_Insecurity,Access_to_Clean_Water,Food_Production_Index,Children_Underweight
0,Quetta,2021,23.8,23.0,22.0,82.2,95.1,31.7
1,Hyderabad,2023,29.6,17.7,23.3,82.7,90.8,27.5
2,Rawalpindi,2020,25.3,23.0,24.4,89.0,99.6,25.8
3,Karachi,2021,25.4,19.3,20.6,87.5,98.3,25.4
4,Peshawar,2022,27.5,16.2,21.5,84.8,96.2,33.7


In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

df = pd.read_csv('pakistan_hunger_data.csv')

#features and target variable
X = df[['Population_Under_Poverty', 'Malnutrition_Rate', 'Food_Insecurity', 'Access_to_Clean_Water']]
y = df['Children_Underweight']

#standardizing the features - PCA works best with standardized data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

#explained variance of each principal component
explained_variance = pca.explained_variance_ratio_
print(f"Explained variance by each principal component: {explained_variance}")

# func to calculate all 3 metrics
def calculate_metrics(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, mae, r2

# Model 1: First 1 PC + Target Variable
X_model1 = X_pca[:, :1]  # First PC
X_train, X_test, y_train, y_test = train_test_split(X_model1, y, test_size=0.2, random_state=42) # 80-20 train test split
model1 = LinearRegression()
model1.fit(X_train, y_train) #fitting LR model to data
y_pred1 = model1.predict(X_test) #testing
mse1, mae1, r21 = calculate_metrics(y_test, y_pred1) #metric calculation
print(f"Model 1 - MSE: {mse1}, MAE: {mae1}, R²: {r21}")

# Model 2: First 2 PCs + Target Variable
X_model2 = X_pca[:, :2]  # First 2 PCs
X_train, X_test, y_train, y_test = train_test_split(X_model2, y, test_size=0.2, random_state=42)
model2 = LinearRegression()
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)
mse2, mae2, r22 = calculate_metrics(y_test, y_pred2)
print(f"Model 2 - MSE: {mse2}, MAE: {mae2}, R²: {r22}")

# Model 3: First 3 PCs + Target Variable
X_model3 = X_pca[:, :3]  # First 3 PCs
X_train, X_test, y_train, y_test = train_test_split(X_model3, y, test_size=0.2, random_state=42)
model3 = LinearRegression()
model3.fit(X_train, y_train)
y_pred3 = model3.predict(X_test)
mse3, mae3, r23 = calculate_metrics(y_test, y_pred3)
print(f"Model 3 - MSE: {mse3}, MAE: {mae3}, R²: {r23}")

# Model 4: All original features (no PCA)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
model4 = LinearRegression()
model4.fit(X_train, y_train)
y_pred4 = model4.predict(X_test)
mse4, mae4, r24 = calculate_metrics(y_test, y_pred4)
print(f"Model 4 - MSE: {mse4}, MAE: {mae4}, R²: {r24}")

#comparison table
comparison = pd.DataFrame({
    'Model': ['Model 1 (1 PC)', 'Model 2 (2 PCs)', 'Model 3 (3 PCs)', 'Model 4 (All Features)'],
    'MSE': [mse1, mse2, mse3, mse4],
    'MAE': [mae1, mae2, mae3, mae4],
    'R²': [r21, r22, r23, r24]
})

print("\n")
print(comparison)

Explained variance by each principal component: [0.26886609 0.25661272 0.24604389 0.2284773 ]
Model 1 - MSE: 8.70154909222199, MAE: 2.5253297939824444, R²: -0.03975135155226872
Model 2 - MSE: 8.70856552216576, MAE: 2.5240722064855436, R²: -0.040589747387284714
Model 3 - MSE: 8.76162203800755, MAE: 2.5419694030868625, R²: -0.04692949028484139
Model 4 - MSE: 8.840329136552988, MAE: 2.5620881421524455, R²: -0.05633423089160616
                    Model       MSE       MAE        R²
0          Model 1 (1 PC)  8.701549  2.525330 -0.039751
1         Model 2 (2 PCs)  8.708566  2.524072 -0.040590
2         Model 3 (3 PCs)  8.761622  2.541969 -0.046929
3  Model 4 (All Features)  8.840329  2.562088 -0.056334
