# Machine Learning for Poverty Prediction

**Project**: Big Data Pipeline for Poverty Mapping in Sumatra  
**Team**: Kelompok 18  
**Objective**: Build and evaluate machine learning models to predict poverty levels

## Table of Contents
1. [Data Preparation](#data-prep)
2. [Feature Engineering](#feature-engineering)
3. [Model Building](#model-building)
4. [Model Evaluation](#model-evaluation)
5. [Feature Importance Analysis](#feature-importance)
6. [Predictions and Insights](#predictions)
7. [Model Deployment Preparation](#deployment)

## 1. Data Preparation <a id="data-prep"></a>

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

print("🤖 Machine Learning libraries imported successfully!")

In [None]:
# Read the Sumatra poverty profile CSV into a DataFrame and report its size
df = pd.read_csv('/data/Profil_Kemiskinan_Sumatera.csv')
n_rows, n_cols = df.shape
print(f"📊 Dataset loaded: {n_rows} rows, {n_cols} columns")

# List every column name with a 1-based, right-aligned position number
print("\nColumns in dataset:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position:2d}. {column_name}")

In [None]:
# Clean the raw frame before modelling
print("🧹 Data Preprocessing...")

# Report the total number of missing cells, then drop every row that contains
# at least one of them (simplest strategy; no imputation is attempted)
missing_before = df.isnull().sum().sum()
print(f"Missing values before: {missing_before}")

df_processed = df.dropna()
missing_after = df_processed.isnull().sum().sum()
print(f"Missing values after: {missing_after}")

rows_kept = len(df_processed)
print(f"Rows remaining: {rows_kept} ({rows_kept/len(df)*100:.1f}%)")

# Independent copy so the feature columns added later do not touch df_processed
df_ml = df_processed.copy()

## 2. Feature Engineering <a id="feature-engineering"></a>

In [None]:
# Create poverty level categories for classification
def categorize_poverty(poverty_rate):
    """Bucket a poverty percentage into an ordinal class.

    Returns 0 (Low) for rates below 10, 1 (Medium) for rates below 20,
    and 2 (High) for everything else. A value that compares false against
    both thresholds (e.g. NaN) therefore also ends up in class 2.
    """
    # Upper bounds are exclusive; the position of the first bound that the
    # rate falls under is the class label.
    for level, upper_bound in enumerate((10, 20)):
        if poverty_rate < upper_bound:
            return level
    return 2

# Derive the 3-class target (0 = Low, 1 = Medium, 2 = High) from the poverty percentage
df_ml['poverty_level'] = df_ml['Persentase Kemiskinan (%)'].apply(categorize_poverty)

# Integer-encode each categorical column into a sibling "<name>_encoded" column.
# The fitted encoders are kept, keyed by the original column name, so the
# mapping can be reused or inverted later.
label_encoders = {}
categorical_cols = [
    'Provinsi', 'Komoditas', 'Golongan Pengeluaran', 'Akses Pendidikan',
    'Fasilitas Kesehatan', 'Akses Air Bersih', 'Kategori Kemiskinan',
]

for col in categorical_cols:
    # Columns absent from this dataset are skipped silently
    if col not in df_ml.columns:
        continue
    le = LabelEncoder()
    df_ml[f'{col}_encoded'] = le.fit_transform(df_ml[col])
    label_encoders[col] = le

print("✅ Feature engineering completed")
print(f"New dataset shape: {df_ml.shape}")

In [None]:
# Derive extra numeric features from the cleaned columns
print("🔧 Creating additional features...")

# log(1 + population): compresses the wide range of raw population counts
df_ml['pop_log'] = np.log1p(df_ml['Jumlah Penduduk (jiwa)'])

# Unemployment rate divided by (poverty rate + 1); the +1 keeps the denominator non-zero
# when the poverty rate is 0.
# NOTE(review): this is computed from 'Persentase Kemiskinan (%)', the column the
# prediction targets come from, so using it as a model input would leak the target.
unemployment_rate = df_ml['Tingkat Pengangguran (%)']
poverty_rate_plus_one = df_ml['Persentase Kemiskinan (%)'] + 1
df_ml['unemployment_poverty_ratio'] = unemployment_rate / poverty_rate_plus_one

# Ordinal lookup used for the education-access column
infrastructure_map = {'baik': 3, 'sedang': 2, 'buruk': 1, 'memadai': 3, 'tidak memadai': 1, 'ya': 1, 'tidak': 0}

# (score column, source column, category -> score lookup); categories missing
# from a lookup become NaN under .map() and are then scored 0
score_sources = (
    ('education_score', 'Akses Pendidikan', infrastructure_map),
    ('health_score', 'Fasilitas Kesehatan', {'memadai': 1, 'tidak memadai': 0}),
    ('water_score', 'Akses Air Bersih', {'ya': 1, 'tidak': 0}),
)
for score_col, source_col, lookup in score_sources:
    df_ml[score_col] = df_ml[source_col].map(lookup).fillna(0)

# Combined infrastructure score: row-wise total of the three component scores
df_ml['infrastructure_score'] = df_ml[['education_score', 'health_score', 'water_score']].sum(axis=1)

print("✅ Additional features created")

## 3. Model Building <a id="model-building"></a>

In [None]:
# Prepare features for modeling.
#
# 'unemployment_poverty_ratio' is deliberately NOT a candidate feature: it is
# computed as unemployment / (poverty rate + 1), i.e. directly from
# 'Persentase Kemiskinan (%)', which is the regression target and the source of
# the classification target. Together with the unemployment rate it lets a model
# reconstruct the target exactly (target leakage), which inflates every
# evaluation metric, and it cannot be computed for new data whose poverty rate
# is unknown.
feature_cols = ['Tingkat Pengangguran (%)', 'Jumlah Penduduk (jiwa)', 'Konsumsi (per kapita per minggu)',
               'Provinsi_encoded', 'Golongan Pengeluaran_encoded', 'pop_log',
               'infrastructure_score']

# Keep only the candidates that actually exist in this dataset
available_features = [col for col in feature_cols if col in df_ml.columns]
if not available_features:
    # Fail here with a clear message rather than later inside scikit-learn
    raise ValueError(f"None of the expected feature columns are present in the dataset: {feature_cols}")
print(f"Available features for modeling: {available_features}")

# Feature matrix plus the two targets: ordinal class (0/1/2) and raw poverty percentage
X = df_ml[available_features]
y_classification = df_ml['poverty_level']
y_regression = df_ml['Persentase Kemiskinan (%)']

print(f"Feature matrix shape: {X.shape}")
print(f"Target distribution (classification): {y_classification.value_counts().to_dict()}")

In [None]:
# Split data for classification (stratified so both partitions keep the class proportions)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X, y_classification, test_size=0.2, random_state=42, stratify=y_classification
)

# Split data for regression (plain random split)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_regression, test_size=0.2, random_state=42
)

# Scale features.
# Only the classification split is stratified, so the two tasks may train on
# different rows. Each task therefore gets its own StandardScaler, fitted on its
# own training partition only. Refitting a single instance for the regression
# data would overwrite the statistics used to scale the classification data,
# leaving no fitted scaler that matches the classifiers.
scaler_clf = StandardScaler()
X_train_clf_scaled = scaler_clf.fit_transform(X_train_clf)
X_test_clf_scaled = scaler_clf.transform(X_test_clf)

scaler_reg = StandardScaler()
X_train_reg_scaled = scaler_reg.fit_transform(X_train_reg)
X_test_reg_scaled = scaler_reg.transform(X_test_reg)

# Backward-compatible alias: code that refers to `scaler` still receives the
# regression-fitted scaler, as before. Use `scaler_clf` with the classifiers.
scaler = scaler_reg

print("✅ Data split and scaled successfully")
print(f"Training set size: {X_train_clf.shape[0]}")
print(f"Test set size: {X_test_clf.shape[0]}")

In [None]:
# Train the three classifiers and score each one on the classification test split
print("🔨 Building Classification Models...")

classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True)
}

# model name -> fitted estimator, its test metrics, and its test predictions
clf_results = {}

for name, classifier in classifiers.items():
    print(f"\nTraining {name}...")

    # The random forest is fitted on the raw features; the other models use the
    # standardized copies
    if name == 'Random Forest':
        train_features, test_features = X_train_clf, X_test_clf
    else:
        train_features, test_features = X_train_clf_scaled, X_test_clf_scaled

    classifier.fit(train_features, y_train_clf)
    y_pred = classifier.predict(test_features)

    # Precision, recall and F1 are per-class scores averaged with class-support weights
    accuracy = accuracy_score(y_test_clf, y_pred)
    precision, recall, f1 = (
        metric(y_test_clf, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )

    clf_results[name] = dict(
        model=classifier,
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
        predictions=y_pred,
    )

    print(f"Accuracy: {accuracy:.3f}")
    print(f"F1-Score: {f1:.3f}")

print("\n✅ Classification models trained successfully")

In [None]:
# Train the three regressors and score each one on the regression test split
print("🔨 Building Regression Models...")

regressors = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression(),
    'SVR': SVR()
}

# model name -> fitted estimator, its test metrics, and its test predictions
reg_results = {}

for name, regressor in regressors.items():
    print(f"\nTraining {name}...")

    # The random forest is fitted on the raw features; the other models use the
    # standardized copies
    if name == 'Random Forest':
        train_features, test_features = X_train_reg, X_test_reg
    else:
        train_features, test_features = X_train_reg_scaled, X_test_reg_scaled

    regressor.fit(train_features, y_train_reg)
    y_pred = regressor.predict(test_features)

    # Error metrics on the held-out partition; RMSE is in the target's own units
    mse = mean_squared_error(y_test_reg, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_reg, y_pred)

    reg_results[name] = dict(
        model=regressor,
        mse=mse,
        rmse=rmse,
        r2=r2,
        predictions=y_pred,
    )

    print(f"RMSE: {rmse:.3f}")
    print(f"R²: {r2:.3f}")

print("\n✅ Regression models trained successfully")

## 4. Model Evaluation <a id="model-evaluation"></a>

In [None]:
# Side-by-side table of the classification metrics, one row per model
print("📊 CLASSIFICATION MODELS COMPARISON")
print("=" * 50)

clf_comparison = pd.DataFrame(
    [
        {
            'Model': model_name,
            'Accuracy': scores['accuracy'],
            'Precision': scores['precision'],
            'Recall': scores['recall'],
            'F1-Score': scores['f1'],
        }
        for model_name, scores in clf_results.items()
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

print(clf_comparison.round(3))

# The winner is the model with the highest weighted F1 (first one on ties)
best_clf_row = clf_comparison['F1-Score'].idxmax()
best_clf_model = clf_comparison.loc[best_clf_row, 'Model']
print(f"\n🏆 Best Classification Model: {best_clf_model}")

In [None]:
# Side-by-side table of the regression metrics, one row per model
print("📊 REGRESSION MODELS COMPARISON")
print("=" * 40)

reg_comparison = pd.DataFrame(
    [
        {
            'Model': model_name,
            'MSE': scores['mse'],
            'RMSE': scores['rmse'],
            'R²': scores['r2'],
        }
        for model_name, scores in reg_results.items()
    ],
    columns=['Model', 'MSE', 'RMSE', 'R²'],
)

print(reg_comparison.round(3))

# The winner is the model with the highest R² (first one on ties)
best_reg_row = reg_comparison['R²'].idxmax()
best_reg_model = reg_comparison.loc[best_reg_row, 'Model']
print(f"\n🏆 Best Regression Model: {best_reg_model}")

In [None]:
# Detailed evaluation of the best classification model
print(f"\n🔍 DETAILED EVALUATION: {best_clf_model} (Classification)")
print("=" * 60)

best_clf = clf_results[best_clf_model]
y_pred_best_clf = best_clf['predictions']

# Class ids produced by categorize_poverty, in the same order as their display names.
# Passing them explicitly keeps the report and the matrix at a fixed 3x3 layout:
# without `labels`, scikit-learn infers the classes from the data, so a test split
# that lacks one class makes classification_report raise (3 target names for
# 2 classes) and shrinks the confusion matrix so the tick labels no longer line up.
class_labels = [0, 1, 2]
target_names = ['Low Poverty', 'Medium Poverty', 'High Poverty']

# Per-class precision/recall/F1; classes with no samples or predictions score 0
print(classification_report(y_test_clf, y_pred_best_clf, labels=class_labels,
                            target_names=target_names, zero_division=0))

# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test_clf, y_pred_best_clf, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.title(f'Confusion Matrix - {best_clf_model}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

In [None]:
# Detailed evaluation of the best regression model
print(f"\n🔍 DETAILED EVALUATION: {best_reg_model} (Regression)")
print("=" * 55)

best_reg = reg_results[best_reg_model]
y_pred_best_reg = best_reg['predictions']

# Two panels side by side: predicted-vs-actual on the left, residuals on the right
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(10, 6))

# Left panel: points on the dashed diagonal are perfect predictions
actual_min, actual_max = y_test_reg.min(), y_test_reg.max()
ax_fit.scatter(y_test_reg, y_pred_best_reg, alpha=0.6)
ax_fit.plot([actual_min, actual_max], [actual_min, actual_max], 'r--', lw=2)
ax_fit.set_xlabel('Actual Poverty Rate (%)')
ax_fit.set_ylabel('Predicted Poverty Rate (%)')
ax_fit.set_title(f'Predictions vs Actual - {best_reg_model}')
ax_fit.grid(True, alpha=0.3)

# Right panel: residual (actual minus predicted) against the predicted value
residuals = y_test_reg - y_pred_best_reg
ax_resid.scatter(y_pred_best_reg, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Poverty Rate (%)')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residuals Plot')
ax_resid.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Headline numbers; MAE is computed here from the residuals
mean_abs_error = np.mean(np.abs(residuals))
print("Model Performance:")
print(f"  RMSE: {best_reg['rmse']:.3f}")
print(f"  R²: {best_reg['r2']:.3f}")
print(f"  Mean Absolute Error: {mean_abs_error:.3f}")

## 5. Feature Importance Analysis <a id="feature-importance"></a>

In [None]:
# Feature importance for the Random Forest models (the shared table/plot logic lives in helpers)
def _rank_importances(forest):
    """Pair each modelling feature with the forest's importance score, highest first."""
    return pd.DataFrame({
        'feature': available_features,
        'importance': forest.feature_importances_
    }).sort_values('importance', ascending=False)


def _plot_importances(ranking, palette, title):
    """Draw a horizontal bar chart of a ranked importance table."""
    plt.figure(figsize=(10, 6))
    sns.barplot(data=ranking, y='feature', x='importance', palette=palette)
    plt.title(title)
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.show()


if 'Random Forest' in clf_results:
    rf_clf = clf_results['Random Forest']['model']
    feature_importance_clf = _rank_importances(rf_clf)

    print("🌳 FEATURE IMPORTANCE - CLASSIFICATION (Random Forest)")
    print("=" * 55)
    print(feature_importance_clf)

    _plot_importances(feature_importance_clf, 'viridis', 'Feature Importance - Classification Model')

if 'Random Forest' in reg_results:
    rf_reg = reg_results['Random Forest']['model']
    feature_importance_reg = _rank_importances(rf_reg)

    print("\n🌳 FEATURE IMPORTANCE - REGRESSION (Random Forest)")
    print("=" * 50)
    print(feature_importance_reg)

    _plot_importances(feature_importance_reg, 'plasma', 'Feature Importance - Regression Model')

## 6. Predictions and Insights <a id="predictions"></a>

In [None]:
# Summarize what the best models show
print("🎯 KEY INSIGHTS FROM MACHINE LEARNING MODELS")
print("=" * 55)

# Per-class counts in the test split: ground truth vs. the best classifier's output
poverty_distribution = pd.Series(y_test_clf).value_counts().sort_index()
prediction_distribution = pd.Series(y_pred_best_clf).value_counts().sort_index()


def _format_level_counts(counts):
    """Render 'Low: n, Medium: n, High: n'; a class missing from counts shows as 0."""
    return ", ".join(
        f"{label}: {counts.get(level, 0)}"
        for level, label in enumerate(('Low', 'Medium', 'High'))
    )


print("📊 Poverty Level Distribution:")
print(f"   Actual - {_format_level_counts(poverty_distribution)}")
print(f"   Predicted - {_format_level_counts(prediction_distribution)}")

# Headline metrics of the selected models
print("\n🎯 Model Performance:")
print(f"   Classification Accuracy: {best_clf['accuracy']:.1%}")
print(f"   Regression R²: {best_reg['r2']:.3f}")
print(f"   Average Prediction Error: ±{best_reg['rmse']:.1f}%")

# Three highest-ranked features of the random-forest classifier, if it was trained
if 'Random Forest' in clf_results:
    top_features = feature_importance_clf.head(3)
    print("\n🔑 Top 3 Most Important Features:")
    ranked_pairs = zip(top_features['feature'], top_features['importance'])
    for rank, (feature_name, importance) in enumerate(ranked_pairs, start=1):
        print(f"   {rank}. {feature_name}: {importance:.3f}")

In [None]:
# Illustrative what-if scenarios (rule-based; no trained model is invoked here)
print("\n🔮 POVERTY PREDICTION SCENARIOS")
print("=" * 40)

# Hand-picked example areas described by a subset of the feature names
scenarios = {
    'Low Risk Area': {
        'Tingkat Pengangguran (%)': 5.0,
        'infrastructure_score': 4,
        'unemployment_poverty_ratio': 0.5
    },
    'Medium Risk Area': {
        'Tingkat Pengangguran (%)': 12.0,
        'infrastructure_score': 2,
        'unemployment_poverty_ratio': 0.8
    },
    'High Risk Area': {
        'Tingkat Pengangguran (%)': 20.0,
        'infrastructure_score': 1,
        'unemployment_poverty_ratio': 1.2
    }
}

for scenario_name, scenario_data in scenarios.items():
    # The risk label is decided by fixed thresholds on the ratio alone:
    # above 1.0 is HIGH, above 0.7 is MEDIUM, anything else is LOW
    ratio = scenario_data['unemployment_poverty_ratio']
    if ratio > 1.0:
        risk_label = 'HIGH'
    elif ratio > 0.7:
        risk_label = 'MEDIUM'
    else:
        risk_label = 'LOW'

    print(f"\n{scenario_name}:")
    print(f"  Unemployment Rate: {scenario_data['Tingkat Pengangguran (%)']}%")
    print(f"  Infrastructure Score: {scenario_data['infrastructure_score']}/5")
    print(f"  Risk Level: Based on feature analysis, this area shows {risk_label} poverty risk")

## 7. Model Deployment Preparation <a id="deployment"></a>

In [None]:
# Collect the selected models and their metadata for deployment
# NOTE(review): this cell only builds in-memory dictionaries; no joblib.dump or
# json.dump call appears here, so nothing is written to disk by this cell.
import joblib
import json

print("💾 Preparing models for deployment...")

# Result entries of the two winning models
chosen_clf = clf_results[best_clf_model]
chosen_reg = reg_results[best_reg_model]

# Fitted estimators plus the preprocessing objects that go with them
model_artifacts = {
    'best_classifier': {
        'model_name': best_clf_model,
        'model': chosen_clf['model'],
        'accuracy': chosen_clf['accuracy'],
        'features': available_features
    },
    'best_regressor': {
        'model_name': best_reg_model,
        'model': chosen_reg['model'],
        'r2_score': chosen_reg['r2'],
        'rmse': chosen_reg['rmse'],
        'features': available_features
    },
    'scaler': scaler,
    'label_encoders': label_encoders
}

# Random-forest classifier importances as a list of records, or empty if it was not trained
if 'Random Forest' in clf_results:
    importance_records = feature_importance_clf.to_dict('records')
else:
    importance_records = []

# Plain-Python summary; metrics are cast to float so they are built-in numbers
deployment_summary = {
    'model_info': {
        'classification_model': best_clf_model,
        'regression_model': best_reg_model,
        'features_used': available_features,
        'training_date': pd.Timestamp.now().isoformat(),
        'data_shape': list(df_ml.shape)
    },
    'performance': {
        'classification_accuracy': float(chosen_clf['accuracy']),
        'regression_r2': float(chosen_reg['r2']),
        'regression_rmse': float(chosen_reg['rmse'])
    },
    'feature_importance': importance_records
}

print("✅ Model artifacts prepared for deployment")
print(f"   Classification Model: {best_clf_model} (Accuracy: {chosen_clf['accuracy']:.1%})")
print(f"   Regression Model: {best_reg_model} (R²: {chosen_reg['r2']:.3f})")
print(f"   Features: {len(available_features)} features")
print("   Ready for Spark MLlib integration!")

In [None]:
# Print a copy-paste snippet showing how a saved model would be used
print("\n🚀 MODEL USAGE EXAMPLE")
print("=" * 30)

# The snippet is display text only; it is not executed by this notebook
usage_example = """# Example code for using the trained model in production:

import joblib
import pandas as pd

# Load the saved model
model = joblib.load('poverty_prediction_model.pkl')
scaler = joblib.load('feature_scaler.pkl')

# Prepare new data
new_data = pd.DataFrame({
    'Tingkat Pengangguran (%)': [15.0],
    'infrastructure_score': [2],
    # ... other features
})

# Scale features
new_data_scaled = scaler.transform(new_data)

# Make prediction
prediction = model.predict(new_data_scaled)
probability = model.predict_proba(new_data_scaled)

print(f'Predicted poverty level: {prediction[0]}')
print(f'Prediction confidence: {max(probability[0]):.2f}')
"""
print(usage_example)

# Closing status lines
for closing_line in (
    "\n✅ Machine Learning pipeline completed successfully!",
    "📊 Models are ready for integration with the big data pipeline.",
):
    print(closing_line)