# Poverty Mapping in Sumatra - Data Exploration

**Project**: Big Data Pipeline for Poverty Mapping in Sumatra  
**Team**: Kelompok 18  
**Objective**: Exploratory Data Analysis of poverty data across Sumatra provinces

## Table of Contents
1. [Data Loading and Setup](#data-loading)
2. [Data Quality Assessment](#data-quality)
3. [Descriptive Statistics](#descriptive-stats)
4. [Geographic Analysis](#geographic-analysis)
5. [Poverty Indicators Analysis](#poverty-indicators)
6. [Correlation Analysis](#correlation-analysis)
7. [Advanced Data Visualization](#visualization)
8. [Key Insights and Recommendations](#insights)

## 1. Data Loading and Setup {#data-loading}

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*".
# Older releases only know the un-prefixed name, so pick whichever is
# registered instead of failing on the first cell.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

print("📊 Libraries imported successfully!")

In [None]:
# Read the poverty profile CSV into `df`, the DataFrame every later cell uses
data_path = '/data/Profil_Kemiskinan_Sumatera.csv'
df = pd.read_csv(data_path)

# Table dimensions and in-memory footprint (deep=True also measures the
# contents of object columns), converted from bytes to MB
row_count, column_count = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print(f"📈 Dataset loaded successfully!")
print(f"Shape: {row_count} rows, {column_count} columns")
print(f"Memory usage: {memory_mb:.2f} MB")

In [None]:
# Print a header, then preview the first ten rows. The preview stays as the
# cell's final expression so the notebook displays it as the cell output.
print("📋 Dataset Overview:", "=" * 50, sep="\n")
df.head(10)

In [None]:
# Print the DataFrame.info() summary followed by the raw list of column names
print("🔍 Data Types and Info:", "=" * 30, sep="\n")
df.info()
print("\n📊 Column Names:")
print(df.columns.tolist())

## 2. Data Quality Assessment {#data-quality}

In [None]:
# Check for missing values: per-column null count and share of all rows
print("🔍 Missing Values Analysis:")
print("=" * 40)
missing_data = df.isnull().sum()
missing_percentage = (missing_data / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Percentage': missing_percentage
})

# Keep only the columns that actually have gaps, worst first
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)

# Report either the table or the all-clear message, never both: printing the
# filtered frame unconditionally dumped an "Empty DataFrame" block right
# before the success line.
if missing_df.empty:
    print("✅ No missing values found!")
else:
    print(missing_df)

In [None]:
# Count rows flagged by DataFrame.duplicated() (repeats of an earlier row)
duplicate_count = df.duplicated().sum()
print(f"🔄 Duplicate Records: {duplicate_count}")

if duplicate_count == 0:
    print("✅ No duplicate records found!")
else:
    duplicate_share = (duplicate_count/len(df))*100
    print(f"Percentage of duplicates: {duplicate_share:.2f}%")

In [None]:
# Profile the object-dtype columns: number of distinct values, plus the
# values themselves when there are at most ten of them
categorical_cols = df.select_dtypes(include=['object']).columns
print("📊 Unique Values in Categorical Columns:")
print("=" * 50)

for column_name in categorical_cols:
    column = df[column_name]
    distinct_total = column.nunique()
    print(f"{column_name}: {distinct_total} unique values")
    if distinct_total <= 10:
        print(f"  Values: {list(column.unique())}")
    print()

## 3. Descriptive Statistics {#descriptive-stats}

In [None]:
# Summary statistics for every numeric column. `numerical_cols` is kept as a
# notebook-level name because the correlation cell reads it as well.
numerical_cols = df.select_dtypes(include='number').columns
print("📊 Descriptive Statistics:", "=" * 30, sep="\n")
df[numerical_cols].describe()

In [None]:
# Print mean / median / min / max / std for each headline indicator that is
# present in the dataset; indicators missing from `df` are skipped silently
poverty_indicators = ['Persentase Kemiskinan (%)', 'Tingkat Pengangguran (%)', 'Jumlah Penduduk (jiwa)']

print("🎯 Key Poverty Indicators Summary:")
print("=" * 40)

for indicator in poverty_indicators:
    if indicator not in df.columns:
        continue
    values = df[indicator]
    print(f"\n{indicator}:")
    summary_rows = (
        ("Mean", values.mean()),
        ("Median", values.median()),
        ("Min", values.min()),
        ("Max", values.max()),
        ("Std", values.std()),
    )
    for stat_label, stat_value in summary_rows:
        print(f"  {stat_label}: {stat_value:.2f}")

## 4. Geographic Analysis {#geographic-analysis}

In [None]:
# Province-level analysis: mean/min/max of the two rate indicators, total
# population, and the most frequent poverty category for each province
rate_summaries = ['mean', 'min', 'max']
province_stats = (
    df.groupby('Provinsi')
    .agg({
        'Persentase Kemiskinan (%)': rate_summaries,
        'Tingkat Pengangguran (%)': rate_summaries,
        'Jumlah Penduduk (jiwa)': 'sum',
        # First entry of Series.mode(); 'unknown' when the mode is empty
        'Kategori Kemiskinan': lambda x: next(iter(x.mode()), 'unknown'),
    })
    .round(2)
)

# Collapse the two-level (column, statistic) header into "column_statistic"
province_stats.columns = ['_'.join(header_parts).strip() for header_parts in province_stats.columns]
print("🗺️ Province-Level Statistics:")
print("=" * 35)
print(province_stats)

In [None]:
# Create province comparison visualization: a 2x2 grid with three horizontal
# bar charts (poverty, unemployment, population) and one pie chart
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Province-wise Poverty Analysis', fontsize=16, fontweight='bold')

by_province = df.groupby('Provinsi')

# Per-province aggregates, each sorted ascending before plotting
province_poverty = by_province['Persentase Kemiskinan (%)'].mean().sort_values(ascending=True)
province_unemployment = by_province['Tingkat Pengangguran (%)'].mean().sort_values(ascending=True)
province_population = by_province['Jumlah Penduduk (jiwa)'].sum().sort_values(ascending=True)

# (axes, categories, bar lengths, colour, title, x-label) for each bar panel;
# population is divided by 1000 to match its "Thousands" axis label
bar_panels = [
    (axes[0, 0], province_poverty.index, province_poverty.values, 'coral',
     'Average Poverty Rate by Province', 'Poverty Rate (%)'),
    (axes[0, 1], province_unemployment.index, province_unemployment.values, 'lightblue',
     'Average Unemployment Rate by Province', 'Unemployment Rate (%)'),
    (axes[1, 0], province_population.index, province_population.values/1000, 'lightgreen',
     'Total Population by Province', 'Population (Thousands)'),
]
for panel, provinces, bar_lengths, bar_color, panel_title, x_label in bar_panels:
    panel.barh(provinces, bar_lengths, color=bar_color)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)

# Share of rows falling into each poverty category
poverty_dist = df['Kategori Kemiskinan'].value_counts()
category_panel = axes[1, 1]
category_panel.pie(poverty_dist.values, labels=poverty_dist.index, autopct='%1.1f%%', startangle=90)
category_panel.set_title('Poverty Category Distribution')

plt.tight_layout()
plt.show()

## 5. Poverty Indicators Analysis {#poverty-indicators}

In [None]:
# Infrastructure access analysis: one pie chart per access column showing how
# the rows split across that column's values
infrastructure_cols = ['Akses Pendidikan', 'Fasilitas Kesehatan', 'Akses Air Bersih']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Infrastructure Access Analysis', fontsize=16, fontweight='bold')

for panel, col in zip(axes, infrastructure_cols):
    if col not in df.columns:
        continue  # the panel is left empty when the column is absent
    access_counts = df[col].value_counts()
    panel.pie(access_counts.values, labels=access_counts.index, autopct='%1.1f%%', startangle=90)
    panel.set_title(f'{col} Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Aggregate by 'Golongan Pengeluaran' (reported as "income group" in the
# output): mean poverty rate, mean unemployment rate and total population
income_aggregations = {
    'Persentase Kemiskinan (%)': 'mean',
    'Tingkat Pengangguran (%)': 'mean',
    'Jumlah Penduduk (jiwa)': 'sum',
}
income_analysis = df.groupby('Golongan Pengeluaran').agg(income_aggregations).round(2)

print("💰 Income Group Analysis:", "=" * 30, sep="\n")
print(income_analysis)

In [None]:
# Visualize poverty rate against unemployment rate, one scatter series (and
# legend entry) per poverty category
plt.figure(figsize=(12, 8))

# dropna() keeps NaN out of the category list: `== NaN` never matches any row,
# so a missing category would only add an empty series and a "nan" legend entry.
for category in df['Kategori Kemiskinan'].dropna().unique():
    subset = df[df['Kategori Kemiskinan'] == category]
    plt.scatter(subset['Persentase Kemiskinan (%)'], subset['Tingkat Pengangguran (%)'],
                label=category, alpha=0.6, s=60)

plt.xlabel('Poverty Rate (%)')
plt.ylabel('Unemployment Rate (%)')
plt.title('Relationship between Poverty and Unemployment by Category')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 6. Correlation Analysis {#correlation-analysis}

In [None]:
# Calculate correlation matrix for numerical variables
corr_matrix = df[numerical_cols].corr()

# Create correlation heatmap. The upper triangle (diagonal included) is masked
# because the matrix is symmetric, so only the lower triangle is drawn.
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', vmin=-1, vmax=1,
            center=0, square=True, fmt='.2f')
plt.title('Correlation Matrix of Numerical Variables')
plt.tight_layout()
plt.show()

print("🔗 Key Correlations:")
print("=" * 20)
# Collect every unordered column pair once (i < j skips the diagonal and the
# mirrored half). Pairs whose coefficient is NaN (e.g. one column is constant)
# are dropped: NaN does not order consistently under sort(), so such pairs
# could otherwise end up in the "top five" and scramble the ranking.
corr_columns = corr_matrix.columns
corr_pairs = [
    (corr_columns[i], corr_columns[j], corr_matrix.iloc[i, j])
    for i in range(len(corr_columns))
    for j in range(i + 1, len(corr_columns))
    if pd.notna(corr_matrix.iloc[i, j])
]

# Strongest relationships first, regardless of sign
corr_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
for var1, var2, corr in corr_pairs[:5]:
    print(f"{var1} ↔ {var2}: {corr:.3f}")

## 7. Advanced Data Visualization {#visualization}

In [None]:
# Interactive Plotly bubble chart (a scatter, not a geographic map): one point
# per province, placed by mean poverty and unemployment rates and sized by
# total population
province_aggregations = {
    'Persentase Kemiskinan (%)': 'mean',
    'Tingkat Pengangguran (%)': 'mean',
    'Jumlah Penduduk (jiwa)': 'sum',
}
province_summary = df.groupby('Provinsi').agg(province_aggregations).reset_index()

# Axis titles shown in place of the raw column names
axis_labels = {
    'Persentase Kemiskinan (%)': 'Average Poverty Rate (%)',
    'Tingkat Pengangguran (%)': 'Average Unemployment Rate (%)',
}

fig = px.scatter(
    province_summary,
    x='Persentase Kemiskinan (%)',
    y='Tingkat Pengangguran (%)',
    size='Jumlah Penduduk (jiwa)',
    color='Provinsi',
    hover_name='Provinsi',
    title='Province Poverty vs Unemployment Analysis',
    labels=axis_labels,
)

fig.update_layout(height=600, showlegend=True)
fig.show()

In [None]:
# Distribution analysis: 2x2 grid of histograms for the key variables
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Distribution Analysis of Key Variables', fontsize=16, fontweight='bold')

# Poverty rate distribution, with the mean marked by a dashed line
poverty_rate = df['Persentase Kemiskinan (%)']
poverty_mean = poverty_rate.mean()
axes[0, 0].hist(poverty_rate, bins=30, alpha=0.7, color='coral', edgecolor='black')
axes[0, 0].axvline(poverty_mean, color='red', linestyle='--', label=f'Mean: {poverty_mean:.1f}')
axes[0, 0].set_title('Poverty Rate Distribution')
axes[0, 0].set_xlabel('Poverty Rate (%)')
axes[0, 0].legend()

# Unemployment rate distribution, with the mean marked by a dashed line
unemployment_rate = df['Tingkat Pengangguran (%)']
unemployment_mean = unemployment_rate.mean()
axes[0, 1].hist(unemployment_rate, bins=30, alpha=0.7, color='lightblue', edgecolor='black')
axes[0, 1].axvline(unemployment_mean, color='blue', linestyle='--', label=f'Mean: {unemployment_mean:.1f}')
axes[0, 1].set_title('Unemployment Rate Distribution')
axes[0, 1].set_xlabel('Unemployment Rate (%)')
axes[0, 1].legend()

# Population distribution (log scale). Only strictly positive values are
# kept: log10(0) is -inf (and log10 of a negative is NaN), and a non-finite
# value makes the histogram's automatic bin range invalid, so hist() raises.
population = df['Jumlah Penduduk (jiwa)']
positive_population = population[population > 0]
axes[1, 0].hist(np.log10(positive_population), bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
axes[1, 0].set_title('Population Distribution (Log Scale)')
axes[1, 0].set_xlabel('Log10(Population)')

# Consumption distribution (optional column); hide the panel rather than
# leaving an empty set of axes when the column is not present
if 'Konsumsi (per kapita per minggu)' in df.columns:
    axes[1, 1].hist(df['Konsumsi (per kapita per minggu)'], bins=30, alpha=0.7, color='gold', edgecolor='black')
    axes[1, 1].set_title('Weekly Consumption Distribution')
    axes[1, 1].set_xlabel('Consumption per capita per week')
else:
    axes[1, 1].set_visible(False)

plt.tight_layout()
plt.show()

## 8. Key Insights and Recommendations {#insights}

In [None]:
# Generate key insights: extreme provinces, dataset-wide averages, and the
# most common value of each infrastructure-access column
print("🎯 KEY INSIGHTS FROM POVERTY MAPPING ANALYSIS")
print("=" * 60)

# Per-province aggregates, computed once and queried for both label and value
mean_poverty_by_province = df.groupby('Provinsi')['Persentase Kemiskinan (%)'].mean()
population_by_province = df.groupby('Provinsi')['Jumlah Penduduk (jiwa)'].sum()

# Province with highest poverty
highest_poverty = mean_poverty_by_province.idxmax()
highest_poverty_rate = mean_poverty_by_province.max()
print(f"🔴 Highest Poverty Province: {highest_poverty} ({highest_poverty_rate:.1f}%)")

# Province with lowest poverty
lowest_poverty = mean_poverty_by_province.idxmin()
lowest_poverty_rate = mean_poverty_by_province.min()
print(f"🟢 Lowest Poverty Province: {lowest_poverty} ({lowest_poverty_rate:.1f}%)")

# Most populous province (population summed over its rows)
most_populous = population_by_province.idxmax()
most_populous_count = population_by_province.max()
print(f"👥 Most Populous Province: {most_populous} ({most_populous_count:,} people)")

# Overall statistics: unweighted means over all rows, and total population
total_population = df['Jumlah Penduduk (jiwa)'].sum()
avg_poverty = df['Persentase Kemiskinan (%)'].mean()
avg_unemployment = df['Tingkat Pengangguran (%)'].mean()

print(f"\n📊 OVERALL STATISTICS:")
print(f"   Total Population: {total_population:,} people")
print(f"   Average Poverty Rate: {avg_poverty:.1f}%")
print(f"   Average Unemployment Rate: {avg_unemployment:.1f}%")

# Infrastructure access summary. `best_access` is the most frequent value of
# the column (its mode), reported with the share of rows that carry it.
print(f"\n🏗️ INFRASTRUCTURE ACCESS:")
for col in ['Akses Pendidikan', 'Fasilitas Kesehatan', 'Akses Air Bersih']:
    if col not in df.columns:
        continue
    column_modes = df[col].mode()
    best_access = 'unknown' if column_modes.empty else column_modes.iloc[0]
    best_access_pct = (df[col] == best_access).mean() * 100
    print(f"   {col}: {best_access} ({best_access_pct:.1f}% of areas)")

In [None]:
# Priority areas for intervention: rows above the 75th percentile on BOTH
# poverty rate and unemployment rate, summarised by province
print("\n🎯 PRIORITY AREAS FOR INTERVENTION:")
print("=" * 40)

# High poverty + high unemployment areas
high_poverty_threshold = df['Persentase Kemiskinan (%)'].quantile(0.75)
high_unemployment_threshold = df['Tingkat Pengangguran (%)'].quantile(0.75)

priority_areas = df[
    (df['Persentase Kemiskinan (%)'] > high_poverty_threshold) &
    (df['Tingkat Pengangguran (%)'] > high_unemployment_threshold)
]

if not priority_areas.empty:
    # Provinces ranked by how many of their rows are flagged
    priority_provinces = priority_areas['Provinsi'].value_counts().head(3)
    print("Top 3 provinces needing immediate intervention:")
    for rank, (province, count) in enumerate(priority_provinces.items(), 1):
        # Filter once per province. The means get their own names so this
        # loop no longer overwrites the dataset-wide `avg_poverty` and
        # `avg_unemployment` values set by the key-insights cell.
        province_rows = priority_areas[priority_areas['Provinsi'] == province]
        priority_poverty_mean = province_rows['Persentase Kemiskinan (%)'].mean()
        priority_unemployment_mean = province_rows['Tingkat Pengangguran (%)'].mean()
        print(f"   {rank}. {province}: {count} areas (Poverty: {priority_poverty_mean:.1f}%, Unemployment: {priority_unemployment_mean:.1f}%)")
else:
    print("No critical areas found based on current thresholds.")

In [None]:
# Data quality summary for pipeline: record counts and category coverage
print("\n📋 DATA QUALITY SUMMARY FOR PIPELINE:")
print("=" * 45)

total_records = len(df)
# dropna() copies the surviving rows, so evaluate it once and reuse the count
complete_records = df.dropna().shape[0]
# An empty dataset would otherwise raise ZeroDivisionError here
complete_share = (complete_records / total_records) * 100 if total_records else 0.0

print(f"✅ Total records: {total_records:,}")
print(f"✅ Complete cases: {complete_records:,} ({complete_share:.1f}%)")
print(f"✅ Provinces covered: {df['Provinsi'].nunique()}")
print(f"✅ Commodities covered: {df['Komoditas'].nunique()}")
print(f"✅ Income groups: {df['Golongan Pengeluaran'].nunique()}")
print(f"\n🚀 Dataset is ready for ML pipeline processing!")