# EDA for GDP Forecasting Dataset (Extended)

## Imports and Configuration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Path of the training parquet file and the name of the target column.
TRAIN_FILE = "gdp_train.parquet"
TARGET = "NY.GDP.MKTP.CD"

# Trade-related column names used by the analysis.
TRADE_FEATURES = ['NE.EXP.GNFS.CD', 'NE.IMP.GNFS.CD', 'Trade_balance', 'Exports_to_Imports_ratio']

# Every column of the training file whose name contains the substring 'lag'.
# NOTE(review): this loads the entire parquet file only to read its column
# names, and the match is a plain substring test, so any column name that
# merely contains 'lag' is picked up as well.
LAG_FEATURES = list(
    filter(lambda name: 'lag' in name, pd.read_parquet(TRAIN_FILE).columns)
)

## Load Train Data

In [None]:
# Read the training parquet file into a DataFrame.
df = pd.read_parquet(path=TRAIN_FILE)

# Report the (rows, columns) shape.
print(f"Train data shape: {df.shape}")

# Last expression of the cell: shows the first five rows.
df.head(n=5)

In [None]:
# Order rows by country, then by date within each country.
df = df.sort_values(by=['country', 'date'])

## Perform Analysis on Train Data

### GDP DISTRIBUTION

In [None]:
# Histogram of the target column (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df[TARGET], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of GDP")
ax.set_xlabel("GDP")
ax.set_ylabel("Count")
plt.show()

### GDP over time by country


In [None]:
# One line per country: target value plotted against date.
fig, ax = plt.subplots(figsize=(12, 6))
for name, rows in df.groupby("country"):
    ax.plot(rows["date"], rows[TARGET], label=name, alpha=0.6)
ax.set_title("GDP over time by country")
ax.set_xlabel("Year")
ax.set_ylabel("GDP")
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
plt.show()

### CALCULATE GDP GROWTH RATE

In [None]:
# Percentage change of the target from one row to the next within each
# country, stored as a new 'GDP_growth' column. The result is only meaningful
# when rows are ordered by date within each country.
#
# fill_method=None is passed explicitly: the legacy default forward-fills
# missing target values before differencing, which turns a gap into a
# spurious 0% growth (and is deprecated in recent pandas). With None, a
# missing value yields NaN growth instead.
df['GDP_growth'] = (
    df.groupby('country')[TARGET].pct_change(fill_method=None) * 100
)

# One line per country: growth rate plotted against date.
plt.figure(figsize=(12,6))
for country, group in df.groupby('country'):
    plt.plot(group["date"], group["GDP_growth"], label=country, alpha=0.6)
plt.title("GDP Growth Rate (%) by Country")
plt.xlabel("Year")
plt.ylabel("Growth Rate (%)")
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
plt.show()

### LAGGED GDP CORRELATION ANALYSIS


In [None]:
# Correlation of each lag column with the target, computed over all rows of
# df at once (not per country).
print("Correlation of lagged GDP features with target GDP:")
target_series = df[TARGET]
for lag_col in LAG_FEATURES:
    # Skip names that are not present in the loaded frame.
    if lag_col not in df.columns:
        continue
    corr = df[lag_col].corr(target_series)
    print(f"{lag_col}: correlation = {corr:.3f}")

### TRADE FEATURES OVER TIME


In [None]:
# One figure per trade feature, with one line per country plotted against date.
for feat in TRADE_FEATURES:
    # Skip features that are not present in the loaded frame.
    if feat not in df.columns:
        continue
    fig, ax = plt.subplots(figsize=(12, 6))
    for name, rows in df.groupby('country'):
        ax.plot(rows['date'], rows[feat], label=name, alpha=0.6)
    ax.set_title(f"{feat} over time by country")
    ax.set_xlabel("Year")
    ax.set_ylabel(feat)
    # Anchor the legend outside the axes, to the right of the plot area.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
    plt.show()

### FEATURE CORRELATION WITH TARGET


In [None]:
# All numeric columns except the target. Any derived numeric column already
# added to df (e.g. 'GDP_growth') is included as well.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols.remove(TARGET)

# Pairwise correlations between the numeric features and the target.
corr_matrix = df[numeric_cols + [TARGET]].corr()

# Correlation of each feature with the target, strongest positive first.
# - The target's own entry (always 1.0) is dropped so it does not occupy a
#   "top feature" slot.
# - NaN correlations (e.g. constant columns) are dropped because sort_values
#   places them last, where they would otherwise be reported as the most
#   negatively correlated features.
target_corr = (
    corr_matrix[TARGET]
    .drop(labels=TARGET)
    .dropna()
    .sort_values(ascending=False)
)
top_pos_corr = target_corr.head(10)
# The ten lowest correlations; these are not guaranteed to be negative.
top_neg_corr = target_corr.tail(10)
print("Top positive correlated features with GDP:")
print(top_pos_corr)
print("\nTop negative correlated features with GDP:")
print(top_neg_corr)

### HEATMAP OF TOP CORRELATED FEATURES


In [None]:
# Union of the highest- and lowest-correlated feature names, in order of first
# appearance. Repeated names (the two selections can overlap when there are
# few features) and the target itself are removed, so that the target appears
# exactly once in the heatmap below and no column is duplicated.
top_features = [
    name
    for name in dict.fromkeys([*top_pos_corr.index, *top_neg_corr.index])
    if name != TARGET
]
plt.figure(figsize=(10,8))
# Correlation matrix of the selected features plus the target, annotated with
# two-decimal values.
sns.heatmap(df[top_features + [TARGET]].corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap of Top Features with GDP")
plt.show()

### SCATTER PLOTS OF TOP FEATURES VS GDP

In [None]:
# One scatter plot per selected feature: feature on x, target on y.
target_values = df[TARGET]
for col in top_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x=df[col], y=target_values, ax=ax)
    ax.set_title(f"{col} vs {TARGET}")
    ax.set_xlabel(col)
    ax.set_ylabel(TARGET)
    plt.show()

### MISSING VALUES HEATMAP

In [None]:
# Heatmap of the boolean null mask of df (rows x columns), without a colour bar.
fig, ax = plt.subplots(figsize=(12, 6))
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

### SKEWNESS OF NUMERIC FEATURES

In [None]:
# Skewness of every column listed in numeric_cols, largest first.
skewness = df.loc[:, numeric_cols].skew().sort_values(ascending=False)

# Show the ten largest values.
print("Top skewed numeric features:")
print(skewness.iloc[:10])