# In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the country-level and organization-level inflation tables
# (paths are relative to the current working directory).
df_countries = pd.read_csv('global_inflation_countries.csv')
df_organizations = pd.read_csv('global_inflation_organizations.csv')

# Left-join the organization table onto the country table by country_code.
# Every country row is kept; columns present in both tables keep their plain
# name for the country side and get an '_org' suffix for the organization side.
merged_df = pd.merge(
    left=df_countries,
    right=df_organizations,
    how='left',
    on='country_code',
    suffixes=('', '_org'),
)

# Remove the redundant organization-side copies, and collapse any leftover
# '<name>_x' / '<name>_y' pair into a single '<name>' column (keeping the _x
# values). The column list is snapshotted up front because merged_df is
# rebound inside the loop.
for column in tuple(merged_df.columns):
    if column.endswith('_org'):
        merged_df = merged_df.drop(columns=[column])
        continue
    if not column.endswith(('_x', '_y')):
        continue
    stem = column[:-2]
    x_name, y_name = f"{stem}_x", f"{stem}_y"
    # Only act while both halves of the pair are still present; once the pair
    # has been collapsed, its second member is skipped here.
    if x_name in merged_df and y_name in merged_df:
        merged_df[stem] = merged_df[x_name]
        merged_df = merged_df.drop(columns=[x_name, y_name])

# Convert year to datetime; values that cannot be parsed as a 4-digit year
# become NaT instead of raising (errors='coerce').
merged_df['year'] = pd.to_datetime(merged_df['year'], format='%Y', errors='coerce')

# Order rows chronologically within each country BEFORE deriving any lagged or
# rolling feature. shift() and rolling() work on row position inside each
# group, so on unsorted data "previous row" would not mean "previous year".
merged_df = merged_df.sort_values(by=["country_code", "year"])

# Add previous year inflation (first row of every country has no predecessor
# and is therefore NaN).
# NOTE(review): this is the previous *row* per country; it equals the previous
# calendar year only if each country has one row per consecutive year — confirm.
merged_df['previous_inflation'] = merged_df.groupby('country_code')['inflation_rate'].shift(1)

# Compute 3-year rolling average of inflation_rate per country. The value is
# NaN until three observations are available in the window (min_periods=3).
merged_df['avg_inflation_3yr'] = (
    merged_df.groupby('country_code')['inflation_rate']
    .transform(lambda x: x.rolling(window=3, min_periods=3).mean())
)

# Persist the complete merged table (row index is not written).
merged_df.to_csv('merged_inflation_data.csv', index=False)

# Keep only the rows for which a 3-year average could be computed, then
# persist that subset as the processed dataset.
has_rolling_average = merged_df['avg_inflation_3yr'].notna()
processed_df = merged_df[has_rolling_average]
processed_df.to_csv('processed_inflation_data.csv', index=False)

# The regression target is the 3-year rolling average.
target = processed_df['avg_inflation_3yr']

# Build the feature matrix: remove the target, the raw inflation rate it is
# derived from, and the country_name / year columns (columns that are absent
# are silently skipped). Then tighten object dtypes, replace missing values
# with 0 and one-hot encode the remaining non-numeric columns.
features = pd.get_dummies(
    processed_df
    .drop(
        columns=['inflation_rate', 'avg_inflation_3yr', 'country_name', 'year'],
        errors='ignore',
    )
    .infer_objects(copy=False)
    .fillna(0)
)

# Re-attach the target as the last column. The raw values are used so the
# assignment is positional and does not depend on index alignment.
full_data = features.assign(avg_inflation_3yr=target.values)

# Hold out 30% of the rows for testing, then carve 20% of the remainder off
# as a validation set. Both splits use the same fixed seed for reproducibility.
train_data, test_data = train_test_split(full_data, random_state=1, test_size=0.3)
train_data, val_data = train_test_split(train_data, random_state=1, test_size=0.2)

# Write each partition (features + target) to its own CSV file.
for partition, filename in (
    (train_data, 'Train.csv'),
    (val_data, 'Validation.csv'),
    (test_data, 'Test.csv'),
):
    partition.to_csv(filename, index=False)
