In [5]:
import os 

# os.listdir() returns entries in arbitrary order; sort them so the displayed
# listing is stable from run to run.
sorted(os.listdir("inflation_excel"))

['Armenia.xlsx',
 'Belarus.xlsx',
 'Estonia.xlsx',
 'Georgia.xlsx',
 'Kazakhstan.xlsx',
 'Kyrgyzstan.xlsx',
 'Moldova.xlsx',
 'Ukraine.xlsx',
 'Uzbekistan.xlsx']

In [4]:
import pandas as pd
# Convert every Excel workbook in inflation_excel/ into a CSV in inflation_csv/.
import os  # imported here so this cell also runs on a fresh kernel

EXCEL_DIR = 'inflation_excel'
CSV_DIR = 'inflation_csv'

# Discover the workbooks instead of hard-coding their names, so the list cannot
# drift out of sync with the folder. Sorted for a deterministic order; Excel's
# '~$' lock files are skipped because they are not readable workbooks.
excel_files = sorted(
    entry for entry in os.listdir(EXCEL_DIR)
    if entry.lower().endswith('.xlsx') and not entry.startswith('~$')
)

# `excels` keeps its original shape (a list holding one list of file names)
# in case another cell still reads it.
excels = [excel_files]

os.makedirs(CSV_DIR, exist_ok=True)
for file in excel_files:
    # splitext removes only the final extension, so a file name that itself
    # contains dots is not truncated at the first one.
    stem = os.path.splitext(file)[0]
    df = pd.read_excel(os.path.join(EXCEL_DIR, file))
    df.to_csv(os.path.join(CSV_DIR, stem + '.csv'), index=False)

In [6]:
# Sorted because os.listdir() makes no promise about ordering; this keeps the
# displayed listing stable.
sorted(os.listdir('inflation_csv'))

['Armenia.csv',
 'Belarus.csv',
 'Estonia.csv',
 'Georgia.csv',
 'Kazakhstan.csv',
 'Kyrgyzstan.csv',
 'Moldova.csv',
 'Ukraine.csv',
 'Uzbekistan.csv']

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load and prepare data for each country
countries = []  # One DataFrame per country, each sorted by date
country_names = []  # Country names, in the same order as `countries`

# Every '<country>.csv' file in inflation_csv/ holds one country's series.
# The listing is sorted so the country order is reproducible (os.listdir order
# is arbitrary), and anything that is not a .csv file (for example a
# .ipynb_checkpoints folder) is skipped instead of being handed to read_csv.
for country_file in sorted(os.listdir('inflation_csv')):
    # splitext strips only the final extension, so dots inside a name survive.
    country_name, extension = os.path.splitext(country_file)
    if extension.lower() != '.csv':
        continue
    country_names.append(country_name)

    # Load data
    df = pd.read_csv(os.path.join('inflation_csv', country_file))

    # Convert date to datetime
    df['Date'] = pd.to_datetime(df['Date'])

    # Sort by date
    df = df.sort_values('Date').reset_index(drop=True)

    # Add country name
    df['Country'] = country_name

    countries.append(df)

# Step 2: Feature engineering for time series
def create_features(df, target_col='CPI', max_lag=12, windows=(3, 6, 12)):
    """Build calendar, lag, rolling and percentage-change features for one series.

    Every feature derived from ``target_col`` uses only values from rows
    strictly before the current one. The rolling statistics and percentage
    changes are therefore computed on the series shifted by one step; computing
    them on the unshifted series would embed the current row's target value in
    its own features (together with the lag columns, a 3-row rolling mean is
    enough to reconstruct the target exactly).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``'Date'`` column and ``target_col``. Rows are
        expected to be in chronological order; the input is not modified.
    target_col : str, default 'CPI'
        Column the lag/rolling/percentage-change features are derived from.
        It is also used as the prefix of the generated column names.
    max_lag : int, default 12
        Lag columns ``<target>_lag_1`` .. ``<target>_lag_<max_lag>`` are created.
    windows : iterable of int, default (3, 6, 12)
        Window lengths for the rolling mean and rolling standard deviation.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the added columns ``Year``, ``Month``, ``Quarter``,
        ``<target>_lag_<k>``, ``<target>_rolling_mean_<w>``,
        ``<target>_rolling_std_<w>``, ``<target>_YoY`` and ``<target>_MoM``.
        Rows containing a NaN in any column are dropped, which removes the
        warm-up rows at the start of the series.
    """
    # Create copy of dataframe so the caller's frame is left untouched
    df_featured = df.copy()

    # Extract date features
    df_featured['Year'] = df_featured['Date'].dt.year
    df_featured['Month'] = df_featured['Date'].dt.month
    df_featured['Quarter'] = df_featured['Date'].dt.quarter

    target = df_featured[target_col]
    # The series as it is known *before* the current row is observed.
    previous = target.shift(1)

    # Create lagged features (previous rows' values)
    for lag in range(1, max_lag + 1):
        df_featured[f'{target_col}_lag_{lag}'] = target.shift(lag)

    # Rolling mean over the `window` rows preceding the current one
    for window in windows:
        df_featured[f'{target_col}_rolling_mean_{window}'] = previous.rolling(window=window).mean()

    # Rolling standard deviation (volatility) over the same preceding rows
    for window in windows:
        df_featured[f'{target_col}_rolling_std_{window}'] = previous.rolling(window=window).std()

    # Year-over-Year percentage change, as of the previous row
    df_featured[f'{target_col}_YoY'] = previous.pct_change(12) * 100

    # Month-over-Month percentage change, as of the previous row
    df_featured[f'{target_col}_MoM'] = previous.pct_change() * 100

    # Drop rows with NaN values (from the lag / rolling warm-up period)
    df_featured = df_featured.dropna()

    return df_featured

# Run the feature pipeline on every country, preserving the order of `countries`
countries_featured = [create_features(country_df) for country_df in countries]

In [24]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the package is importable by the cells below once the install finishes.
# Let this cell run to completion before continuing.
%pip install xgboost

^C


In [None]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

# Step 3: Train-test split with time series consideration
def prepare_data_for_modeling(df, target_col='CPI', test_size=0.2):
    """Split one country's frame chronologically and standardise its features.

    The frame is ordered by 'Date'; the leading ``1 - test_size`` share of rows
    becomes the training set and the remaining rows the test set. Every column
    except 'Date', ``target_col`` and 'Country' is treated as a feature. The
    scaler is fitted on the training rows only and then applied to the test rows.

    Returns a 7-tuple:
        (scaled train features, scaled test features, train target,
         test target, test dates, fitted StandardScaler, feature column names)
    """
    chronological = df.sort_values('Date')

    # Position of the first test row
    cutoff = int(len(chronological) * (1 - test_size))
    train_part = chronological.iloc[:cutoff]
    test_part = chronological.iloc[cutoff:]

    # Everything that is not the date, the target or the country label
    non_features = ('Date', target_col, 'Country')
    feature_cols = [name for name in chronological.columns if name not in non_features]

    # Fit on train only, then reuse the same statistics for the test rows
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(train_part[feature_cols])
    X_test_scaled = scaler.transform(test_part[feature_cols])

    return (
        X_train_scaled,
        X_test_scaled,
        train_part[target_col],
        test_part[target_col],
        test_part['Date'],
        scaler,
        feature_cols,
    )

# Define models to test
def get_models():
    """Return a dict mapping display name to a new, unfitted candidate regressor.

    A fresh set of estimators is built on every call. The insertion order of
    the dict is the order in which the candidates are listed below.
    """
    candidates = [
        ('Linear Regression', LinearRegression()),
        ('ElasticNet', ElasticNet(alpha=0.5, l1_ratio=0.5)),
        ('SVR', SVR(kernel='rbf', C=100, gamma='auto')),
        ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
        ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
        ('XGBoost', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ]
    return dict(candidates)

# Step 4: Evaluate models for each country
results = {}             # country -> {model name -> {'MSE', 'RMSE', 'MAE', 'R2'}}
best_models = {}         # country -> fitted estimator with the lowest test RMSE
feature_importance = {}  # country -> {feature -> importance}, tree-based winners only

# country_names and countries_featured are parallel lists, so walk them together
for country, df in zip(country_names, countries_featured):
    print(f"\nModeling for {country}:")

    # Chronological split plus scaling for this country
    X_train, X_test, y_train, y_test, test_dates, scaler, feature_cols = prepare_data_for_modeling(df)

    # Fresh, unfitted candidates for every country
    models = get_models()
    country_results = {}

    for name, model in models.items():
        # Fit on the training window, score on the held-out tail
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        country_results[name] = dict(MSE=mse, RMSE=rmse, MAE=mae, R2=r2)

        print(f"{name} - RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

    results[country] = country_results

    # The winner is the candidate with the smallest test RMSE
    best_model_name = min(country_results, key=lambda model_name: country_results[model_name]['RMSE'])
    best_models[country] = models[best_model_name]

    # Only the tree-based candidates are asked for feature importances
    is_tree_based = best_model_name in ('Random Forest', 'Gradient Boosting', 'XGBoost')
    if is_tree_based and hasattr(best_models[country], 'feature_importances_'):
        importances = best_models[country].feature_importances_
        feature_importance[country] = dict(zip(feature_cols, importances))


ModuleNotFoundError: No module named 'xgboost'