In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings('ignore')


# Load the enriched account dataset from the working directory.
df = pd.read_csv('enriched_account_information.csv')

# Parse the known date columns that are actually present; values that
# cannot be parsed become NaT (errors='coerce') instead of raising.
date_columns = ['open_date', 'card_activation_date', 'date_in_collection']
for date_col in date_columns:
    if date_col not in df.columns:
        continue
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Columns whose name contains a 2023Q / 2024Q / 2025Q tag ...
quarter_cols = []
for column_name in df.columns:
    if '2023Q' in column_name or '2024Q' in column_name or '2025Q' in column_name:
        quarter_cols.append(column_name)

# ... and, among those, the ones that mention quarterly sales totals.
spending_cols = list(filter(lambda column_name: 'SALE_total_spending' in column_name, quarter_cols))

# Column the models are trained to predict.
target_col = '2024Q4_SALE_total_spending'

# Fail fast when the dataset does not contain the target.
target_is_present = target_col in df.columns
if not target_is_present:
    raise ValueError(f"Target column {target_col} not found in dataset")


def create_features(df, target_col):
    """Build the model matrix, the target vector and the per-family feature frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Account-level data. Must contain ``current_account_nbr``,
        ``customer_lifecycle_stage`` and every column listed in the
        ``feature_sets`` literal below; quarterly spending columns are expected
        to be named ``<YYYYQn>_SALE_total_spending``.
    target_col : str
        Name of the target column, e.g. ``'2024Q4_SALE_total_spending'``. Its
        leading ``YYYYQn`` token decides which quarters count as "previous".

    Returns
    -------
    tuple
        ``(X, y, feature_sets)`` where ``X`` is ``current_account_nbr`` plus all
        feature frames concatenated column-wise with NaN replaced by 0, ``y`` is
        the target with NaN replaced by 0 (all zeros when ``target_col`` is not
        in ``df``), and ``feature_sets`` maps a family name to its frame.

    Raises
    ------
    KeyError
        If one of the hard-coded feature columns is missing from ``df``.
    """

    def _quarter_tag(col):
        """Return the leading 'YYYYQn' token of a column name, or None."""
        tag = str(col).split('_')[0]
        if len(tag) == 6 and tag[:4].isdigit() and tag[4] == 'Q' and tag[5].isdigit():
            return tag
        return None

    # Allow scoring frames that do not carry the target yet.
    if target_col in df.columns:
        y = df[target_col].fillna(0)
    else:
        y = pd.Series(0.0, index=df.index, name=target_col)

    feature_sets = {
        # Constant quarter indicator for every row. (A one-row get_dummies
        # frame would only populate index 0 after the column-wise concat.)
        'seasonal': pd.DataFrame({'quarter_Q4': 1}, index=df.index),

        'account': df[['age_days', 'activation_status_numeric', 'ebill_ind_numeric',
                      'overlimit_type_flag_numeric', 'employee_code_numeric']],

        'payment': df[['delinquency_count', 'zero_balance_count', 'overlimit_count',
                      'payment_consistency_score', 'cycles_since_last_delinquency',
                      'delinquency_trend', 'delinquency_acceleration']],

        'financial': df[['utilization_rate', 'balance_to_credit_ratio',
                        'cash_balance_to_cash_line_ratio', 'credit_grade_numeric',
                        'financial_stress_indicator', 'weighted_utilization']],

        'risk': df[['fraud_risk_score', 'high_delinquency_flag',
                   'rapid_balance_change_flag', 'high_return_risk']],

        'cash_flow': df[['cash_balance', 'cash_flow_efficiency', 'cash_balance_trend',
                        'cash_to_debt_ratio', 'cash_buffer_ratio',
                        'negative_cash_balance_flag', 'negative_cash_balance_frequency']],

        'lifecycle': pd.get_dummies(df['customer_lifecycle_stage'], prefix='lifecycle'),

        'sales_history': df[[f'sales_change_month_{i}_to_{i+1}' for i in range(1, 6)]],
    }

    # Quarters are discovered from the frame that was passed in (not from a
    # module-level list) and de-duplicated, so a second column sharing a
    # quarter tag cannot produce repeated lags or self-to-self growth terms.
    target_quarter = target_col.split('_')[0]
    all_quarters = sorted({
        tag
        for tag in (_quarter_tag(col) for col in df.columns if 'SALE_total_spending' in str(col))
        if tag is not None
    })
    # 'YYYYQn' strings sort chronologically, so plain string comparison works.
    previous_quarters = [q for q in all_quarters if q < target_quarter]

    # Lags: lag_1 is the quarter immediately before the target, lag_2 the one
    # before that, and so on (up to four quarters back).
    lag_features = {}
    for i, quarter in enumerate(reversed(previous_quarters[-4:]), 1):
        col = f"{quarter}_SALE_total_spending"
        if col in df.columns:
            lag_features[f'lag_{i}_quarter'] = df[col].fillna(0)

    feature_sets['lags'] = pd.DataFrame(lag_features)

    # Quarter-over-quarter relative growth between consecutive known quarters.
    growth_features = {}
    for i in range(len(previous_quarters) - 1):
        q1 = previous_quarters[i]
        q2 = previous_quarters[i + 1]
        col1 = f"{q1}_SALE_total_spending"
        col2 = f"{q2}_SALE_total_spending"
        if col1 in df.columns and col2 in df.columns:
            # A zero base is nudged to 0.001 to avoid division by zero.
            denominator = df[col1].replace(0, 0.001)
            growth_features[f'growth_{q1}_to_{q2}'] = (df[col2] - df[col1]) / denominator

    feature_sets['growth'] = pd.DataFrame(growth_features)

    # Rolling statistics over the most recent previous quarters.
    rolling_features = {}

    if len(previous_quarters) >= 2:
        cols = [f"{q}_SALE_total_spending" for q in previous_quarters[-2:]
                if f"{q}_SALE_total_spending" in df.columns]
        if cols:
            rolling_features['last_2q_avg'] = df[cols].mean(axis=1)

    if len(previous_quarters) >= 4:
        cols = [f"{q}_SALE_total_spending" for q in previous_quarters[-4:]
                if f"{q}_SALE_total_spending" in df.columns]
        if cols:
            rolling_features['last_4q_avg'] = df[cols].mean(axis=1)
            rolling_features['last_4q_std'] = df[cols].std(axis=1)

    # Same quarter one year earlier. The year is computed rather than
    # string-replaced, so a target outside 2024 can never resolve to the
    # target column itself.
    if _quarter_tag(target_col) is not None:
        same_q_last_year = f"{int(target_quarter[:4]) - 1}{target_quarter[4:]}"
        same_q_col = f"{same_q_last_year}_SALE_total_spending"
        if same_q_col in df.columns:
            rolling_features['same_q_last_year'] = df[same_q_col].fillna(0)

    feature_sets['rolling'] = pd.DataFrame(rolling_features)

    def _precedes_target(col):
        """True unless the column is tagged with the target quarter or a later one."""
        tag = _quarter_tag(col)
        return tag is None or tag < target_quarter

    # Velocity / acceleration columns; anything tagged with the target quarter
    # or later is dropped because it would leak information from the future.
    velocity_cols = [col for col in df.columns
                     if 'velocity' in col and 'SALE' in col and _precedes_target(col)]
    accel_cols = [col for col in df.columns
                  if 'acceleration' in col and 'SALE' in col and _precedes_target(col)]

    feature_sets['velocity'] = df[velocity_cols] if velocity_cols else pd.DataFrame()
    feature_sets['acceleration'] = df[accel_cols] if accel_cols else pd.DataFrame()

    # Account id first, then every feature family side by side.
    X = pd.concat([df[['current_account_nbr']], *feature_sets.values()], axis=1)

    # Remaining gaps are treated as zero.
    X = X.fillna(0)

    return X, y, feature_sets


# Build the model matrix, target and per-family feature frames from the full dataset.
X, y, feature_sets = create_features(df, target_col)

# NOTE(review): TimeSeriesSplit folds by row position. Rows of X come straight
# from df; confirm they are ordered in time, otherwise a shuffled K-fold split
# is the more appropriate validation scheme.
tscv = TimeSeriesSplit(n_splits=3)

# Standardise the float64/int64 columns only. The account number is an
# identifier, not a feature, so it is kept out of the scaled set to leave the
# key in X intact.
scaler = StandardScaler()
numeric_cols = (
    X.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('current_account_nbr', errors='ignore')
)

In [None]:

import numpy as np


# Locate +/-infinity among the numeric feature columns.
inf_mask = np.isinf(X[numeric_cols])
has_infinite_values = inf_mask.any().any()
if has_infinite_values:
    print(f"Found {inf_mask.sum().sum()} infinity values in the data")

    # Convert infinities to NaN so they can be imputed like missing values.
    X[numeric_cols] = X[numeric_cols].replace([np.inf, -np.inf], np.nan)

    # Fill each column that now has gaps with that column's own median.
    for feature_name in numeric_cols:
        if not X[feature_name].isna().any():
            continue
        median_val = X[feature_name].median()
        X[feature_name] = X[feature_name].fillna(median_val)
        print(f"Replaced NaN values in {feature_name} with median: {median_val}")

# Cap magnitudes well below the float64 maximum so that later arithmetic
# on these columns cannot overflow.
max_allowed = np.finfo('float64').max / 1e10
for feature_name in numeric_cols:
    overflow_count = (X[feature_name].abs() > max_allowed).sum()
    if overflow_count == 0:
        continue
    print(f"Found {overflow_count} extremely large values in {feature_name}")

    # Clamp the positive side first, then the negative side.
    X.loc[X[feature_name] > max_allowed, feature_name] = max_allowed
    X.loc[X[feature_name] < -max_allowed, feature_name] = -max_allowed

# Fit the scaler on the cleaned columns and standardise them in place.
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])


def create_segments(df):
    """Return boolean membership masks for the account segments.

    Age segments (from ``age_days``) are always produced: below 180,
    from 180 up to but excluding 730, and 730 or above. Volatility segments
    are added when ``last_4q_std`` is present (split at its 75th percentile),
    and utilisation segments when ``utilization_rate`` is present (above 70 /
    at most 30). Each mask is a Series aligned to ``df.index``.
    """

    def _as_mask(condition):
        # Every mask is re-wrapped on the frame's index.
        return pd.Series(condition, index=df.index)

    age = df['age_days']
    segments = {
        'new_accounts': _as_mask(age < 180),
        'established_accounts': _as_mask((age >= 180) & (age < 730)),
        'mature_accounts': _as_mask(age >= 730),
    }

    if 'last_4q_std' in df.columns:
        volatility = df['last_4q_std']
        upper_quartile = volatility.quantile(0.75)
        segments['high_volatility'] = _as_mask(volatility > upper_quartile)
        segments['low_volatility'] = _as_mask(volatility <= upper_quartile)

    if 'utilization_rate' in df.columns:
        utilization = df['utilization_rate']
        segments['high_utilizers'] = _as_mask(utilization > 70)
        segments['low_utilizers'] = _as_mask(utilization <= 30)

    return segments

# create_segments compares age_days against 180/730 and utilization_rate
# against 70/30, i.e. raw units, but X[numeric_cols] has been standardised by
# this point. Segment on the original values from df for those two columns so
# the cut-offs mean what they say. The volatility split is quantile-based and
# therefore unaffected by the (order-preserving) scaling.
segment_input = X.copy()
for raw_col in ('age_days', 'utilization_rate'):
    if raw_col in df.columns and raw_col in segment_input.columns:
        # Same NaN -> 0 convention that create_features applies.
        segment_input[raw_col] = df[raw_col].fillna(0)

segments = create_segments(segment_input)


def train_base_model(X_train, y_train):
    """Fit and return an XGBoost regressor on ``X_train`` / ``y_train``.

    The ``current_account_nbr`` column is dropped before fitting, so it must
    be present in ``X_train``.
    """
    # Everything except the account identifier is used as a feature.
    training_features = X_train.drop('current_account_nbr', axis=1)

    regressor = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=3,
        reg_alpha=0.1,
        reg_lambda=1.0,
        enable_categorical=True,
    )
    regressor.fit(training_features, y_train)

    return regressor


def train_segment_models(X, y, segments):
    """Train one model per segment that has more than 100 member rows.

    ``segments`` maps a segment name to a boolean mask over the rows of ``X``
    and ``y``. Returns a dict from segment name to fitted model; segments
    with 100 or fewer members are left out.
    """
    segment_models = {}

    for segment_name, segment_mask in segments.items():
        # Skip segments that are too small to train on.
        if not segment_mask.sum() > 100:
            continue
        segment_models[segment_name] = train_base_model(X[segment_mask], y[segment_mask])

    return segment_models


def apply_time_series_adjustment(account_id, history, prediction):
    """Blend an ML prediction with a one-step AR(1) forecast of ``history``.

    Parameters
    ----------
    account_id
        Not used by the computation; kept for the caller's bookkeeping.
    history
        Past values for the account, oldest first. Must support
        ``astype(float)``, ``mean()`` and ``std()`` (e.g. a pandas Series).
    prediction : float
        The machine-learning prediction to adjust.

    Returns
    -------
    float
        ``ml_weight * prediction + (1 - ml_weight) * forecast`` where
        ``ml_weight = min(0.8, 0.5 + std/mean)``. ``prediction`` is returned
        unchanged when there are fewer than four observations, when the
        forecast is not finite, or when fitting/forecasting fails.
    """
    try:
        history = history.astype(float)

        # Fewer than four points: do not attempt to fit the AR(1) model.
        if len(history) < 4:
            return prediction

        model_fit = ARIMA(history, order=(1, 0, 0)).fit()

        # forecast() hands back a label-indexed Series for Series input and the
        # label of the forecast step is not 0, so read the value by position.
        forecast = float(np.asarray(model_fit.forecast(steps=1))[0])
        if not np.isfinite(forecast):
            return prediction

        # Coefficient of variation; the ML weight grows with it, capped at 0.8.
        mean_value = history.mean()
        volatility = history.std() / mean_value if mean_value > 0 else 0
        ml_weight = min(0.8, 0.5 + volatility)
        ts_weight = 1 - ml_weight

        return ml_weight * prediction + ts_weight * forecast
    except Exception:
        # Best effort: any fitting/conversion failure falls back to the ML
        # prediction, but KeyboardInterrupt/SystemExit are no longer swallowed.
        return prediction


def predict_with_ensemble(X_test, base_model, segment_models, segments, historical_data):
    """Combine the base model with the applicable segment models.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Feature rows to score; must contain ``current_account_nbr``, which is
        dropped before any model is called.
    base_model
        Fitted model exposing ``predict``; applied to every row.
    segment_models : dict
        Segment name -> fitted model exposing ``predict``.
    segments : dict
        Segment name -> boolean Series whose index contains ``X_test.index``.
        Must have an entry for every key of ``segment_models``.
    historical_data
        Accepted for interface compatibility; not used here.

    Returns
    -------
    numpy.ndarray
        One prediction per row of ``X_test``. Rows covered by at least one
        segment model get ``0.7 * mean(segment predictions) + 0.3 * base``;
        all other rows get the base prediction.
    """
    features = X_test.drop('current_account_nbr', axis=1)
    base_preds = np.asarray(base_model.predict(features), dtype=float)

    # Segments overlap (age, volatility and utilisation are independent
    # splits), so accumulate every applicable segment prediction and average
    # them instead of letting the last segment overwrite the earlier ones.
    segment_sum = np.zeros(len(X_test), dtype=float)
    segment_count = np.zeros(len(X_test), dtype=int)

    for segment_name, model in segment_models.items():
        # Positional boolean mask in X_test row order.
        segment_mask = segments[segment_name].loc[X_test.index].to_numpy(dtype=bool)
        if not segment_mask.any():
            continue

        segment_preds = np.asarray(model.predict(features.loc[segment_mask]), dtype=float)
        segment_sum[segment_mask] += segment_preds
        segment_count[segment_mask] += 1

    final_preds = base_preds.copy()
    covered = segment_count > 0
    final_preds[covered] = (
        0.7 * (segment_sum[covered] / segment_count[covered])
        + 0.3 * base_preds[covered]
    )

    return final_preds



def create_historical_data(df, target_col):
    """Return per-account spending history for quarters before the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``current_account_nbr``; quarterly spending columns are
        expected to be named ``<YYYYQn>_SALE_total_spending``.
    target_col : str
        Target column name; its leading ``YYYYQn`` token is the cut-off.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``current_account_nbr`` with one column per earlier
        quarter, in chronological order.
    """
    target_quarter = target_col.split('_')[0]

    # Quarter tags are read from the frame that was passed in and collected in
    # a set, so another column sharing a tag cannot duplicate a history column.
    earlier_quarters = set()
    for col in df.columns:
        name = str(col)
        if 'SALE_total_spending' not in name:
            continue
        tag = name.split('_')[0]
        looks_like_quarter = (
            len(tag) == 6 and tag[:4].isdigit() and tag[4] == 'Q' and tag[5].isdigit()
        )
        # 'YYYYQn' strings sort chronologically.
        if looks_like_quarter and tag < target_quarter:
            earlier_quarters.add(tag)

    history_cols = [
        f"{q}_SALE_total_spending"
        for q in sorted(earlier_quarters)
        if f"{q}_SALE_total_spending" in df.columns
    ]
    historical_data = df[['current_account_nbr'] + history_cols].set_index('current_account_nbr')

    return historical_data

# Spending columns for the quarters before the target quarter, indexed by
# current_account_nbr (see create_historical_data).
historical_data = create_historical_data(df, target_col)


cv_results = []

# The quarter indicator can come out of the feature concat as a mixed
# bool/int object column. Mapping it through string keys ('0', 'True') matches
# none of the actual values and turns the whole column into NaN, so cast it to
# float instead (also safe to re-run).
X['quarter_Q4'] = X['quarter_Q4'].astype(float)

for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Fit the base model on this fold's training rows only; it must exist
    # before predict_with_ensemble is called below.
    base_model = train_base_model(X_train, y_train)

    # Restrict every segment mask to the training rows of this fold.
    fold_segments = {name: mask.loc[X_train.index] for name, mask in segments.items()}
    segment_models = train_segment_models(X_train, y_train, fold_segments)

    # Same masks, restricted to the held-out rows.
    test_segments = {name: mask.loc[X_test.index] for name, mask in segments.items()}

    y_pred = predict_with_ensemble(X_test, base_model, segment_models, test_segments, historical_data)

    # Fold metrics.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    cv_results.append({
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2
    })


# Average each metric over the folds.
cv_df = pd.DataFrame(cv_results)
print("Cross-validation results:")
print(cv_df.mean())


# Refit the base model and the per-segment models on every row.
final_base_model = train_base_model(X, y)
final_segment_models = train_segment_models(X, y, segments)

# Pair each model input column with the base model's importance score,
# most important first.
model_feature_names = X.drop('current_account_nbr', axis=1).columns
feature_importance = pd.DataFrame({
    'Feature': model_feature_names,
    'Importance': final_base_model.feature_importances_,
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

print("\nTop 20 important features:")
print(feature_importance.head(20))

# Horizontal bar chart of the 20 highest-ranked features, saved as a PNG.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(20), ax=ax)
ax.set_title('Top 20 Features by Importance')
fig.tight_layout()
fig.savefig('feature_importance.png')


def forecast_q4_spending(new_data):
    """Score ``new_data`` with the final base and segment models.

    Relies on module-level state created earlier in the notebook:
    ``target_col``, ``X`` (training matrix, for its column layout),
    ``numeric_cols``, the fitted ``scaler``, ``final_base_model``,
    ``final_segment_models`` and ``historical_data``.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Account rows with the same raw columns that were used for training.

    Returns
    -------
    pandas.DataFrame
        Columns ``current_account_nbr`` and ``predicted_q4_spending``, one row
        per row of ``new_data``.
    """
    X_new, _, _ = create_features(new_data, target_col)

    # Keep only rows that belong to new_data and present exactly the columns
    # (and order) the models were trained on; columns unseen here (e.g. a
    # lifecycle dummy absent from this batch) are filled with 0.
    X_new = X_new.loc[X_new.index.isin(new_data.index)]
    X_new = X_new.reindex(columns=X.columns, fill_value=0)

    # The quarter indicator can be a mixed bool/int object column, which the
    # regressor cannot consume; make it numeric.
    if 'quarter_Q4' in X_new.columns:
        X_new['quarter_Q4'] = X_new['quarter_Q4'].astype(float)

    # Same sanitising as applied to the training matrix: infinities become
    # the column median (0 if the whole column is unusable) and magnitudes are
    # capped, otherwise scaler.transform rejects the input.
    numeric_block = X_new[numeric_cols].replace([np.inf, -np.inf], np.nan)
    numeric_block = numeric_block.fillna(numeric_block.median()).fillna(0)
    max_allowed = np.finfo('float64').max / 1e10
    X_new[numeric_cols] = numeric_block.clip(lower=-max_allowed, upper=max_allowed)

    # Segment BEFORE scaling: create_segments compares against raw-unit
    # thresholds (days, utilisation percentages), which are meaningless on
    # standardised values.
    new_segments = create_segments(X_new)

    X_new[numeric_cols] = scaler.transform(X_new[numeric_cols])

    predictions = predict_with_ensemble(
        X_new,
        final_base_model,
        final_segment_models,
        new_segments,
        historical_data
    )

    results = pd.DataFrame({
        'current_account_nbr': new_data['current_account_nbr'],
        'predicted_q4_spending': predictions
    })

    return results


# Score every account in the loaded dataset.
q4_forecast = forecast_q4_spending(df)

# Summary statistics of the forecast table.
print("\nQ4 spending forecast summary:")
forecast_summary = q4_forecast.describe()
print(forecast_summary)

# Write the per-account forecast to disk, without the index column.
q4_forecast.to_csv('q4_spending_forecast.csv', index=False)

print("\nForecasting complete. Results saved to 'q4_spending_forecast.csv'")