# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

# Read the engineered feature table from disk
df = pd.read_csv('engineered_features.csv')

print("Dataset shape:", df.shape)
print("\nFirst few rows:", df.head(), sep="\n")

# Only numeric columns are analysed; COUNTRY is expected to be non-numeric
# and therefore drops out of this selection.
numeric_cols = list(df.select_dtypes(include='number').columns)
print(f"\nNumeric columns to analyze: {len(numeric_cols)}")

# ============================================
# 1. TEST FOR NON-LINEARITY
# ============================================

def test_linearity(data, skew_threshold=1.0, alpha=0.05):
    """
    Screen each column's distribution for skewness and non-normality.

    Despite its name, this function does not test the linearity of any
    relationship. It computes per-column distribution diagnostics and flags
    the columns that look like candidates for a normalising transform.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns to examine. Every column is expected to be numeric.
    skew_threshold : float, default 1.0
        A column is flagged when the absolute value of its skewness exceeds
        this threshold.
    alpha : float, default 0.05
        A column is flagged when its Shapiro-Wilk p-value is below this level.

    Returns
    -------
    pandas.DataFrame
        One row per input column (indexed by column name) with the columns
        'skewness', 'kurtosis', 'shapiro_pvalue' and 'needs_transform'.
        The first three are numeric and the last is boolean, so the result
        can be rounded and used directly as a boolean mask. Kurtosis is
        reported for information only; it does not affect the flag.
    """
    result_columns = ['skewness', 'kurtosis', 'shapiro_pvalue', 'needs_transform']
    names = []
    rows = []

    for col in data.columns:
        series = data[col]
        observed = series.dropna()

        skewness = series.skew()
        # Shapiro-Wilk is only attempted with more than 3 observations;
        # otherwise the p-value is reported as missing.
        if len(observed) > 3:
            p_value = float(stats.shapiro(observed)[1])
        else:
            p_value = np.nan

        # Comparisons against NaN are False, so a missing statistic never
        # flags a column on its own.
        flagged = bool(abs(skewness) > skew_threshold or p_value < alpha)

        names.append(col)
        rows.append({
            'skewness': skewness,
            'kurtosis': series.kurtosis(),
            'shapiro_pvalue': p_value,
            'needs_transform': flagged,
        })

    # Building from row records keeps a proper dtype per column (instead of
    # one object-dtype frame) and still yields the expected column names
    # when `data` has no columns at all.
    return pd.DataFrame(rows, index=names, columns=result_columns)


# The "test_" prefix is historical; keep pytest from collecting this helper.
test_linearity.__test__ = False

# Section banner
print("\n" + "="*60, "NON-LINEARITY ANALYSIS", "="*60, sep="\n")

# Per-column distribution diagnostics for every numeric feature
linearity_results = test_linearity(df[numeric_cols])
print("\nLinearity Test Results:")
print(linearity_results.round(4))

# Keep only the rows whose 'needs_transform' flag is set
print("\n\nFeatures that need transformation:")
needs_transform_mask = linearity_results['needs_transform'].astype(bool)
transform_needed = linearity_results.loc[needs_transform_mask]
print(transform_needed[['skewness', 'kurtosis', 'shapiro_pvalue']])

# ============================================
# 2. APPLY TRANSFORMATIONS
# ============================================

def _fit_power_transform(fit, values):
    """Run a scipy power-transform fit; return the transformed array, or None if the fit fails."""
    try:
        return fit(values)[0]
    except (ValueError, ArithmeticError, RuntimeError):
        # Best effort: a failed fit just removes this candidate.
        return None


def _candidate_transforms(values):
    """Return {method name: transformed array} for every transform valid on `values`.

    `values` must be a 1-D float array without NaNs. Insertion order is the
    tie-breaking priority used when two candidates score equally.
    """
    candidates = {}
    all_positive = bool((values > 0).all())

    # 1. Log (strictly positive values only)
    if all_positive:
        candidates['log'] = np.log(values)

    # 2. Square root (non-negative values only)
    if (values >= 0).all():
        candidates['sqrt'] = np.sqrt(values)

    # 3. Box-Cox (strictly positive values only)
    if all_positive:
        fitted = _fit_power_transform(stats.boxcox, values)
        if fitted is not None:
            candidates['boxcox'] = fitted

    # 4. Yeo-Johnson (defined for any real values)
    fitted = _fit_power_transform(stats.yeojohnson, values)
    if fitted is not None:
        candidates['yeo-johnson'] = fitted

    # 5. Reciprocal (no zeros allowed)
    if (values != 0).all():
        candidates['reciprocal'] = 1.0 / values

    return candidates


def apply_transformations(data, columns_to_transform):
    """
    Transform each requested column with whichever candidate minimises |skewness|.

    For every column the candidates log, square root, Box-Cox, Yeo-Johnson
    and reciprocal are tried where they are mathematically valid. The one
    whose result has the smallest absolute skewness replaces the column.
    Transforms are fitted on the non-missing values only, and missing
    values stay missing at the same positions in the output.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    columns_to_transform : iterable of str
        Names of numeric columns in `data` to transform.

    Returns
    -------
    transformed_data : pandas.DataFrame
        Copy of `data` with the selected columns replaced.
    transformation_log : dict
        Maps each requested column to the chosen method name, or to
        'no_transform_constant' (all non-missing values identical) or
        'no_transform_possible' (no usable candidate). Columns with either
        of those two entries are left unchanged.
    """
    transformed_data = data.copy()
    transformation_log = {}

    for col in columns_to_transform:
        series = data[col]
        present = series.notna().to_numpy()
        values = series.to_numpy(dtype=float, na_value=np.nan)[present]

        # Nothing to fit on
        if values.size == 0:
            transformation_log[col] = 'no_transform_possible'
            continue

        # Skip if all values are the same
        if values.std() == 0:
            transformation_log[col] = 'no_transform_constant'
            continue

        # Score every candidate; drop those whose skewness is undefined so
        # that a NaN score can never win the comparison below.
        scored = {}
        for method, candidate in _candidate_transforms(values).items():
            score = abs(float(stats.skew(candidate)))
            if np.isfinite(score):
                scored[method] = (score, candidate)

        if not scored:
            transformation_log[col] = 'no_transform_possible'
            continue

        best_method, (_, best_values) = min(
            scored.items(), key=lambda item: item[1][0]
        )

        # Write results back positionally, leaving missing entries as NaN.
        restored = np.full(len(series), np.nan)
        restored[present] = best_values
        transformed_data[col] = restored
        transformation_log[col] = best_method

    return transformed_data, transformation_log

# Names of the features flagged for transformation
cols_to_transform = list(transform_needed.index)

if not cols_to_transform:
    print("\nNo features require transformation!")
else:
    df_transformed, transform_log = apply_transformations(df[numeric_cols], cols_to_transform)

    print("\n" + "="*60)
    print("TRANSFORMATION APPLIED")
    print("="*60)
    for col, method in transform_log.items():
        print(f"{col}: {method}")

    # Put the country identifier back as the first column
    df_transformed.insert(0, 'COUNTRY', df['COUNTRY'])

    # Skewness before vs. after, one row per transformed feature
    print("\n" + "="*60)
    print("COMPARISON: BEFORE vs AFTER TRANSFORMATION")
    print("="*60)

    comparison = pd.DataFrame(
        [
            {
                'Feature': col,
                'Original_Skewness': linearity_results.loc[col, 'skewness'],
                'Transformed_Skewness': df_transformed[col].skew(),
                'Transformation': transform_log[col],
            }
            for col in cols_to_transform
        ]
    )
    print(comparison.round(4))

    # ============================================
    # 3. VISUALIZE TRANSFORMATIONS
    # ============================================

    # Histograms for at most the first four transformed features:
    # left panel = original values, right panel = transformed values.
    cols_to_plot = cols_to_transform[:4]

    if cols_to_plot:
        n_rows = len(cols_to_plot)
        # squeeze=False keeps `axes` two-dimensional even for a single row
        fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4*n_rows), squeeze=False)

        for (ax_before, ax_after), col in zip(axes, cols_to_plot):
            ax_before.hist(df[col], bins=20, edgecolor='black', alpha=0.7)
            ax_before.set_title(f'{col} - Original (skew={df[col].skew():.2f})')

            ax_after.hist(df_transformed[col], bins=20, edgecolor='black', alpha=0.7, color='green')
            ax_after.set_title(f'{col} - Transformed ({transform_log[col]}, skew={df_transformed[col].skew():.2f})')

            for panel in (ax_before, ax_after):
                panel.set_xlabel('Value')
                panel.set_ylabel('Frequency')

        plt.tight_layout()
        plt.savefig('transformation_comparison.png', dpi=300, bbox_inches='tight')
        print("\n✓ Visualization saved as 'transformation_comparison.png'")
        plt.show()

    # Persist the transformed table
    df_transformed.to_csv('transformed_features.csv', index=False)
    print("\n✓ Transformed data saved as 'transformed_features.csv'")

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)