In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind, f_oneway
from scipy.stats import kurtosis, skew


# Load the cleaned subscription-churn dataset and preview its first five rows
df = pd.read_csv(filepath_or_buffer='../data/processed/cleaned_subscriptions_churn.csv')
df.head(n=5)


Unnamed: 0,customer_id,gender,months_subscribed,streaming_quality,subscription_type,payment_mode,monthly_plan_cost,total_revenue,subscription_canceled,app_usage_hours,last30d_usage_hours,customer_rating,promo_email_clicks,device_type,num_profiles,auto_renew,support_tickets_last6m,nps_score,is_active_last30d
0,7590-VHVEG,Female,1,HD,Month-to-month,Electronic check,29.85,29.85,0,4.1,125.9,3,3,Mobile,1,1,0,9,1
1,5575-GNVDE,Male,34,HD,One year,Mailed check,56.95,1889.5,0,5.1,177.9,4,2,Mobile,1,0,0,7,1
2,3668-QPYBK,Male,2,HD,Month-to-month,Mailed check,53.85,108.15,1,2.4,88.5,5,4,Desktop,1,1,0,1,1
3,7795-CFOCW,Male,45,HD,One year,Bank transfer (automatic),42.3,1840.75,0,3.8,124.0,5,5,SmartTV,1,0,0,6,1
4,9237-HQITU,Female,2,4K,Month-to-month,Electronic check,70.7,151.65,1,6.3,182.8,5,2,Mobile,2,1,0,3,1


In [3]:
# ____________________________________________________________________________feature_scaling_analysis.py_________________________________________
# Run: python feature_scaling_analysis.py /path/to/cleaned_subscriptions_churn.xlsx

# Column order of the numeric summary table; also guarantees the 'feature'
# column exists when no numeric column produced a row.
_SUMMARY_COLUMNS = [
    "feature", "count", "mean", "std", "min", "25%", "50%", "75%", "max",
    "skewness", "kurtosis", "pct_outliers_1.5_iqr", "pct_outliers_3.0_iqr",
]


def _read_table(path):
    """Load the dataset at ``path``: ``.csv`` via ``pd.read_csv``, anything else via ``pd.read_excel``."""
    extension = os.path.splitext(str(path))[1].lower()
    if extension == ".csv":
        return pd.read_csv(path)
    return pd.read_excel(path)


def _as_markdown(table):
    """Render a DataFrame/Series as a Markdown table, or as plain text if ``tabulate`` is not installed."""
    try:
        return table.to_markdown()
    except ImportError:
        # pandas raises ImportError when the optional 'tabulate' package is missing.
        return table.to_string()


def _summarize_numeric(name, values):
    """Build one summary row (location, spread, shape, IQR outlier shares) for a non-empty float Series."""
    count = values.size
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    def pct_outside(multiplier):
        # Share (in percent) of values beyond `multiplier` IQRs from the quartiles.
        outside = (values < q1 - multiplier * iqr) | (values > q3 + multiplier * iqr)
        return round(outside.sum() / count * 100, 3)

    raw = values.to_numpy()
    return {
        "feature": name,
        "count": count,
        "mean": values.mean(),
        "std": values.std(),
        "min": values.min(),
        "25%": q1,
        "50%": values.median(),
        "75%": q3,
        "max": values.max(),
        "skewness": float(skew(raw)),
        # fisher=False -> Pearson kurtosis (a normal distribution scores 3, not 0).
        "kurtosis": float(kurtosis(raw, fisher=False)),
        "pct_outliers_1.5_iqr": pct_outside(1.5),
        "pct_outliers_3.0_iqr": pct_outside(3.0),
    }


def _recommend_scaling(row):
    """Map one summary row to a scaling recommendation string (rules are checked in order)."""
    pct = row['pct_outliers_1.5_iqr']
    sk = row['skewness']
    mn, mx = row['min'], row['max']
    if pd.notna(mn) and pd.notna(mx) and mn >= 0 and mx <= 1:
        return "Already 0-1 (no scaling) or MinMax if needed"
    if pd.notna(pct) and pct > 20:
        if pd.notna(sk) and sk > 1 and mn >= 0:
            return "Log1p (or PowerTransformer) then RobustScaler"
        return "RobustScaler"
    if pd.notna(sk) and abs(sk) > 1.0:
        if mn >= 0:
            return "Log1p or PowerTransformer then StandardScaler"
        return "PowerTransformer (Yeo-Johnson) then StandardScaler"
    if pd.notna(sk) and abs(sk) > 0.5:
        return "StandardScaler (consider PowerTransformer)"
    return "StandardScaler"


def analyze(path, out_dir="analysis_output", max_plots=12):
    """Profile the numeric features of a dataset and recommend a scaler for each.

    Writes into ``out_dir`` (created if needed):
      * ``numeric_feature_summary.csv`` - per-feature statistics, IQR outlier
        percentages and the recommended scaling;
      * ``feature_scaling_analysis_report.md`` - the same table plus the ten most
        frequent values of every non-numeric column and general guidance;
      * ``plots/hist_<col>.png`` and ``plots/box_<col>.png`` for the first
        ``max_plots`` numeric columns.

    Parameters
    ----------
    path : str or path-like
        Dataset to analyse. ``.csv`` files are read with ``pd.read_csv``; every
        other extension is passed to ``pd.read_excel``.
    out_dir : str
        Output directory.
    max_plots : int
        Maximum number of numeric columns to plot.

    Returns
    -------
    None
        Results are written to disk and the output locations are printed.
    """
    os.makedirs(out_dir, exist_ok=True)
    df = _read_table(path)

    n_rows, n_cols = df.shape
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

    summary_rows = []
    for col in numeric_cols:
        s = df[col].dropna().astype(float)
        if s.empty:
            # Nothing to describe for an all-missing column.
            continue
        summary_rows.append(_summarize_numeric(col, s))

    # Passing the column list keeps set_index() valid even with zero rows.
    num_summary = pd.DataFrame(summary_rows, columns=_SUMMARY_COLUMNS).set_index('feature')

    if not num_summary.empty:
        num_summary['recommended_scaling'] = num_summary.apply(_recommend_scaling, axis=1)

    # Save CSV + markdown report
    out_csv = os.path.join(out_dir, "numeric_feature_summary.csv")
    num_summary.to_csv(out_csv)

    md = os.path.join(out_dir, "feature_scaling_analysis_report.md")
    with open(md, "w", encoding="utf-8") as f:
        f.write("# Feature Scaling Analysis Report\n\n")
        f.write(f"Rows: {n_rows}  \nColumns: {n_cols}\n\n")
        f.write("## Numeric summary\n\n")
        if num_summary.empty:
            f.write("_No numeric columns with data were found._")
        else:
            f.write(_as_markdown(num_summary))
        f.write("\n\n## Categorical columns (top categories)\n\n")
        for c in categorical_cols:
            f.write(f"### {c}\n")
            f.write(_as_markdown(df[c].value_counts(dropna=False).head(10)))
            f.write("\n\n")
        f.write("\n\n## Global recommendations\n")
        f.write("- Use StandardScaler for most gradient/linear models.\n")
        f.write("- Use RobustScaler for features with many outliers (>20% by 1.5*IQR).\n")
        f.write("- Use Log1p or PowerTransformer for highly skewed positive variables.\n")
        f.write("- Tree models (RF, XGBoost) do not need scaling usually.\n")

    # plots
    plot_dir = os.path.join(out_dir, "plots")
    os.makedirs(plot_dir, exist_ok=True)
    for col in numeric_cols[:max_plots]:
        s = df[col].dropna().astype(float)
        # Path separators inside a column name would otherwise point into a missing sub-directory.
        file_stem = str(col).replace("/", "_").replace("\\", "_")

        fig, ax = plt.subplots(figsize=(6, 3.5))
        ax.hist(s, bins=40)
        ax.set_title(f"Histogram: {col}")
        ax.set_xlabel(col)
        fig.savefig(os.path.join(plot_dir, f"hist_{file_stem}.png"))
        plt.close(fig)

        fig, ax = plt.subplots(figsize=(6, 2))
        ax.boxplot(s, vert=False)
        ax.set_title(f"Boxplot: {col}")
        fig.savefig(os.path.join(plot_dir, f"box_{file_stem}.png"))
        plt.close(fig)

    print("Analysis complete. Outputs in:", out_dir)
    print("CSV:", out_csv)
    print("Report:", md)
    print("Plots:", plot_dir)

if __name__ == '__main__':
    # In a notebook cell __name__ is also '__main__', and sys.argv then carries the
    # kernel launcher's options (typically "-f <connection file>") instead of a
    # dataset path. Treat an option-like first argument as "no path given" so the
    # cell prints the usage hint rather than calling analyze("-f").
    cli_path = sys.argv[1] if len(sys.argv) > 1 else None
    if cli_path is None or cli_path.startswith("-"):
        print("Usage: python feature_scaling_analysis.py /path/to/file.xlsx")
    else:
        analyze(cli_path)

NameError: name 'sys' is not defined

In [4]:
#_____________________________________________________________________O U T L I E R __________________________________________________
# Collect the names of every numeric-dtype column in the frame
num_cols = df.select_dtypes(include=np.number).columns.to_list()
print("Numerical Columns:", num_cols)

# Function to compute Modified Z-Score
def modified_z_score(series):
    """Return the modified z-score of every value in ``series``.

    The score is ``0.6745 * (x - median) / MAD``, where MAD is the median of the
    absolute deviations from the median. 0.6745 is approximately the 0.75
    quantile of the standard normal distribution, which puts the score on a
    scale comparable to an ordinary z-score.

    Parameters
    ----------
    series : pd.Series
        Numeric values. Missing values are ignored when computing the median
        and the MAD, and yield NaN scores.

    Returns
    -------
    pd.Series
        Scores aligned to ``series.index``. When the MAD is 0 (at least half of
        the values equal the median) the scores are undefined and all zeros are
        returned, so no value is flagged.
    """
    median_val = series.median()
    deviations = series - median_val
    # Series.median skips NaN, unlike np.median, so one missing value no longer
    # turns every score into NaN.
    mad = deviations.abs().median()
    if mad == 0:
        # Keep the return type consistent with the regular branch (an indexed Series).
        return pd.Series(0.0, index=series.index)
    return 0.6745 * deviations / mad

# Thresholds
# IQR_multiplier: fence width in IQRs beyond Q1/Q3; MAD_threshold: cut-off on the
# absolute modified z-score; Z_threshold: cut-off on the absolute classic z-score.
IQR_multiplier = 1.5
MAD_threshold = 3.5
Z_threshold = 3

# Dictionary to store results: column name -> {"Skewness", "Recommended Method", "Num Outliers"}
outlier_summary = {}

# For every numeric column, choose the detection method from the sample skewness
# and count how many rows that method flags.
for col in num_cols:
    data = df[col].dropna()
    skewness = data.skew()
    
    # Roughly symmetric columns (|skew| < 0.5) use IQR fences plus z-scores; all
    # others (including a NaN skewness, which fails this comparison) use the
    # median/MAD based score.
    if abs(skewness) < 0.5:
        method = "IQR + Z-Score"
        
        # IQR method: values outside [Q1 - k*IQR, Q3 + k*IQR]
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        iqr_outliers = data[(data < Q1 - IQR_multiplier*IQR) | (data > Q3 + IQR_multiplier*IQR)]
        
        # Z-Score method: values more than Z_threshold standard deviations from the mean
        mean_val = data.mean()
        std_val = data.std()
        z_outliers = data[np.abs((data - mean_val)/std_val) > Z_threshold]
        
        # Union of the row labels flagged by either method, so a row is counted once
        total_outliers = pd.Series(list(set(iqr_outliers.index) | set(z_outliers.index)))
        
    else:
        method = "Modified Z-Score (MAD)"
        mz = modified_z_score(data)
        # Row labels whose absolute modified z-score exceeds the threshold
        total_outliers = data.index[np.abs(mz) > MAD_threshold]
    
    # Only the number of flagged rows is kept; the labels themselves are not stored
    outlier_summary[col] = {
        "Skewness": round(skewness, 3),
        "Recommended Method": method,
        "Num Outliers": len(total_outliers)
    }

# Display summary: transpose so each analysed column becomes a row, then turn the
# index into a regular "Column" field
outlier_df = pd.DataFrame(outlier_summary).T.reset_index().rename(columns={"index": "Column"})
print(outlier_df)

Numerical Columns: ['months_subscribed', 'monthly_plan_cost', 'total_revenue', 'subscription_canceled', 'app_usage_hours', 'last30d_usage_hours', 'customer_rating', 'promo_email_clicks', 'num_profiles', 'auto_renew', 'support_tickets_last6m', 'nps_score', 'is_active_last30d']
                    Column Skewness      Recommended Method Num Outliers
0        months_subscribed     0.24           IQR + Z-Score            0
1        monthly_plan_cost   -0.221           IQR + Z-Score            0
2            total_revenue    0.964  Modified Z-Score (MAD)          137
3    subscription_canceled    1.063  Modified Z-Score (MAD)            0
4          app_usage_hours    0.043           IQR + Z-Score            0
5      last30d_usage_hours    0.501  Modified Z-Score (MAD)            0
6          customer_rating   -0.003           IQR + Z-Score            0
7       promo_email_clicks    0.558  Modified Z-Score (MAD)           25
8             num_profiles    1.326  Modified Z-Score (MAD)       