In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load data
df = pd.read_csv(r"C:\Users\user\Desktop\churn_prediction_project\data\Telco-Customer-Churn.csv")

# Clean TotalCharges
df['TotalCharges'] = df['TotalCharges'].replace(" ", np.nan).astype(float)
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

# Create AvgMonthlySpend
df['AvgMonthlySpend'] = df['TotalCharges'] / (df['tenure'] + 1)

# ------------------------------
# 🧩 Outlier Capping + Summary Stats
# ------------------------------
num_col = df.select_dtypes(include=["float64", "int64"]).columns

# Exclude binary column(s)
binary_cols = ['SeniorCitizen']  
num_col = [col for col in num_col if col not in binary_cols]

# Cap outliers using IQR method
for col in num_col:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    df[col] = np.where(df[col] < lower, lower, df[col])
    df[col] = np.where(df[col] > upper, upper, df[col])

# Summary statistics
summary_stats = pd.DataFrame({
    'mean': df[num_col].mean(),
    'median': df[num_col].median(),
    'mode': df[num_col].mode().iloc[0],
    'std': df[num_col].std()
})
print("\n📈 Summary Statistics for Numerical Features:")
summary_stats.to_csv(r"C:\Users\user\Desktop\churn_prediction_project/data/summary_statistics.csv")

# ------------------------------
# Encode categorical features
# ------------------------------
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in ['customerID', 'Churn']:
    if col in categorical_cols:
        categorical_cols.remove(col)

df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# ------------------------------
# Normalize numeric features
# ------------------------------
scaler = MinMaxScaler()
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlySpend']
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

# ------------------------------
# Save preprocessed data
# ------------------------------
df_encoded.to_csv(r"C:\Users\user\Desktop\churn_prediction_project\data\preprocessed_churn_data.csv", index=False)

print("✅ All categorical columns encoded and preprocessed data saved successfully.")



📈 Summary Statistics for Numerical Features:
✅ All categorical columns encoded and preprocessed data saved successfully.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
