In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported!")

✓ Libraries imported!


In [2]:
df = pd.read_csv('data/cleaned_data.csv')
print(f"Original Dataset: {df.shape}")
print(df.head())

Original Dataset: (7043, 20)
   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2    Male              0      No         No       2          Yes   
3    Male              0      No         No      45           No   
4  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   
1  

In [4]:
print("\n" + "=" * 60)
print("CREATING NEW FEATURES")
print("=" * 60)

df['ChargesPerMonth'] = df['TotalCharges'] / (df['tenure'] + 1)  # +1 to avoid division by zero
print("✓ Created: ChargesPerMonth")

# Feature 2: Is New Customer (tenure < 6 months)
df['IsNewCustomer'] = (df['tenure'] < 6).astype(int)
print("✓ Created: IsNewCustomer")

# Feature 3: High Value Customer (MonthlyCharges > 70)
df['IsHighValue'] = (df['MonthlyCharges'] > 70).astype(int)
print("✓ Created: IsHighValue")

# Feature 4: Total Services (count of services customer has)
service_cols = ['PhoneService', 'InternetService', 'OnlineSecurity', 
                'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                'StreamingTV', 'StreamingMovies']

df['TotalServices'] = 0
for col in service_cols:
    df['TotalServices'] += (df[col] == 'Yes').astype(int)
    
print("✓ Created: TotalServices")

# Feature 5: Has Tech Support
df['HasTechSupport'] = (df['TechSupport'] == 'Yes').astype(int)
print("✓ Created: HasTechSupport")

# Feature 6: Payment Risk (Electronic check = 1, others = 0)
df['PaymentRisk'] = (df['PaymentMethod'] == 'Electronic check').astype(int)
print("✓ Created: PaymentRisk")

# Feature 7: Contract Score (higher = better contract)
contract_mapping = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
df['ContractScore'] = df['Contract'].map(contract_mapping)
print("✓ Created: ContractScore")

print(f"\nNew shape after feature creation: {df.shape}")


CREATING NEW FEATURES
✓ Created: ChargesPerMonth
✓ Created: IsNewCustomer
✓ Created: IsHighValue
✓ Created: TotalServices
✓ Created: HasTechSupport
✓ Created: PaymentRisk
✓ Created: ContractScore

New shape after feature creation: (7043, 27)


In [5]:
print("\nNew Features Sample:")
new_features = ['ChargesPerMonth', 'IsNewCustomer', 'IsHighValue', 
                'TotalServices', 'HasTechSupport', 'PaymentRisk', 'ContractScore']
print(df[new_features].head(10))


New Features Sample:
   ChargesPerMonth  IsNewCustomer  IsHighValue  TotalServices  HasTechSupport  \
0        14.925000              1            0              1               0   
1        53.985714              0            0              3               0   
2        36.050000              1            0              3               0   
3        40.016304              0            0              3               1   
4        50.550000              1            1              1               0   
5        91.166667              0            1              4               0   
6        84.756522              0            1              3               0   
7        27.445455              0            0              1               0   
8       105.036207              0            1              5               1   
9        55.364286              0            0              3               0   

   PaymentRisk  ContractScore  
0            1              0  
1            0        

In [6]:
print("\n" + "=" * 60)
print("ENCODING CATEGORICAL VARIABLES")
print("=" * 60)

# Binary encoding for Yes/No columns
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']

for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})
    print(f"✓ Encoded: {col}")

# Encode gender
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})
print("✓ Encoded: gender")

# Encode target variable
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
print("✓ Encoded: Churn (target)")


ENCODING CATEGORICAL VARIABLES
✓ Encoded: Partner
✓ Encoded: Dependents
✓ Encoded: PhoneService
✓ Encoded: PaperlessBilling
✓ Encoded: gender
✓ Encoded: Churn (target)


In [7]:
print("\n" + "=" * 60)
print("ONE-HOT ENCODING")
print("=" * 60)

# Features that need one-hot encoding
ohe_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 
            'OnlineBackup', 'DeviceProtection', 'TechSupport', 
            'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

df_encoded = pd.get_dummies(df, columns=ohe_cols, drop_first=True)

print(f"✓ One-hot encoding complete")
print(f"Shape after encoding: {df_encoded.shape}")


ONE-HOT ENCODING
✓ One-hot encoding complete
Shape after encoding: (7043, 38)


In [8]:
print("\nNew columns after one-hot encoding:")
new_cols = [col for col in df_encoded.columns if col not in df.columns]
print(new_cols[:10])  # Show first 10



New columns after one-hot encoding:
['MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes']


In [9]:
print("\n" + "=" * 60)
print("FEATURE SCALING (StandardScaler)")
print("=" * 60)

# Separate features and target
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

# Columns to scale
scale_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'ChargesPerMonth', 
              'TotalServices', 'ContractScore']

# Initialize scaler
scaler = StandardScaler()

# Scale only numerical features
X[scale_cols] = scaler.fit_transform(X[scale_cols])

print("✓ Scaling complete")
print("\nScaled features sample:")
print(X[scale_cols].head())



FEATURE SCALING (StandardScaler)
✓ Scaling complete

Scaled features sample:
     tenure  MonthlyCharges  TotalCharges  ChargesPerMonth  TotalServices  \
0 -1.277445       -1.160323     -0.992611        -1.441115      -1.052777   
1  0.066327       -0.259629     -0.172165        -0.163684       0.031958   
2 -1.236724       -0.362660     -0.958066        -0.750249       0.031958   
3  0.514251       -0.746535     -0.193672        -0.620536       0.031958   
4 -1.236724        0.197365     -0.938874        -0.276045      -1.052777   

   ContractScore  
0      -0.828207  
1       0.371271  
2      -0.828207  
3       0.371271  
4      -0.828207  


In [10]:
print("\n" + "=" * 60)
print("FEATURE IMPORTANCE (Correlation with Churn)")
print("=" * 60)

# Calculate correlation with target
correlations = X.corrwith(y).abs().sort_values(ascending=False)

print("\nTop 15 features correlated with Churn:")
print(correlations.head(15))



FEATURE IMPORTANCE (Correlation with Churn)

Top 15 features correlated with Churn:
ContractScore                           0.396713
tenure                                  0.352229
IsNewCustomer                           0.308773
InternetService_Fiber optic             0.308020
Contract_Two year                       0.302253
PaymentMethod_Electronic check          0.301919
PaymentRisk                             0.301919
StreamingTV_No internet service         0.227890
OnlineBackup_No internet service        0.227890
OnlineSecurity_No internet service      0.227890
TechSupport_No internet service         0.227890
DeviceProtection_No internet service    0.227890
StreamingMovies_No internet service     0.227890
InternetService_No                      0.227890
IsHighValue                             0.203382
dtype: float64


In [11]:
top_n = 20
top_features = correlations.head(top_n).index.tolist()

X_selected = X[top_features]

print(f"\n✓ Selected top {top_n} features")
print(f"Final feature set shape: {X_selected.shape}")



✓ Selected top 20 features
Final feature set shape: (7043, 20)


In [12]:
print("\nFinal Selected Features:")
for i, feat in enumerate(top_features, 1):
    print(f"{i}. {feat} (correlation: {correlations[feat]:.3f})")



Final Selected Features:
1. ContractScore (correlation: 0.397)
2. tenure (correlation: 0.352)
3. IsNewCustomer (correlation: 0.309)
4. InternetService_Fiber optic (correlation: 0.308)
5. Contract_Two year (correlation: 0.302)
6. PaymentMethod_Electronic check (correlation: 0.302)
7. PaymentRisk (correlation: 0.302)
8. StreamingTV_No internet service (correlation: 0.228)
9. OnlineBackup_No internet service (correlation: 0.228)
10. OnlineSecurity_No internet service (correlation: 0.228)
11. TechSupport_No internet service (correlation: 0.228)
12. DeviceProtection_No internet service (correlation: 0.228)
13. StreamingMovies_No internet service (correlation: 0.228)
14. InternetService_No (correlation: 0.228)
15. IsHighValue (correlation: 0.203)
16. TotalCharges (correlation: 0.198)
17. MonthlyCharges (correlation: 0.193)
18. PaperlessBilling (correlation: 0.192)
19. Contract_One year (correlation: 0.178)
20. OnlineSecurity_Yes (correlation: 0.171)


In [13]:
df_final = pd.concat([X_selected, y], axis=1)
df_final.to_csv('data/processed_data.csv', index=False)
print("\n✓ Processed data saved to 'data/processed_data.csv'")

# Save feature names
pd.DataFrame({'feature': top_features}).to_csv('data/selected_features.csv', index=False)
print("✓ Feature names saved to 'data/selected_features.csv'")


✓ Processed data saved to 'data/processed_data.csv'
✓ Feature names saved to 'data/selected_features.csv'


In [14]:
import joblib
joblib.dump(scaler, 'models/scaler.pkl')
print("✓ Scaler saved to 'models/scaler.pkl'")

✓ Scaler saved to 'models/scaler.pkl'


In [15]:
print("\n" + "=" * 60)
print("FEATURE ENGINEERING SUMMARY")
print("=" * 60)
print(f"Original features: {df.shape[1]}")
print(f"After encoding: {df_encoded.shape[1]}")
print(f"After selection: {X_selected.shape[1]}")
print(f"\nFinal dataset: {df_final.shape[0]} rows × {df_final.shape[1]} columns")
print("=" * 60)


FEATURE ENGINEERING SUMMARY
Original features: 27
After encoding: 38
After selection: 20

Final dataset: 7043 rows × 21 columns
