# In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Churn-risk scoring script.
#
# Reads the cleaned churn dataset, fits a random forest on every customer and
# writes one row per customer (customerID, ChurnProbability, HighRisk) to
# 'churn_probability.csv' in the current working directory.

# Step 1: Load the dataset
# NOTE(review): absolute, machine-specific path - the script only runs where
# this exact file exists.
df = pd.read_csv(r'C:\Users\vlogs\OneDrive\Desktop\Churn Prediction\Data\cleaned_churn_data.csv')

# Step 2: Keep customerID separately for the output file
customer_ids = df['customerID']

# Step 3: Encode the target as 1 (churn) / 0 (no churn)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Series.map turns every label that is not a key of the mapping into NaN.
# Fail here with a clear message instead of a confusing error inside rf.fit.
if df['Churn'].isna().any():
    raise ValueError(
        "Column 'Churn' contains missing values or labels other than 'Yes'/'No'"
    )

# Step 4: Features are every column except the identifier and the target
X = df.drop(['customerID', 'Churn'], axis=1)

# Step 5: One-hot encode categorical columns; drop_first removes one level per
# category so the dummy columns are not redundant
X = pd.get_dummies(X, drop_first=True)

# Step 6: Target vector
y = df['Churn']

# Step 7: Train the random forest on all customers.
# oob_score=True additionally records, for every training row, the averaged
# vote of only those trees whose bootstrap sample did not contain that row.
rf = RandomForestClassifier(random_state=42, oob_score=True)
rf.fit(X, y)

# Step 8: Churn probability for every customer.
# Calling rf.predict_proba(X) on the training rows would mostly echo the known
# label, because each row was seen by many of the trees. The out-of-bag
# estimate scores each customer with trees that never saw that customer, so it
# behaves like a held-out prediction while still covering every row.
# Column 1 is class '1' (churn).
# NOTE(review): a row that was in-bag for every tree gets NaN here; with the
# default number of trees this is not expected in practice.
proba = rf.oob_decision_function_[:, 1]

# Step 9: Combine IDs and probabilities into the output frame
result = pd.DataFrame({
    'customerID': customer_ids,
    'ChurnProbability': proba
})

# Customers strictly above the threshold are flagged as high risk
# (adjust threshold as needed)
threshold = 0.7
result['HighRisk'] = (result['ChurnProbability'] > threshold).astype(int)

# Step 10: Save results to CSV
result.to_csv('churn_probability.csv', index=False)

print("Done. File saved: churn_probability.csv")



# Output: Done. File saved: churn_probability.csv
