In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve

In [2]:
# Load the data
train_data = pd.read_csv('training.csv')
test_data = pd.read_csv('validation.csv')


In [3]:
# Handle missing values for numeric columns
numeric_cols = train_data.select_dtypes(include=[np.number]).columns
train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].mean())
test_data[numeric_cols] = test_data[numeric_cols].fillna(test_data[numeric_cols].mean())

In [4]:
# Encode categorical variables
label_encoders = {}
for column in train_data.select_dtypes(include=['object']).columns:
    if column != 'CHURNED':
        print(f"Encode column: {column}")
        try:
            le = LabelEncoder()
            train_data[column] = le.fit_transform(train_data[column].astype(str))
            # Save the encoder
            label_encoders[column] = le
        except Exception as e:
            print(f"Error in encode column {column}: {e}")

Encode column: CUSTOMER_ID
Encode column: COLLEGE
Encode column: LESSTHAN600k
Encode column: REPORTED_SATISFACTION
Encode column: REPORTED_USAGE_LEVEL
Encode column: CONSIDERING_CHANGE_OF_PLAN


In [5]:
# Transform the test set
for column, le in label_encoders.items():
    if column in test_data.columns:
        print(f"Transform column: {column}")
        try:
            # Create a mapping for unseen labels
            test_data[column] = test_data[column].apply(lambda x: '<unknown>' if x not in le.classes_ else x)
            # Add '<unknown>' class to the encoder
            le.classes_ = np.append(le.classes_, '<unknown>')
            test_data[column] = le.transform(test_data[column].astype(str))
        except Exception as e:
            print(f"Error in transformation column {column}: {e}")

Transform column: CUSTOMER_ID
Transform column: COLLEGE
Transform column: LESSTHAN600k
Transform column: REPORTED_SATISFACTION
Transform column: REPORTED_USAGE_LEVEL
Transform column: CONSIDERING_CHANGE_OF_PLAN


In [6]:

# Drop CUSTOMER_ID before training and testing
X_train = train_data.drop(columns=['CHURNED', 'CUSTOMER_ID'])
X_test = test_data.drop(columns=['CUSTOMER_ID'])
y_train = train_data['CHURNED']

In [7]:

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [8]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)


In [9]:
# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [10]:
# Validate the model
y_pred_proba = model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_pred_proba)
print(f"ROC AUC Score: {roc_auc}")

ROC AUC Score: 0.8320454251637789


In [11]:

# Predict churn probabilities on the test set
test_pred_proba = model.predict_proba(X_test)[:, 1]

In [12]:
# Create a DataFrame with customer IDs and their churn probabilities
test_results = pd.DataFrame({
    'CUSTOMER_ID': test_data['CUSTOMER_ID'],
    'CHURN_PROBABILITY': test_pred_proba
})

In [13]:
# Define threshold for labeling churn (example: 0.5)
threshold = 0.5
test_results['CHURN_LABEL'] = test_results['CHURN_PROBABILITY'].apply(lambda x: 'LEAVE' if x >= threshold else 'STAY')

In [14]:
# Placeholder for base profit, replace with actual data if available
# Assuming 'Base_Profit' column in test_data represents the profit from each customer
test_data['Base_Profit'] = np.random.uniform(50, 200, size=test_data.shape[0])  # Example values, replace accordingly

In [15]:
# Calculate the optimal discount for each customer
def calculate_optimal_discount(churn_probability, base_profit, contact_cost=10):
    # Placeholder logic for discount calculation
    discount = min(churn_probability * 50, base_profit / 2)  # Example: max discount is 50% of base profit
    return discount

test_results['DISCOUNT'] = test_results.apply(
    lambda row: calculate_optimal_discount(row['CHURN_PROBABILITY'], test_data.loc[test_data['CUSTOMER_ID'] == row['CUSTOMER_ID'], 'Base_Profit'].values[0]),
    axis=1
)

In [16]:
# Define the strategy for contacting customers
# Example: contact top 20% customers with the highest churn probability
top_percent = 0.2
num_customers_to_contact = int(len(test_results) * top_percent)
test_results = test_results.sort_values(by='CHURN_PROBABILITY', ascending=False)
test_results['CLIENT_TO_CONTACT'] = ['YES' if i < num_customers_to_contact else 'NO' for i in range(len(test_results))]


In [17]:
# Save the results to a CSV file
test_results.to_csv('test.csv', index=False)

print("Results saved to test_results.csv")

Results saved to test_results.csv
