In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('customer_churn_large_dataset.csv')

# Initial data exploration.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
data.info()
print(data.head())

# Handle missing data: forward-fill each gap from the previous row.
# DataFrame.ffill() is the supported replacement for the deprecated
# fillna(method='ffill'); rebinding `data` avoids the in-place mutation.
data = data.ffill()

# Handle outliers (optional)
# You can use techniques like Z-score or IQR to identify and remove outliers

# Encode categorical variables.
# Each column gets its own fitted LabelEncoder, kept in `label_encoders`, so
# the integer codes can later be mapped back with inverse_transform(). Re-using
# a single encoder for both columns would discard the Gender mapping as soon as
# Location is fitted. After the loop `label_encoder` is the Location encoder,
# which is the same end state the single shared encoder used to have.
label_encoders = {}
for column in ['Gender', 'Location']:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# Split the data into training and testing sets.
# CustomerID and Name are dropped from the feature matrix; Churn is the target.
X = data.drop(columns=['CustomerID', 'Name', 'Churn'])
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  int64  
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  int64  
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  int64  
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  int64  
 8   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 6.9+ MB
None
   CustomerID        Name  Age  Gender     Location  \
0           1  Customer_1   63    Male  Los Angeles   
1           2  Customer_2   62  Female     New York   
2           3  Custom

In [3]:
from sklearn.preprocessing import StandardScaler

def add_bill_per_month(features):
    """Return a new frame with a ``Bill_Per_Month`` ratio column appended.

    ``Bill_Per_Month`` is ``Monthly_Bill / Subscription_Length_Months``.
    ``DataFrame.assign`` builds a new DataFrame, so the frame passed in is not
    modified and no column is written into a slice of another frame.

    Parameters:
        features: DataFrame containing ``Monthly_Bill`` and
            ``Subscription_Length_Months`` columns.

    Returns:
        A new DataFrame with the extra ``Bill_Per_Month`` column as its last
        column.

    Raises:
        ValueError: if any ``Subscription_Length_Months`` value is 0. The ratio
            would then be infinite or undefined, and the scaling step below
            would fail later with a much less specific error.
    """
    months = features['Subscription_Length_Months']
    if (months == 0).any():
        raise ValueError(
            "Subscription_Length_Months contains 0; "
            "Bill_Per_Month would be infinite or undefined"
        )
    return features.assign(Bill_Per_Month=features['Monthly_Bill'] / months)


# Example feature engineering: ratio of Monthly_Bill to Subscription_Length_Months,
# computed identically for the training and the test split.
X_train = add_bill_per_month(X_train)
X_test = add_bill_per_month(X_test)

# Apply feature scaling (if necessary)
scaler = StandardScaler()

# Scale numerical features (Age, Monthly_Bill, Total_Usage_GB, Bill_Per_Month).
# The scaler is fitted on the training split only and then re-used for the test
# split, so no test-set statistics leak into the transformation.
# NOTE(review): X_train / X_test are overwritten with scaled values, so running
# this cell a second time without re-running the split cell would derive
# Bill_Per_Month from an already-scaled Monthly_Bill.
numerical_features = ['Age', 'Monthly_Bill', 'Total_Usage_GB', 'Bill_Per_Month']
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Baseline model: a random forest with a fixed seed, fitted on the training
# split (fit() returns the estimator itself, so the call can be chained).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Score the predictions with each metric, in the order they are reported
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report one "<label>: <value>" line per metric
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value}")


Accuracy: 0.4999
Precision: 0.4957282987026685
Recall: 0.47374256627356115
F1 Score: 0.48448613544995367


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid with reduced options
param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize the Random Forest Classifier.
# The estimator gets its own seed: RandomizedSearchCV's random_state only fixes
# which parameter combinations are sampled, not the randomness inside each
# forest, so an unseeded estimator would give different results on every run.
rf_classifier = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    rf_classifier, param_distributions=param_dist,
    n_iter=10,  # Number of random parameter combinations to try
    cv=3,  # Cross-validation folds
    n_jobs=-1,  # Use all available CPU cores for parallel processing
    verbose=2,  # Increase verbosity for progress updates
    random_state=42  # Seed for sampling the parameter combinations
)

# Fit the randomized search to your data
random_search.fit(X_train, y_train)

# Get the best model and parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Evaluate the best model on the test set.
# best_score_ is the mean cross-validated score of the winning candidate;
# score() reports the estimator's default metric on the held-out test split.
y_pred = best_model.predict(X_test)
print(f"Best parameters: {best_params}")
print(f"Best CV score: {random_search.best_score_}")
print(f"Test score: {best_model.score(X_test, y_test)}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [10]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [14]:
import joblib

# Persist the fitted `model` to disk with joblib.
# The dump call stays the last expression of the cell so that its return value
# (the list of written file names) is still shown as the cell output.
model_path = 'churn_model.pkl'
joblib.dump(model, model_path)



['churn_model.pkl']