<a href="https://www.kaggle.com/code/aktaruzzaman21/ride-sharing-customer-churn-prediction-using-ml?scriptVersionId=290705786" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Project Description

Customer churn is a major challenge for ride-sharing platforms. This project builds machine learning and neural network models to predict whether a customer will stop using a ride-sharing service based on historical usage and demographic features.

Objective:
Predict customer churn using ride-sharing customer data.

Models Used:

Logistic Regression

Random Forest Classifier

Artificial Neural Network (ANN)

# Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Load Dataset

Use a ride-sharing churn dataset containing customer usage information.

Example features (typical columns):

signup_date

last_trip_date

trips_in_first_30_days

avg_dist

avg_rating_by_driver

avg_rating_of_driver

surge_pct

weekday_pct

city

phone

churn

In [None]:
# Location of the raw churn CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/ride-sharing/churnnotclean.csv"

df = pd.read_csv(DATA_PATH)

# Last expression of the cell: shows a preview of the first rows.
df.head()

# Exploratory Data Analysis

In [None]:
# Column dtypes and non-null counts; info() prints its report directly.
df.info()

# Summary statistics of the numeric columns. Being the cell's last
# expression, this table is what the notebook renders.
summary_stats = df.describe()
summary_stats

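Date Handling: convert the date columns to datetimes, derive the number of days between signup and the last trip (stored as `days_since_signup`), then drop the raw date columns.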

In [None]:

# Parse the two date columns so they can be subtracted from each other.
df["signup_date"] = pd.to_datetime(df["signup_date"])
df["last_trip_date"] = pd.to_datetime(df["last_trip_date"])

# Whole days between signup and the most recent trip.
# NOTE(review): if the churn label is itself derived from last_trip_date,
# this feature leaks the target into the model inputs -- confirm how
# "churn" was defined before trusting the reported accuracies.
days_active = (df["last_trip_date"] - df["signup_date"]).dt.days

# Impute gaps with the median and assign the result back. Calling
# fillna(inplace=True) on df[...] is a chained in-place operation: recent
# pandas versions warn about it, and under Copy-on-Write it does not
# update df at all.
df["days_since_signup"] = days_active.fillna(days_active.median())

# The raw datetime columns cannot be fed to the scaler/models directly.
df.drop(columns=["signup_date", "last_trip_date"], inplace=True)


Check missing values:

In [None]:
# Number of missing entries in each column.
df.isna().sum()

# Data Cleaning

Fill missing values:

In [None]:
# Replace missing ratings with the column mean.
# The filled Series is assigned back instead of using
# df[col].fillna(..., inplace=True): that chained in-place form triggers a
# warning in recent pandas versions and stops modifying df once
# Copy-on-Write is active.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputed values.
for rating_col in ("avg_rating_by_driver", "avg_rating_of_driver"):
    df[rating_col] = df[rating_col].fillna(df[rating_col].mean())

# Encoding Categorical Variables

In [None]:
# Categorical columns that are converted to integer codes.
label_cols = ["city", "phone"]

# A fresh encoder is fitted per column, so each column gets its own
# independent value -> integer mapping.
for column in label_cols:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])


# Feature Selection

In [None]:
# Target: the churn column. Features: every remaining column.
y = df["churn"]
X = df.drop(columns=["churn"])

# Train-Test Split

In [None]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
partitions = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = partitions


# Feature Scaling

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression Model

In [None]:
# Fit the logistic-regression model on the scaled training data.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and report accuracy plus per-class metrics.
y_pred_lr = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)

print("Logistic Regression Accuracy:", lr_accuracy)
print(lr_report)


Confusion Matrix:

In [None]:
# Confusion matrix of the logistic-regression predictions, drawn as an
# annotated heatmap with integer cell labels.
lr_cm = confusion_matrix(y_test, y_pred_lr)
lr_ax = sns.heatmap(lr_cm, annot=True, fmt="d")
lr_ax.set_title("Logistic Regression Confusion Matrix")
plt.show()

# Random Forest Classifier

In [None]:
# Random forest of 100 trees; class_weight="balanced" reweights the classes
# and the fixed seed keeps the fitted forest repeatable.
rf_params = {
    "n_estimators": 100,
    "random_state": 42,
    "class_weight": "balanced",
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Predict the held-out rows and report accuracy plus per-class metrics.
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("Random Forest Accuracy:", rf_accuracy)
print(rf_report)


Confusion Matrix:

In [None]:
# Confusion matrix of the random-forest predictions, drawn as an annotated
# heatmap with integer cell labels.
rf_cm = confusion_matrix(y_test, y_pred_rf)
rf_ax = sns.heatmap(rf_cm, annot=True, fmt="d")
rf_ax.set_title("Random Forest Confusion Matrix")
plt.show()

# Artificial Neural Network (ANN)

In [None]:
# Feed-forward binary classifier: 64 -> dropout(0.3) -> 32 -> 1 (sigmoid).
# NOTE(review): newer Keras releases may warn about passing input_dim to a
# layer and prefer an explicit Input layer -- confirm against the installed
# version before changing it.
n_features = X_train.shape[1]
ann_model = Sequential([
    Dense(64, activation="relu", input_dim=n_features),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid"),
])

ann_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train for 50 epochs in mini-batches of 32; 20% of the training data is
# held out by Keras for per-epoch validation.
fit_options = {
    "epochs": 50,
    "batch_size": 32,
    "validation_split": 0.2,
    "verbose": 1,
}
history = ann_model.fit(X_train, y_train, **fit_options)


Prediction and evaluation:

In [None]:
# The network ends in a single sigmoid unit, so predict() yields one
# probability per row as an (n_samples, 1) column. Threshold at 0.5 to get
# class labels and flatten to 1-D so y_pred_ann has the same shape as the
# other models' predictions (y_pred_lr, y_pred_rf) and as y_test.
ann_probabilities = ann_model.predict(X_test)
y_pred_ann = (ann_probabilities > 0.5).astype(int).ravel()

print("ANN Accuracy:", accuracy_score(y_test, y_pred_ann))
print(classification_report(y_test, y_pred_ann))


# Model Performance Comparison

In [None]:
# Test-set predictions of each model, keyed by display name (insertion
# order determines the row order of the table).
predictions_by_model = {
    "Logistic Regression": y_pred_lr,
    "Random Forest": y_pred_rf,
    "ANN": y_pred_ann,
}

# One row per model with its accuracy on the held-out test set.
results = pd.DataFrame({
    "Model": list(predictions_by_model),
    "Accuracy": [
        accuracy_score(y_test, predicted)
        for predicted in predictions_by_model.values()
    ],
})

results
