# Waze User Churn Analysis Project

In [None]:
# Import necessary libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from imblearn.over_sampling import SMOTE
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, cross_val_score


In [None]:
# Load the dataset
df=pd.read_csv("waze_dataset - waze_dataset.csv",encoding='unicode_escape')
df

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [63]:
df.duplicated().sum()

0

In [None]:
# Fill missing values in categorical columns with mode
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
df[categorical_columns] = df[categorical_columns].apply(lambda x: x.fillna(x.mode()[0]))


In [None]:
# Remove outliers using IQR method
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

numerical_columns = df.select_dtypes(include=[np.number]).columns
for column in numerical_columns:
    df = remove_outliers(df, column)


In [None]:
# Encode categorical variables
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])
df['device'] = le.fit_transform(df['device'])


In [None]:
# Normalize numerical columns
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

print(df.head())

In [None]:
# Analyze churn rate
churn_rate = df['label'].value_counts(normalize=True)
plt.figure(figsize=(8, 6))
churn_rate.plot(kind='bar')
plt.title('Churn Rate')
plt.ylabel('Percentage')
plt.show()

In [None]:
# Box plots for numerical features vs label
plt.figure(figsize=(15, 10))
sns.boxplot(x='label', y='total_sessions', data=df)
plt.title('Total Sessions by Label')
plt.show()

In [None]:
# Histograms for numerical features
plt.figure(figsize=(15, 10))
df[numerical_columns].hist(figsize=(15, 10))
plt.tight_layout()
plt.show()


In [None]:
# Correlation heatmap
plt.figure(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Correlation with target variable
correlation_with_label = df.corr()['label'].sort_values(ascending=False)
print("Correlation with label:\n", correlation_with_label)


In [93]:
# Feature importance using Random Forest
X = df.drop(['ID', 'label'], axis=1)
y = df['label']


In [97]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature Importance:\n", feature_importance)




Feature Importance:
                     feature  importance
3   n_days_after_onboarding    0.127967
7   duration_minutes_drives    0.113190
2            total_sessions    0.110245
6          driven_km_drives    0.109571
8             activity_days    0.104128
4    total_navigations_fav1    0.091381
9              driving_days    0.089445
1                    drives    0.087179
0                  sessions    0.086307
5    total_navigations_fav2    0.065378
10                   device    0.015210


In [99]:
# Select top features
selected_features = feature_importance['feature'][:5].tolist() + ['label']
X_selected = df[selected_features]
print("\nSelected Features:", selected_features)


Selected Features: ['n_days_after_onboarding', 'duration_minutes_drives', 'total_sessions', 'driven_km_drives', 'activity_days', 'label']


In [101]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_selected.drop('label', axis=1), df['label'], test_size=0.2, random_state=42)


In [105]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [107]:
# Train the model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)


In [111]:
# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.31      0.33      0.32       349
           1       0.88      0.87      0.87      1898

    accuracy                           0.79      2247
   macro avg       0.60      0.60      0.60      2247
weighted avg       0.79      0.79      0.79      2247



In [None]:
# Cross-validation
cv_scores = cross_val_score(rf_model, X_train_resampled, y_train_resampled, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean():.2f}")


In [None]:
# Hyperparameter tuning with RandomizedSearchCV
param_dist = {
    'n_estimators': np.arange(50, 200, 10),
    'max_depth': [None] + list(np.arange(5, 30, 5)),
    'min_samples_split': np.arange(2, 11)
}

random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train_resampled, y_train_resampled)

print("Best parameters found:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)


In [None]:
# Final model evaluation
final_model = random_search.best_estimator_
final_y_pred = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_y_pred)
final_r2 = r2_score(y_test, final_y_pred)


In [None]:
print(f"Final Model Mean Squared Error: {final_mse}")
print(f"Final Model R-squared: {final_r2}")
print("\nFinal Classification Report:")
print(classification_report(y_test, final_y_pred))