In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [4]:
# Read the Telco customer-churn CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
churn_csv_path = '../data/Telco_customer_churn.csv'
df = pd.read_csv(churn_csv_path)

In [7]:
# Convert column names to snake case
def convert_to_snake_case(name):
    """Normalize a column label to lowercase words joined by underscores.

    Leading and trailing whitespace is dropped and every run of whitespace
    (spaces, tabs, repeated spaces) becomes a single underscore, so a header
    such as "  Zip   Code " yields "zip_code" rather than "__zip___code_".
    Word boundaries inside a token are not detected: "CustomerID" becomes
    "customerid".

    Args:
        name: Column label; non-string labels are converted with ``str()``.

    Returns:
        The normalized label as a ``str``.
    """
    # split() with no separator splits on any whitespace run and discards
    # empty leading/trailing pieces, which absorbs stray padding in headers.
    return "_".join(str(name).split()).lower()

# Replace every column label of df with its converted form, keeping column order
df.columns = list(map(convert_to_snake_case, df.columns))

In [8]:
# Gather the summaries first, then report them in the same order as before
column_names = df.columns.to_list()
missing_per_column = df.isna().sum()  # counts NaN/None only

# Column labels after renaming
print("New Column Names:")
print(column_names)

# Size, per-column missing counts and dtypes
print("\nDataset Shape:", df.shape)
print("\nMissing Values:\n", missing_per_column)
print("\nData Types:\n", df.dtypes)

New Column Names:
['customerid', 'count', 'country', 'state', 'city', 'zip_code', 'lat_long', 'latitude', 'longitude', 'gender', 'senior_citizen', 'partner', 'dependents', 'tenure_months', 'phone_service', 'multiple_lines', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'contract', 'paperless_billing', 'payment_method', 'monthly_charges', 'total_charges', 'churn_label', 'churn_value', 'churn_score', 'cltv', 'churn_reason']

Dataset Shape: (7043, 33)

Missing Values:
 customerid              0
count                   0
country                 0
state                   0
city                    0
zip_code                0
lat_long                0
latitude                0
longitude               0
gender                  0
senior_citizen          0
partner                 0
dependents              0
tenure_months           0
phone_service           0
multiple_lines          0
internet_service        0
onli

In [9]:
# The dataset carries several churn-related columns; summarize each of them.
print("\nChurn Distribution:")

# Relative frequency of each class for the two label-style columns
for section_title, churn_col in (
    ("Churn Label:\n", 'churn_label'),
    ("\nChurn Value:\n", 'churn_value'),
):
    print(section_title, df[churn_col].value_counts(normalize=True))

# Summary statistics for the churn score column
print("\nChurn Score Distribution:")
churn_score_summary = df['churn_score'].describe()
print(churn_score_summary)


Churn Distribution:
Churn Label:
 churn_label
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64

Churn Value:
 churn_value
0    0.73463
1    0.26537
Name: proportion, dtype: float64

Churn Score Distribution:
count    7043.000000
mean       58.699418
std        21.525131
min         5.000000
25%        40.000000
50%        61.000000
75%        75.000000
max       100.000000
Name: churn_score, dtype: float64


In [10]:
# Split the columns by storage dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
# NOTE(review): the recorded output lists total_charges (and senior_citizen)
# among the object columns, so they land in categorical_cols here —
# presumably total_charges needs numeric conversion before modelling; confirm.
object_view = df.select_dtypes(include=['object'])
numeric_view = df.select_dtypes(include=['int64', 'float64'])

categorical_cols = object_view.columns
numerical_cols = numeric_view.columns

print("\nCategorical Columns:", categorical_cols.tolist())
print("\nNumerical Columns:", numerical_cols.tolist())


Categorical Columns: ['customerid', 'country', 'state', 'city', 'lat_long', 'gender', 'senior_citizen', 'partner', 'dependents', 'phone_service', 'multiple_lines', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'contract', 'paperless_billing', 'payment_method', 'total_charges', 'churn_label', 'churn_reason']

Numerical Columns: ['count', 'zip_code', 'latitude', 'longitude', 'tenure_months', 'monthly_charges', 'churn_value', 'churn_score', 'cltv']
