In [35]:
# Predict if a customer will leave ("churn") based on their personal and account data.
import pandas as pd
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [36]:
# Load the Telco customer-churn CSV (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv("telco-customer-churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [37]:
# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [38]:
# Dataset dimensions as (rows, columns).
df.shape

(7043, 21)

In [39]:
# Count missing values per column.
# NOTE: object-dtype columns can still hold unparseable strings that are not
# counted as null here; they only surface once the column is coerced to numeric.
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [40]:
# Inspect 'TotalCharges': the values look numeric but the column is object dtype.
df['TotalCharges']

0         29.85
1        1889.5
2        108.15
3       1840.75
4        151.65
         ...   
7038     1990.5
7039     7362.9
7040     346.45
7041      306.6
7042     6844.5
Name: TotalCharges, Length: 7043, dtype: object

In [41]:
# Convert 'TotalCharges' from object dtype to numeric. errors='coerce' turns any
# value that cannot be parsed as a number into NaN instead of raising, so the
# resulting NaNs must be handled explicitly afterwards.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [42]:
# Confirm the column is now a floating-point dtype.
df['TotalCharges']

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: TotalCharges, Length: 7043, dtype: float64

In [43]:
# Number of 'TotalCharges' entries that failed to parse and became NaN.
df['TotalCharges'].isnull().sum()

np.int64(11)

In [44]:
# Impute NaN entries of 'TotalCharges' with the column median.
# Nothing is touched when the column has no missing values.
total_charges = df['TotalCharges']
if total_charges.isnull().any():
    median_total_charges = total_charges.median()
    df['TotalCharges'] = total_charges.fillna(median_total_charges)

In [45]:
# Drop the 'customerID' identifier column if it exists; it is not used as a
# model feature.
# NOTE: DataFrame.drop(..., inplace=True) returns None, so its result must never
# be assigned (the previous `data = df.drop(..., inplace=True)` left `data` as
# None). The non-inplace form is used and `df` is rebound to the returned frame.
# The membership guard keeps this cell safe to re-run.
if 'customerID' in df.columns:
    df = df.drop(columns='customerID')
else:
    print("Column 'customerID' not found")

In [46]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [47]:
# Names of the categorical feature columns that will be one-hot encoded.
categorical_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

In [48]:
# Warn about any expected categorical column that is absent from the frame
# before attempting to encode.
missing_categorical = [name for name in categorical_cols if name not in df.columns]
for col in missing_categorical:
    print(f"Warning: {col} not found in data!")

In [49]:
# Dimensions after removing 'customerID'.
df.shape

(7043, 20)

In [50]:
# Encode categorical variables
# One-hot encode the categorical columns into indicator columns.
# drop_first=True removes the first level of each feature, giving k-1 dummies
# for k levels so the indicators are not perfectly collinear.
data_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [51]:
# Dimensions after one-hot encoding.
data_encoded.shape

(7043, 31)

In [52]:
# Convert the target 'Churn' from text labels to integers (No -> 0, Yes -> 1).
# Fail fast if the target column is not present.
# NOTE: .map() yields NaN for values missing from the mapping, so running this
# cell a second time on already-encoded data would blank out the target.
if 'Churn' not in data_encoded.columns:
    raise ValueError("Target column 'Churn' is missing from the dataset!")

churn_mapping = {'No': 0, 'Yes': 1}
data_encoded['Churn'] = data_encoded['Churn'].map(churn_mapping)

In [53]:
# Display the encoded frame (pandas truncates long/wide frames in the output).
data_encoded

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.50,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.30,1840.75,0,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.70,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,0,True,True,True,True,False,...,False,True,False,True,True,False,True,False,False,True
7039,0,72,103.20,7362.90,0,False,True,True,True,False,...,False,True,False,True,True,False,True,True,False,False
7040,0,11,29.60,346.45,0,False,True,True,False,True,...,False,False,False,False,False,False,True,False,True,False
7041,1,4,74.40,306.60,1,True,True,False,True,False,...,False,False,False,False,False,False,True,False,False,True


In [54]:
# List the column names produced by the encoding step.
data_encoded.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [55]:
# Split the encoded frame into the feature matrix (everything except 'Churn')
# and the target vector ('Churn').
y = data_encoded['Churn']
X = data_encoded.drop(columns='Churn')

In [56]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
# Scale numerical features
# StandardScaler standardises each column by removing the mean and scaling to
# unit variance.
scaler = StandardScaler()
# Continuous columns to standardise; the remaining features are left as-is.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [58]:
# Validate that every expected numerical column is present BEFORE scaling, so a
# missing column cannot leave the data partially transformed.
missing_numerical = [col for col in numerical_cols if col not in X_train.columns]
if missing_numerical:
    raise ValueError(f"Expected numerical column {missing_numerical[0]} not found!")

# Work on explicit copies so the column assignments below never write into a
# view of the original feature frame.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler exactly once, on the training data only, then apply the same
# fitted parameters to the test data.
# Previously these two statements sat inside the per-column loop, so the scaler
# was re-fitted on already-standardised data (ending up with mean ~0 and
# scale ~1) and the test set was transformed repeatedly. The scaler that was
# later pickled was therefore effectively a no-op on raw inputs.
# NOTE: this cell is not idempotent; re-run the train/test split cell before
# re-running it, otherwise already-scaled data is scaled again.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [59]:
# Train the model
# max_iter=1000 raises the solver's iteration cap above scikit-learn's default
# of 100.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [64]:
# Inspect the training features after scaling.
X_train

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
2142,0,-0.465683,-0.000474,-0.421345,False,False,True,True,False,False,...,False,False,False,True,True,False,False,False,False,True
1623,0,0.885537,1.074754,1.255888,False,False,False,True,False,True,...,False,True,False,True,False,True,True,False,False,False
6074,0,-1.284605,-1.376499,-1.002151,True,True,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False
1362,0,-1.161766,0.177346,-0.907292,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False
6754,0,-1.325551,-0.098524,-0.394513,True,False,True,True,False,True,...,False,False,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0,-1.284605,1.001632,-0.970509,True,True,False,True,False,False,...,False,True,False,True,False,False,True,False,True,False
5191,0,-0.383791,0.872006,-0.040362,False,True,True,True,False,True,...,False,True,False,True,False,True,True,True,False,False
5226,0,-0.834198,-1.452945,-0.877176,True,True,True,True,False,False,...,True,False,True,False,False,False,True,False,True,False
5390,1,-0.834198,1.149538,-0.481776,True,False,False,True,False,True,...,False,True,False,True,False,False,True,False,True,False


In [65]:
# Inspect the test features after scaling.
X_test

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
185,0,-1.284605,-1.331629,-1.001554,False,True,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False
2715,0,0.353238,-1.316672,-0.571859,True,False,False,True,False,True,...,True,False,True,False,False,False,True,False,False,False
3825,0,0.803645,-1.512772,-0.556270,False,True,True,True,False,False,...,True,False,True,False,False,True,False,False,False,True
1807,0,-1.284605,0.381756,-0.978757,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
132,0,1.417836,-0.475768,0.429201,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6366,0,1.294997,0.114195,0.923926,False,True,False,True,False,False,...,False,False,False,True,False,True,True,False,False,True
315,0,0.762699,1.501854,1.502189,True,True,True,True,False,True,...,False,True,False,True,True,False,False,True,False,False
2439,0,-0.629468,-1.494492,-0.866695,True,True,True,True,False,False,...,True,False,True,False,True,False,False,False,False,False
5002,0,1.499728,-0.695134,0.296531,False,True,True,False,True,False,...,False,False,False,True,False,True,True,True,False,False


In [66]:
# Inspect the test-set target labels (0 = No, 1 = Yes).
y_test

185     1
2715    0
3825    0
1807    1
132     0
       ..
6366    0
315     0
2439    0
5002    0
1161    1
Name: Churn, Length: 1409, dtype: int64

In [67]:
# Inspect the training-set target labels (0 = No, 1 = Yes).
y_train

2142    0
1623    0
6074    1
1362    1
6754    0
       ..
3772    1
5191    0
5226    0
5390    1
860     0
Name: Churn, Length: 5634, dtype: int64

In [60]:
# Predict churn labels (0 = No, 1 = Yes) for the held-out test set
y_pred = model.predict(X_test)

In [61]:
# Calculate and print accuracy: the fraction of test rows whose predicted label
# matches the true label, reported as a percentage with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 82.19%


In [62]:
# Persist the fitted model to the working directory with pickle.
model_filename = 'model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

print(f"Trained model saved as '{model_filename}'")

Trained model saved as 'model.pkl'


In [63]:
# Persist the fitted scaler next to the model so the same preprocessing can be
# re-applied when the model is used later.
scaler_filename = 'scaler.pkl'
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print(f"Trained scaler saved as '{scaler_filename}'")

Trained scaler saved as 'scaler.pkl'
