In [22]:
import pandas as pd 
data = pd.read_csv("telecom_data.csv")
print(data.head(10))

   Age  Gender  PlanType  MonthlyUsage Churn
0   21  Female   Regular            15    No
1   45  Female   Economy            41    No
2   44  Female   Economy            40    No
3   31  Female   Regular            23   Yes
4   33  Female   Regular            12    No
5   42  Female   Regular            52    No
6   20  Female     Ultra            57   Yes
7   26    Male     Ultra            23    No
8   37  Female  Advanced            31    No
9   26    Male   Economy            23    No


In [23]:
print('Columns',data.columns.to_list())

print('\nDataset info: \n',data.info())

print("\nDataset Completeness: \n",data.isnull().sum())

print("\n\nDataset Consistency: \n",data.dtypes)

print("\nDescribe dataset: \n", data.describe())


Columns ['Age', 'Gender', 'PlanType', 'MonthlyUsage', 'Churn']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Age           150 non-null    int64 
 1   Gender        150 non-null    object
 2   PlanType      150 non-null    object
 3   MonthlyUsage  150 non-null    int64 
 4   Churn         150 non-null    object
dtypes: int64(2), object(3)
memory usage: 6.0+ KB

Dataset info: 
 None

Dataset Completeness: 
 Age             0
Gender          0
PlanType        0
MonthlyUsage    0
Churn           0
dtype: int64


Dataset Consistency: 
 Age              int64
Gender          object
PlanType        object
MonthlyUsage     int64
Churn           object
dtype: object

Describe dataset: 
               Age  MonthlyUsage
count  150.000000    150.000000
mean    35.193333     33.693333
std     10.841566     15.923031
min     19.000000      3.000000
25% 

In [24]:
print("\n Dataset Bias\n")
print(data['Gender'].value_counts(normalize=True))
print(data['Churn'].value_counts(normalize=True))
#print(data['Age'].value_counts(normalize=True))
print(data['PlanType'].value_counts(normalize=True))



 Dataset Bias

Gender
Female    0.793333
Male      0.206667
Name: proportion, dtype: float64
Churn
No     0.893333
Yes    0.106667
Name: proportion, dtype: float64
PlanType
Advanced    0.273333
Ultra       0.220000
Regular     0.213333
Standard    0.160000
Economy     0.133333
Name: proportion, dtype: float64


| Field | Details |
|------|--------|
| **Dataset Name** | `telecom_churn_data.csv` |
| **Description** | Contains customer demographic and usage details from a telecom company used to predict churn behavior. The dataset is used to demonstrate **data quality checks**, **feature encoding**, and **model evaluation** using **Logistic Regression**. |
| **Features** | Age, Gender, PlanType, MonthlyUsage |
| **Target Variable** | Churn (Yes = churned, No = retained) |
| **Number of Rows** | 150 |
| **Null Handling** | No missing values detected across any feature (`isnull().sum()` confirms all zeroes) |
| **Feature Types** | - **Age**: Numeric (int)<br>- **Gender**: Categorical (object, Nominal)<br>- **PlanType**: Categorical (object, Nominal)<br>- **MonthlyUsage**: Numeric (int)<br>- **Churn**: Categorical (binary target) |


In [25]:
fdata = data.drop(columns=['Churn'])
x = fdata
y = data["Churn"].map({'Yes':1, 'No':0})

categorical = fdata.select_dtypes(include='object').columns
numerical= fdata.select_dtypes(exclude='object').columns

print("Cat: \n", list(categorical))
print("Num: \n", list(numerical))

Cat: 
 ['Gender', 'PlanType']
Num: 
 ['Age', 'MonthlyUsage']


In [26]:
#Feature encoding

from sklearn.preprocessing import OneHotEncoder, StandardScaler

x_encoded_gd = pd.get_dummies(
    x, columns=categorical,
    drop_first=True
)
print("gd Encoded Columnsa: ", x_encoded_gd.columns.tolist())
#print(x_encoded_gd['Gender_Male'])
print(x_encoded_gd['PlanType_Economy'])
print(x_encoded_gd)

gd Encoded Columnsa:  ['Age', 'MonthlyUsage', 'Gender_Male', 'PlanType_Economy', 'PlanType_Regular', 'PlanType_Standard', 'PlanType_Ultra']
0      False
1       True
2       True
3      False
4      False
       ...  
145    False
146    False
147    False
148     True
149    False
Name: PlanType_Economy, Length: 150, dtype: bool
     Age  MonthlyUsage  ...  PlanType_Standard  PlanType_Ultra
0     21            15  ...              False           False
1     45            41  ...              False           False
2     44            40  ...              False           False
3     31            23  ...              False           False
4     33            12  ...              False           False
..   ...           ...  ...                ...             ...
145   19             3  ...              False           False
146   42            52  ...              False           False
147   54             7  ...              False            True
148   20            57  ...           

In [27]:
#one hot encoding

ohe = OneHotEncoder(
    drop='first', sparse_output=False,
)

encoded_array = ohe.fit_transform(x[categorical])
#print(encoded_array)

#covert back to DF.
encoded_df = pd.DataFrame(
    encoded_array, columns = ohe.get_feature_names_out(categorical)
)
print(encoded_df)

x_encoded_ohe = pd.concat(
    [x[numerical].reset_index(drop=True),encoded_df.reset_index(drop=True)],
    axis=1
)

     Gender_Male  PlanType_Economy  ...  PlanType_Standard  PlanType_Ultra
0            0.0               0.0  ...                0.0             0.0
1            0.0               1.0  ...                0.0             0.0
2            0.0               1.0  ...                0.0             0.0
3            0.0               0.0  ...                0.0             0.0
4            0.0               0.0  ...                0.0             0.0
..           ...               ...  ...                ...             ...
145          0.0               0.0  ...                0.0             0.0
146          0.0               0.0  ...                0.0             0.0
147          0.0               0.0  ...                0.0             1.0
148          0.0               1.0  ...                0.0             0.0
149          0.0               0.0  ...                1.0             0.0

[150 rows x 5 columns]


In [28]:
scalar = StandardScaler()
x_scaled_gd = x_encoded_gd.copy()
x_scaled_gd[numerical]= scalar.fit_transform(x_scaled_gd[numerical])
print(x_scaled_gd)

x_scaled_ohe = x_encoded_ohe.copy()
x_scaled_ohe[numerical] = scalar.fit_transform(x_scaled_ohe[numerical])
print(x_scaled_ohe)

df = pd.DataFrame({
    'Age_ohe': x_scaled_ohe[numerical]['Age'],
    'Age_gd': x_scaled_gd[numerical]['Age'],
    'MonthlyUsage_ohe': x_scaled_ohe[numerical]['MonthlyUsage'],
    'MonthlyUsage_gd': x_scaled_gd[numerical]['MonthlyUsage'],
    
})

print(df.head(10))

          Age  MonthlyUsage  ...  PlanType_Standard  PlanType_Ultra
0   -1.313545     -1.177914  ...              False           False
1    0.907574      0.460411  ...              False           False
2    0.815027      0.397399  ...              False           False
3   -0.388079     -0.673814  ...              False           False
4   -0.202986     -1.366951  ...              False           False
..        ...           ...  ...                ...             ...
145 -1.498638     -1.934064  ...              False           False
146  0.629934      1.153549  ...              False           False
147  1.740493     -1.682014  ...              False            True
148 -1.406091      1.468612  ...              False           False
149 -0.388079     -0.673814  ...               True           False

[150 rows x 7 columns]
          Age  MonthlyUsage  ...  PlanType_Standard  PlanType_Ultra
0   -1.313545     -1.177914  ...                0.0             0.0
1    0.907574      0.460

In [33]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_scaled_ohe,
    y,
    test_size = 0.20,
    random_state = 42,
    stratify = y
)

pd.DataFrame({
    "Train": y_train.value_counts(normalize=True),
    "Test": y_test.value_counts(normalize=True)
})

Unnamed: 0_level_0,Train,Test
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.891667,0.9
1,0.108333,0.1
