# Data loading and preprocessing

# Handling missing values & data cleaning

# Descriptive statistics and summary

# Visualization (univariate, bivariate, multivariate)

# Insights on churn behavior

In [1]:
# Load the libraries needed for the EDA project

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Let's load the data into the df variable

In [4]:
# Read the Telco customer-churn CSV into the working DataFrame `df`.
df = pd.read_csv(
    filepath_or_buffer="/content/sample_data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
)

In [6]:
# Preview the first five rows (five is the default row count for head()).
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
# As we can see, the dataset has many columns, so not all of them are shown.
# So let's check the names of all the columns.

In [8]:
# Show every column label, since the df.head() preview elides the middle columns with "...".
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [10]:
# Let's check the shape of the data

In [11]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(7043, 21)

In [12]:
# The dataset contains 7043 rows (customers) with 21 features.
# Each row represents a unique customer with various attributes
# related to demographics, services, and billing.
# The dataset size is sufficient for meaningful analysis and prediction.
# With 21 features, we have a good mix of categorical and numerical variables
# to explore customer behavior and churn patterns.

In [13]:
# Let's check the data type of each feature

In [35]:
  # List the dtype pandas assigned to each column.
  # NOTE(review): this cell carries a stray two-space indent; IPython appears to
  # tolerate it, but it would be an IndentationError in a plain script — consider removing.
  df.dtypes

Unnamed: 0,0
customerID,object
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object


In [20]:
# Most of the columns are categorical (object type).
# SeniorCitizen is stored as int64, but since it is a binary variable
# (0 and 1), it should be treated as categorical.
# TotalCharges is stored as an object, which seems incorrect
# since it represents numerical values.
# It may contain missing values or be formatted incorrectly.
# We need to convert it to float for proper analysis.

In [23]:
# Convert TotalCharges to a numeric dtype; entries that cannot be parsed
# become NaN (errors="coerce") instead of raising.
df["TotalCharges"] = df["TotalCharges"].pipe(pd.to_numeric, errors="coerce")

In [24]:
# check the conversion

In [34]:
# Confirm the dtype now recorded for the TotalCharges column.
df.dtypes["TotalCharges"]

dtype('float64')

In [26]:
# Check for missing values

In [28]:
# Only one column, TotalCharges, has missing values:
# 11 missing values.
# Since TotalCharges is numerical, these missing values were
# likely blank spaces in the original dataset before conversion.
# Possible ways to handle the missing values:
# 1 - Fill them with the median or mean of TotalCharges.
# 2 - Fill them with zero, but TotalCharges should not be zero.
# 3 - Drop these 11 rows, but then 11 of the 7043 customers are
#     also dropped, so we may lose data.
# So it is better to fill the missing values.
# Fill them with the median, because it is more robust if the data contains outliers.

In [29]:
# Median of TotalCharges; NaN entries are skipped (skipna=True is the default).
df["TotalCharges"].median(skipna=True)

1397.475

In [30]:
# Mean of TotalCharges, for comparison with the median; NaN entries are
# skipped (skipna=True is the default).
df["TotalCharges"].mean(skipna=True)

np.float64(2283.3004408418656)

In [33]:
# Let's check for duplicate records