In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to all figures created later in the notebook
sns.set(style="whitegrid")

In [4]:
# Load the Telco customer-churn dataset from the working directory
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")


In [5]:
# A notebook cell only auto-displays its final expression, so the shape and the
# column list are printed explicitly; otherwise they are silently discarded.

# Dataset size as (rows, columns)
print(df.shape)
# List of column names
print(df.columns.tolist())
# Dtypes and non-null counts (info() prints its report itself)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
# Show every column instead of letting pandas elide the middle ones
pd.set_option('display.max_columns', None)
# Preview the first rows (the earlier bare `df.head()` was removed: as a
# non-final expression it was evaluated and thrown away without being shown)
print(df.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No        

In [7]:
# Show the dtype before conversion (printed explicitly; a bare expression in
# the middle of a cell is not displayed)
print(df['TotalCharges'].dtype)

# Convert TotalCharges to numeric (float).
# Values that cannot be parsed (e.g. blank strings) become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Count the unparseable values BEFORE dropping them, so the notebook records
# how many rows are being removed (counting after the drop always yields 0)
n_unparseable = df['TotalCharges'].isnull().sum()
print(f"Rows with non-numeric TotalCharges to drop: {n_unparseable}")

# Remove the rows whose TotalCharges is missing
df.dropna(subset=['TotalCharges'], inplace=True)

# Sanity check: no missing TotalCharges may remain after the drop
df['TotalCharges'].isnull().sum()

0

In [None]:
# Summary statistics for the numeric columns. display() is required here:
# only the last expression of a cell is rendered automatically, so a bare
# df.describe() on this line would be computed and discarded.
display(df.describe())
# Summary statistics for the categorical (object) columns
df.describe(include='object')

In [None]:
# Customer counts per contract type, split by churn status
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Contract', hue='Churn', ax=ax)
ax.set_title('Distribusi Churn Berdasarkan Jenis Kontrak')
ax.set_xlabel('Jenis Kontrak')
ax.set_ylabel('Jumlah Pelanggan')
ax.legend(title='Churn')
plt.show()

In [None]:
# Customer counts per internet-service type, split by churn status
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='InternetService', hue='Churn', ax=ax)
ax.set_title('Churn Berdasarkan Jenis Layanan Internet')
ax.set_xlabel('Internet Service')
ax.set_ylabel('Jumlah Pelanggan')
ax.legend(title='Churn')
plt.show()

# Raw counts behind the chart (rows: internet service, columns: churn)
pd.crosstab(index=df['InternetService'], columns=df['Churn'])

In [None]:
# Churn share within each internet-service type, as row-wise percentages
pd.crosstab(
    index=df['InternetService'],
    columns=df['Churn'],
    normalize='index',
).mul(100)

In [None]:
# Group tenure into 12-month intervals.
# Intervals are closed on the right: (0, 12], (12, 24], ..., (60, 72].
# include_lowest=True also closes the first interval on the left, so a
# tenure of exactly 0 lands in '0-12' instead of silently becoming NaN
# (and then vanishing from any crosstab built on TenureGroup).
bins = [0, 12, 24, 36, 48, 60, 72]
labels = ['0-12', '13-24', '25-36', '37-48', '49-60', '61-72']
df['TenureGroup'] = pd.cut(
    df['tenure'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

In [None]:
# Row-wise churn percentages per tenure group, drawn as a stacked bar chart
tenure_churn = pd.crosstab(
    index=df['TenureGroup'],
    columns=df['Churn'],
    normalize='index',
).mul(100)
ax = tenure_churn.plot.bar(stacked=True, colormap='Set2')
ax.set_title('Persentase Churn Berdasarkan Kelompok Tenure')
ax.set_ylabel('Persentase (%)')
ax.set_xlabel('Kelompok Tenure (bulan)')
ax.legend(title='Churn')
plt.show()

In [None]:
# Bucket MonthlyCharges into four bands. With right=False each band is
# closed on the left and open on the right: [0, 35), [35, 70), [70, 105), [105, 120).
bins = [0, 35, 70, 105, 120]
labels = [
    'Low (0–35)',
    'Medium (35–70)',
    'High (70–105)',
    'Very High (105–120)',
]
df['ChargeGroup'] = pd.cut(
    x=df['MonthlyCharges'],
    bins=bins,
    labels=labels,
    right=False,
)

In [None]:
# Churn share within each monthly-charge band, as row-wise percentages
pd.crosstab(
    index=df['ChargeGroup'],
    columns=df['Churn'],
    normalize='index',
).mul(100)

In [None]:
# Row-wise churn percentages per monthly-charge band, drawn as a stacked bar chart
charge_churn = pd.crosstab(
    index=df['ChargeGroup'],
    columns=df['Churn'],
    normalize='index',
).mul(100)
ax = charge_churn.plot.bar(stacked=True, colormap='coolwarm')
ax.set_title('Persentase Churn Berdasarkan Kelompok Biaya Bulanan')
ax.set_xlabel('Kelompok Biaya Bulanan ($)')
ax.set_ylabel('Persentase (%)')
ax.legend(title='Churn')
# Tilt the band labels so the long names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Persist the cleaned frame (including the derived group columns) without
# writing the row index as an extra column
df.to_csv(path_or_buf="cleaned_telco_churn.csv", index=False)