# CRISP DM

In [None]:
Bussines Understanding

- Problem
Churn adalah keadaan di mana pelanggan memutuskan untuk berhenti berlangganan atau membeli produk.
Sehingga, churn rate adalah persentase pelanggan yang berhenti berlangganan atau membeli produk. Dalam
dataset ini berisi tentang menutup atau tidaknya rekening nasabah bank.

- Tujuan
Dataset ini diambil dari Kaggle (https://www.kaggle.com/datasets/shrutimechlearn/churn-modelling)
bertujuan untuk menganalisis, memprediksi dan mengklasifikasi nasabah apakah nasabah tersebut akan
menutup rekeningnya atau tidak. Jika banyak nasabah yang menutup rekening, ini akan menjadi evaluasi
bagi pihak bank.

In [None]:
Data Understanding

Berikut atribut-atribut dari dataset ini :

RowNumber = no urut baris
CustomerId = ID nasabah
Surname = nama belakang nasabah
CreditScore = kredit skor nasabah
Geography = negara nasabah berasal
Gender = jenis kelamin nasabah
Age = usia nasabah
Tenure = jumlah tahun menjadi nasabah di bank tersebut
Balance = saldo nasabah
NumOfProducts = jumlah produk bank yang digunakan nasabah
HasCrCard = apakah nasabah memiliki kartu kredit atau tidak
IsActiveMember = apakah nasabah tersebut aktif atau pasif
EstimatedSalary = estimasi gaji
Exited = apakah nasabah menutup rekeningnya atau tidak

In [None]:
Data Preparation

Yang menjadi fitur dalam dataset ini adalah:

CustomerId
CreditScore
Gender
Age
Tenure
Balance
NumOfProducts
HasCrCard
IsActiveMember
EstimatedSalary

Sedangkan, yang menjadi target adalah Exited

# Import library

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [47]:
df = pd.read_csv("Churn_Modelling.csv")

In [48]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [51]:
# memisahkan numerik dan kategori
numerik = []
kategori = []

for col in df.columns:
    if df[col].dtype == 'int64':
        numerik.append(col)
    elif df[col].dtype == 'float64':
        numerik.append(col)
    else:
        kategori.append(col)

for col in df.columns:
    if col in numerik:
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

In [52]:
numerik

['RowNumber',
 'CustomerId',
 'CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'Exited']

In [53]:
kategori

['Surname', 'Geography', 'Gender']

In [54]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in kategori:
    df[col] = le.fit_transform(df[col])

In [55]:
df.to_csv('churn.csv')

In [56]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,1115,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,1177,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,2040,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,289,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,1822,850,2,0,43,2,125510.82,1,1,1,79084.1,0


# Split data

In [57]:
X = df.drop(columns=['RowNumber', 'Surname', 'Geography', 'Exited'], axis=1)
Y = df['Exited']

In [58]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [59]:
print(X.shape, x_train.shape, x_test.shape)

(10000, 10) (8000, 10) (2000, 10)


# Membuat model training

In [78]:
from sklearn.neighbors import KNeighborsClassifier

knn =  KNeighborsClassifier(n_neighbors=3)

In [79]:
model = knn.fit(x_train, y_train)

# Evaluasi model

In [81]:
x_train_predict = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_predict, y_train)

In [82]:
print('Akurasi data training : ', training_data_accuracy)

Akurasi data training :  0.84025


In [83]:
x_test_predict = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_predict, y_test)

In [84]:
print('Akurasi data testing : ', test_data_accuracy)

Akurasi data testing :  0.7375


# Model Prediksi

In [87]:
input_data = (15619304,502,0,42,8,159660.8,3,1,0,113931.57)
input_data_numpy = np.asarray(input_data)
data_reshaped = input_data_numpy.reshape(1, -1)
prediksi = model.predict(data_reshaped)
print(prediksi)

if (prediksi[0] == 0):
    print('Nasabah bertahan dengan rekening banknya')
else:
    print('Nasabah menutup rekening banknya')

[1]
Nasabah menutup rekening banknya




# Save Model

In [88]:
import pickle

filename = 'churn_bank.sav'
pickle.dump(model, open(filename, 'wb'))