In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split #splitting data test and train
import joblib
import yaml
import os
from tqdm import tqdm

In [2]:
params_dir = "config/Churn_.yaml"

In [3]:
def load_params(param_dir):
    with open(param_dir, 'r') as file:
        params = yaml.safe_load(file)
        
    return params

In [4]:
params = load_params(params_dir)

# 1. Data Collecting

In [5]:
# read file and marge data
def read_dataset(dataset_dir):
    dataset = pd.DataFrame()

    for i in tqdm(os.listdir(dataset_dir)):
        dataset = pd.concat([pd.read_excel(dataset_dir + i), dataset])
    
    return dataset

In [6]:
# read dataset
df = read_dataset(params["dataset_dir"])

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.09s/it]


In [7]:
# drop columns

df = df.drop(['customer_id','products_number'], axis=1)

In [8]:
df

Unnamed: 0,credit_score,country,gender,age,tenure,balance,credit_card,active_member,estimated_salary,churn
0,619,France,Female,42,2,0.00,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,1,0,113931.57,1
3,699,France,Female,39,1,0.00,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,1,0,92888.52,1


# 2. Data Definition

# 3. Data Validation

In [9]:
# check missing nulls
df.isnull().sum()

credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [10]:
# check duplicate
df.duplicated().sum()

0

In [11]:
# drop zeroes balance
df = df[df['balance'] != 0]

In [12]:
# check dtypes
df.dtypes

credit_score          int64
country              object
gender               object
age                   int64
tenure                int64
balance             float64
credit_card           int64
active_member         int64
estimated_salary    float64
churn                 int64
dtype: object

In [13]:
# Menentukan batasan bin
bins = [15, 30, 60, 100]

# Menentukan nilai angka untuk masing-masing bin
labels = ['young', 'mature', 'old']

# Membuat kolom baru untuk menampung hasil binning
df['Age'] = pd.cut(df['age'], bins, labels=labels).astype('object')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = pd.cut(df['age'], bins, labels=labels).astype('object')


In [14]:
# check missing nulls after binning
df.Age.isnull().sum()

0

In [15]:
# drop columns after binning

df = df.drop('age', axis=1)

In [16]:
df.dtypes

credit_score          int64
country              object
gender               object
tenure                int64
balance             float64
credit_card           int64
active_member         int64
estimated_salary    float64
churn                 int64
Age                  object
dtype: object

In [17]:
# make function separate input, output
def SeparateOutputInput(data,
                       output_column_name):
    """
    The Function for separate data input and output
    input data for target/output and the drop or separate
    make new variable for input and output data
    """
    
    output_data = data[output_column_name]
    input_data = data.drop(output_column_name,
                           axis = 1)
    
    return input_data, output_data

In [18]:
# call function
X, y = SeparateOutputInput(data = df,
                          output_column_name = "churn")

# 4. Data Defense

In [19]:
def check_data(input_data, params):
    # check data types
    assert input_data.select_dtypes("object").columns.to_list() == params["object_columns"], "an error occurs in object column(s)."
    assert input_data.select_dtypes("int64").columns.to_list() == params["int_columns"], "an error occurs in int64 column(s)."
    assert input_data.select_dtypes("float64").columns.to_list() == params["float_columns"], "an error occurs in float64 column(s)."
    
    # check range of data
    assert input_data.balance.between(params["range_balance"][0], params["range_balance"][1]).sum() == len(input_data), "an error occurs in balance range."
    assert input_data.credit_score.between(params["range_credit_score"][0], params["range_credit_score"][1]).sum() == len(input_data), "an error occurs in credit_score range."
    assert input_data.tenure.between(params["range_tenure"][0], params["range_tenure"][1]).sum() == len(input_data), "an error occurs in tenure range."
    assert input_data.estimated_salary.between(params["range_estimated_salary"][0], params["range_estimated_salary"][1]).sum() == len(input_data), "an error occurs estimated_salary range."

In [20]:
# check data
check_data(df, params)

# 5. Data Spliting

In [21]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 123, stratify=y)
# print shape
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5106, 9)
(5106,)
(1277, 9)
(1277,)


In [22]:
# save data
joblib.dump(X_train,"D:/BOOTCAMP/project/Project Pribadi/ml churn/X_train.csv")
joblib.dump(y_train,"D:/BOOTCAMP/project/Project Pribadi/ml churn/y_train.csv")
joblib.dump(X_test,"D:/BOOTCAMP/project/Project Pribadi/ml churn/X_test.csv")
joblib.dump(y_test,"D:/BOOTCAMP/project/Project Pribadi/ml churn/y_test.csv")
joblib.dump(df,"D:/BOOTCAMP/project/Project Pribadi/ml churn/data.csv")

['D:/BOOTCAMP/project/Project Pribadi/ml churn/data.csv']