# *IRIS.BARUBELAJAR | KAGGLE COMPETITION | CUSTOMER CHURN*

## IMPORT LIBRARY

In [128]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
import optuna

## IMPORT DATASET

In [129]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [130]:
df_train.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15687492,596,jabar,Male,32.0,3.0,150268487.1,2.0,0.0,0.0,64931610.0,0
1,15736963,623,Jawa Timur,Male,43.0,1.0,0.0,2.0,1.0,1.0,227447100.0,0
2,15721730,601,Jawa Tengah,Female,44.0,4.0,0.0,2.0,1.0,0.0,90993730.0,0
3,15762134,506,Jawa Barat,Male,59.0,8.0,185140916.0,2.0,1.0,1.0,265205600.0,0
4,15648898,560,Jawa Tengah,Female,27.0,,194221253.6,1.0,1.0,1.0,178176200.0,0


In [131]:
df_test.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
0,15782993,624,Jawa Timur,Male,51,10,191743610.0,2,1,1,198617400.0
1,15640442,717,Jawa Timur,Male,31,4,201565523.7,1,0,0,63981020.0
2,15606003,566,Jawa Timur,Female,21,3,0.0,2,1,1,5634882.0
3,15672374,672,Jawa Timur,Male,52,8,264163135.8,1,0,0,87646980.0
4,15630725,649,Jawa Timur,Female,45,5,144173768.0,1,1,0,269379400.0


In [132]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       7000 non-null   int64  
 1   credit_score      7000 non-null   int64  
 2   country           6845 non-null   object 
 3   gender            7000 non-null   object 
 4   age               6759 non-null   float64
 5   tenure            6712 non-null   float64
 6   balance           6883 non-null   float64
 7   products_number   6852 non-null   float64
 8   credit_card       6841 non-null   float64
 9   active_member     6863 non-null   float64
 10  estimated_salary  6855 non-null   float64
 11  churn             7000 non-null   int64  
dtypes: float64(7), int64(3), object(2)
memory usage: 656.4+ KB


In [133]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       3000 non-null   int64  
 1   credit_score      3000 non-null   int64  
 2   country           3000 non-null   object 
 3   gender            3000 non-null   object 
 4   age               3000 non-null   int64  
 5   tenure            3000 non-null   int64  
 6   balance           3000 non-null   float64
 7   products_number   3000 non-null   int64  
 8   credit_card       3000 non-null   int64  
 9   active_member     3000 non-null   int64  
 10  estimated_salary  3000 non-null   float64
dtypes: float64(2), int64(7), object(2)
memory usage: 257.9+ KB


In [134]:
df_train['churn'].value_counts()


0    5590
1    1410
Name: churn, dtype: int64

In [135]:
df_train['country'].value_counts()

Jawa Timur     2758
Jawa Barat     1414
Jawa Tengah    1336
jatim           337
jawatimur       329
jawatengah      175
jawabarat       174
jabar           161
jateng          161
Name: country, dtype: int64

## DATA PREPROCESSING

### DROP FEATURES CUSTOMER_ID

In [136]:
df_train = df_train.drop(['customer_id'], axis=1)

## CEK TIPE DATA

In [137]:
df_train.dtypes

credit_score          int64
country              object
gender               object
age                 float64
tenure              float64
balance             float64
products_number     float64
credit_card         float64
active_member       float64
estimated_salary    float64
churn                 int64
dtype: object

### MISSING VALUE

#### country


In [138]:
print(df_train['country'].unique())

['jabar' 'Jawa Timur' 'Jawa Tengah' 'Jawa Barat' 'jateng' 'jawabarat'
 'jawatimur' 'jatim' 'jawatengah' nan]


In [139]:
# Memastikan seluruh data tertulis dalam kapital
df_train['country'] = df_train['country'].str.capitalize()
df_train["country"]

0             Jabar
1        Jawa timur
2       Jawa tengah
3        Jawa barat
4       Jawa tengah
           ...     
6995     Jawa timur
6996      Jawabarat
6997      Jawatimur
6998     Jawa timur
6999    Jawa tengah
Name: country, Length: 7000, dtype: object

In [140]:
print(df_train['country'].unique())

['Jabar' 'Jawa timur' 'Jawa tengah' 'Jawa barat' 'Jateng' 'Jawabarat'
 'Jawatimur' 'Jatim' 'Jawatengah' nan]


In [141]:


# Daftar pemetaan untuk penggantian
mapping = {'Jabar': 'Jawa Barat', 'Jatim': 'Jawa Timur', 'Jateng': 'Jawa Tengah', 'Jawa barat': 'Jawa Barat', 'Jawa timur':'Jawa Timur', 'Jawa tengah': 'Jawa Tengah'}

# Melakukan penggantian nilai
df_train['country'] = df_train['country'].replace(mapping)

# Tampilkan hasil
print(df_train['country'].unique())


['Jawa Barat' 'Jawa Timur' 'Jawa Tengah' 'Jawabarat' 'Jawatimur'
 'Jawatengah' nan]


## gender

In [142]:
print(df_train['gender'].unique())

['Male' 'Female']


In [143]:
print(df_train.isna().sum())

credit_score          0
country             155
gender                0
age                 241
tenure              288
balance             117
products_number     148
credit_card         159
active_member       137
estimated_salary    145
churn                 0
dtype: int64


### MEMISAHKAN DATA NUMERIK DAN KATEGORIK

In [144]:
# Memisahkan data numerik dan kategorik
numeric_features = ['credit_score', 'age', 'tenure', 'balance', 'products_number', 'estimated_salary']
binary_features = ['credit_card', 'active_member']
categorical_features = ['country', 'gender']
df_train.head()

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,596,Jawa Barat,Male,32.0,3.0,150268487.1,2.0,0.0,0.0,64931610.0,0
1,623,Jawa Timur,Male,43.0,1.0,0.0,2.0,1.0,1.0,227447100.0,0
2,601,Jawa Tengah,Female,44.0,4.0,0.0,2.0,1.0,0.0,90993730.0,0
3,506,Jawa Barat,Male,59.0,8.0,185140916.0,2.0,1.0,1.0,265205600.0,0
4,560,Jawa Tengah,Female,27.0,,194221253.6,1.0,1.0,1.0,178176200.0,0


### HANDLING MISSING VALUE

In [152]:
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=0)),
    ('scaler', StandardScaler())
])

binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('bin', binary_transformer, binary_features),
        ('cat', categorical_transformer, categorical_features)
    ])


AttributeError: 'numpy.ndarray' object has no attribute 'isna'

## HANDLING TYPE DATA

In [147]:
df.dtypes

NameError: name 'df' is not defined

In [None]:
col_int = ['age', 'tenure', 'balance', 'products_number', 'credit_card', 'active_member', 'estimated_salary']
df[col_int] = df[col_int].astype('int64')

In [None]:
df.dtypes

In [None]:
unique_values = np.unique(df['churn'])
print(unique_values)

## EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Misalkan df adalah DataFrame Anda

# Memilih hanya kolom-kolom numerik
numeric_cols = df.select_dtypes(include='number').columns

# Set ukuran gambar dan jumlah kolom pada subplot
num_cols = len(numeric_cols)
fig, axes = plt.subplots(nrows=1, ncols=num_cols, figsize=(16, 4))

# Iterasi melalui setiap kolom numerik dan membuat KDE plot
for i, col in enumerate(numeric_cols):
    # KDE plot
    sns.kdeplot(df[col], ax=axes[i])
    axes[i].set_title(f'KDE Plot - {col}')

# Menampilkan gambar KDE plot
plt.show()

# Set ukuran gambar dan jumlah kolom pada subplot
fig, axes = plt.subplots(nrows=1, ncols=num_cols, figsize=(16, 4))

# Iterasi melalui setiap kolom numerik dan membuat boxplot
for i, col in enumerate(numeric_cols):
    # Boxplot
    sns.boxplot(x=df[col], ax=axes[i])
    axes[i].set_title(f'Boxplot - {col}')

# Menampilkan gambar boxplot
plt.show()


In [None]:
import pandas as pd

# Fungsi untuk menangani outlier menggunakan IQR
def handle_outliers_iqr(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Mengganti nilai outlier dengan batas atas dan batas bawah
    column = column.apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))
    
    return column

# Menangani outlier hanya pada fitur, bukan target
for col in df.select_dtypes(include='number').columns:
    if col != 'churn':
        df[col] = handle_outliers_iqr(df[col])

# Menampilkan DataFrame setelah menangani outlier
print(df)


In [None]:
import pandas as pd

# Misal df adalah DataFrame hasil dari data preprocessing
num_rows, num_columns = df.shape

print("Jumlah baris setelah data preprocessing:", num_rows)


In [None]:
unique_values = np.unique(df['churn'])
print(unique_values)

In [None]:
df['churn']

## Label encoding

In [None]:


from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df["credit_score"] = le.fit_transform(df["credit_score"])
df["country"] = le.fit_transform(df["country"])
df["gender"] = le.fit_transform(df["gender"])
df["age"] = le.fit_transform(df["age"])
df["tenure"] = le.fit_transform(df["tenure"])
df["balance"] = le.fit_transform(df["balance"])
df["products_number"] = le.fit_transform(df["products_number"])
df["credit_card"] = le.fit_transform(df["credit_card"])
df["active_member"] = le.fit_transform(df["active_member"])
df["estimated_salary"] = le.fit_transform(df["estimated_salary"])
df.head()

In [None]:
import pandas as pd

# Misal df adalah DataFrame hasil dari data preprocessing
num_rows, num_columns = df.shape

print("Jumlah baris setelah data preprocessing:", num_rows)


In [None]:
print(test.columns)


In [None]:
from sklearn.model_selection import train_test_split
import optuna
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Memisahkan variabel independen dan dependen
X = df.drop(columns=['churn'])
y = df['churn']

test_size = 3000 / 7000  # Proporsi data pengujian yang diinginkan


# Membagi data menjadi train dan tes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= test_size, random_state=42)

# Oversampling dengan SMOTE hanya pada data training
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

# Fungsi objektif untuk keseluruhan pembelajaran model
def objective(trial):
    try:
        # Hyperparameter terbaik akan ditemukan dari sini
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 5, 15),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 10),
            'max_features': trial.suggest_float('max_features', 0.5, 1.0),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
        }

        # Melatih model dengan data oversampled
        rf_clf = RandomForestClassifier(**param, random_state=42)
        rf_clf.fit(X_train, y_train)

        # Melakukan Prediksi pada data uji yang sebenarnya tidak perlu di-oversample
        preds = rf_clf.predict(X_test)
        
        # Menghitung Akurasi
        accuracy = accuracy_score(y_test, preds)
        return 1.0 - accuracy  # Minimalkan 1 - akurasi karena Optuna mencari nilai minimum

    except Exception as e:
        print(e)
        return np.inf  # Mengembalikan nilai yang besar untuk menunjukkan kegagalan

# Memastikan objek pembelajaran oleh Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Parameter Terbaik
best_params = study.best_params
print('Best parameters:', best_params)

# Melatih model dengan parameter terbaik
best_rf_clf = RandomForestClassifier(**best_params, random_state=42)

# Melakukan oversampling pada data latih sebelum melatih model akhir
X_train, y_train = smote.fit_resample(X_train, y_train)
best_rf_clf.fit(X_train, y_train)

# Melakukan prediksi pada data uji
y_pred = best_rf_clf.predict(X_test)

# Evaluasi model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


In [None]:

# Melakukan cross-validation dengan F1 score sebagai metrik
f1_scorer = make_scorer(y_pred)
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring=f1_scorer)

# Menampilkan hasil cross-validation
print("Cross-Validation F1 Scores:", cross_val_scores)
print("Mean F1 Score:", np.mean(cross_val_scores))


In [None]:
import pandas as pd

# Misal X_test dan y_test adalah Pandas DataFrame
num_rows_X_test = X_test.shape[0]
num_rows_y_test = y_test.shape[0]

print("Jumlah baris X_test:", num_rows_X_test)
print("Jumlah baris y_test:", num_rows_y_test)


In [None]:
# from sklearn.ensemble import RandomForestClassifier

# rf_clf = RandomForestClassifier(random_state=42)
# rf_clf.fit(X_train_over, y_train_over)


In [None]:
# y_pred = rf_clf.predict(X_test)


In [None]:
# input_data_shape = y_pred.shape
# print("Jumlah baris yang akan diuji:", input_data_shape[0])


In [None]:
# from sklearn.metrics import accuracy_score, classification_report

# accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)

# print(f'Accuracy: {accuracy}')
# print(f'Classification Report:\n{report}')


In [None]:
# from sklearn.svm import SVC

# # Inisialisasi model SVM
# svm_clf = SVC(random_state=42)

# # Melatih model
# svm_clf.fit(X_train_over, y_train_over)

# # Membuat prediksi
# y_pred_svm = svm_clf.predict(X_test)

# # Evaluasi model
# accuracy_svm = accuracy_score(y_test, y_pred_svm)
# report_svm = classification_report(y_test, y_pred_svm)

# print(f'SVM Accuracy: {accuracy_svm}')
# print(f'SVM Classification Report:\n{report_svm}')


In [None]:
# # Contoh menggunakan XGBoost
# from xgboost import XGBClassifier

# # Inisialisasi model XGBoost
# xgb_clf = XGBClassifier(random_state=42)

# # Melatih model
# xgb_clf.fit(X_train_over, y_train_over)

# # Membuat prediksi
# y_pred_xgb = xgb_clf.predict(X_test)

# # Evaluasi model
# accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
# report_xgb = classification_report(y_test, y_pred_xgb)

# print(f'XGBoost Accuracy: {accuracy_xgb}')
# print(f'XGBoost Classification Report:\n{report_xgb}')


In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report

# # Inisialisasi model RandomForest dengan penanganan ketidakseimbangan kelas
# rf_clf = RandomForestClassifier(class_weight='balanced', random_state=42)

# # Melatih model pada data yang sudah dioversampling
# rf_clf.fit(X_train_over, y_train_over)

# # Membuat prediksi
# y_pred_rf = rf_clf.predict(X_test)

# # Evaluasi model
# accuracy_rf = accuracy_score(y_test, y_pred_rf)
# report_rf = classification_report(y_test, y_pred_rf)

# print(f'RandomForest Accuracy: {accuracy_rf}')
# print(f'RandomForest Classification Report:\n{report_rf}')


In [None]:
# import optuna
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split

# # Memisahkan variabel independen dan dependen
# X = df.drop(columns=['churn'])
# y = df['churn']

# # Membagi data menjadi train dan tes
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Fungsi objektif untuk keseluruhan pembelajaran model
# def objective(trial):
#     try:
#         # Hyperparameter terbaik akan ditemukan dari sini
#         param = {
#             'n_estimators': trial.suggest_int('n_estimators', 50, 200),
#             'max_depth': trial.suggest_int('max_depth', 5, 15),
#             'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
#             'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 10),
#             'max_features': trial.suggest_float('max_features', 0.5, 1.0),
#             'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
#         }

#         # Melatih model
#         rf_clf = RandomForestClassifier(**param, random_state=42)
#         rf_clf.fit(X_train, y_train)

#         # Melakukan Prediksi
#         preds = rf_clf.predict(X_test)
        
#         # Menghitung Akurasi
#         accuracy = accuracy_score(y_test, preds)
#         return 1.0 - accuracy  # Minimalkan 1 - akurasi karena Optuna mencari nilai minimum

#     except Exception as e:
#         print(e)
#         return np.inf  # Mengembalikan nilai yang besar untuk menunjukkan kegagalan

# # Memastikan objek pembelajaran oleh Optuna
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Parameter Terbaik
# best_params = study.best_params
# print('Best parameters:', best_params)

# # Melatih model dengan parameter terbaik
# best_rf_clf = RandomForestClassifier(**best_params, random_state=42)
# best_rf_clf.fit(X_train, y_train)

# # Melakukan prediksi pada data uji
# y_pred = best_rf_clf.predict(X_test)

# # Evaluasi model
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy}')


In [None]:
# # Membuat prediksi dengan model yang dioptimalkan
# y_pred_optimized = randomized_search.predict(X_test)

# # Evaluasi model
# accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
# report_optimized = classification_report(y_test, y_pred_optimized)

# print(f'Optimized RandomForest Accuracy: {accuracy_optimized}')
# print(f'Optimized RandomForest Classification Report:\n{report_optimized}')


In [None]:

submission['churn'] = y_pred
submission.churn.value_counts()

submission.to_csv("C:/IRIS COMPETITION/predict12.csv", index=False)

In [None]:
print(submission.index)
