# Import Data

In [11]:
import pandas as pd

train = pd.read_csv('train.csv').set_index('customer_id')
test = pd.read_csv('test.csv').set_index('customer_id')
submission = pd.read_csv('sample_submission.csv')

# Data Understanding

In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 15782993 to 15706268
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   credit_score      3000 non-null   int64  
 1   country           3000 non-null   object 
 2   gender            3000 non-null   object 
 3   age               3000 non-null   int64  
 4   tenure            3000 non-null   int64  
 5   balance           3000 non-null   float64
 6   products_number   3000 non-null   int64  
 7   credit_card       3000 non-null   int64  
 8   active_member     3000 non-null   int64  
 9   estimated_salary  3000 non-null   float64
dtypes: float64(2), int64(6), object(2)
memory usage: 257.8+ KB


In [13]:
train.isna().sum()

credit_score          0
country             155
gender                0
age                 241
tenure              288
balance             117
products_number     148
credit_card         159
active_member       137
estimated_salary    145
churn                 0
dtype: int64

In [14]:
test.isna().sum()

credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
dtype: int64

# Data Preprocessing

### Handling Missing Values

In [15]:
train['country'].fillna(train['country'].mode()[0], inplace=True)
numerical_columns = ['age', 'tenure', 'balance', 'products_number', 'estimated_salary']
train[numerical_columns] = train[numerical_columns].apply(lambda x: x.fillna(x.median()), axis=0)
train['credit_card'].fillna(train['credit_card'].mode()[0], inplace=True)
train['active_member'].fillna(train['active_member'].mode()[0], inplace=True)

### Handling Type Data

In [16]:
col_int = ['age','tenure','products_number','credit_card','active_member']
train[col_int] = train[col_int].astype('int64')

### Feature Encoding

In [17]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in train.columns:
    if train[col].dtype == 'object':
        train[col] = le.fit_transform(train[col])
        test[col] = le.transform(test[col])

### Handling Imbalance Data

In [18]:
X_train = train.drop(['churn'],axis=True)
y_train = train.churn.values

In [19]:
from imblearn.over_sampling import SMOTENC

categorical_features = ['country', 'gender']
over = SMOTENC(random_state=42,categorical_features=categorical_features)
X_train_over, y_train_over = over.fit_resample(X_train, y_train)

### Feature Scaling

In [20]:
from sklearn.preprocessing import PowerTransformer

numerical_columns = ['age', 'tenure', 'balance', 'products_number', 'estimated_salary']

power_transformer = PowerTransformer(method='yeo-johnson')

X_train_over[numerical_columns] = power_transformer.fit_transform(X_train_over[numerical_columns])
test[numerical_columns] = power_transformer.fit_transform(test[numerical_columns])


# Modeling

### AutoML

In [None]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

X_train_lazy, X_val, y_train_lazy, y_val= train_test_split(X_train_over, y_train_over,test_size=.2,random_state =123)

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train_lazy, X_val, y_train_lazy, y_val)

print(models)

### XGB

In [77]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(random_state=42, verbosity=0)

xgb_clf.fit(X_train_over, y_train_over)

y_pred_xgb = xgb_clf.predict(test)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [78]:
submission['churn'] = y_pred_xgb

submission.churn.value_counts()

# submission.to_csv('predict1.csv', index=False)

churn
0    2073
1     927
Name: count, dtype: int64