In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [24]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

### feature explanations
id - Unique ID. We can use this features as a count for other parameters.

CustomerID - Customer ID is not a unique ID, there are few customer IDs which have more than 80 repetitive occurrences.

Surname - This is the surname of the customer, there are too many repititions. Actually there are only around 2700 unique surnames.

Credit Score - Your creditworthiness is rated by a three-digit figure called a credit score. 300 to 850 is the range of FICO scores. You have a better chance of getting approved for loans and better prices the higher your score. Now in our dataset there are range of values starting from 350 and going up to 850. Now this can be a very useful information while thinking about the churning.

Geography - There are 3 unique values - France, Spain, and Germany. One has to use Label Encoder or OneHotEncoder to encode these values.

Gender - There are only 2 unique values - Male and Female. Here a label binarizer is enough for the encoding purposes.

Age - Depicts the age of the customers. There are all possible values starting from 18 up to 92. There are 2 anomalies found in the age column - there 2 values in float - 32.44 and 36.44. It would be better if we can round those values to 32 and 36 respectively.

Tenure - It might show from how many years the customer has been related to the bank or may be vice versa. There are values ranging from 0 to 10. Most probably these values are in years.

Balance - This is the bank balance of the customer. There were many doubts in the discussion forum that the bank balance was 0. When I performed the analysis, I found that actually 89000+ people had 0 bank balance. While the maximum amount recorded was around 250,000.

Number of Products - Now this can be a very difficult question. While there are only 4 unique values possible - 1, 2, 3, and 4. This can be attributes to how many major/big products the customer owns. Or other explanation might be that how many products the customer has bought on loan.

Has Credit Card - Clear cut, whether the customer has a credit card or not. Same goes for the next column as well Is Active Member. **Joke - I have read in one of the famous investor's article - Credit card is actually for those people who do not need it. **

Estimated Salary - What is the estimated salary of the individual. Now, this is a very important aspect of the real life scenario. Whenever you are given a credit from the bank, they mostly ask for whether or not you are salaried. If you are estimated of getting a higher salary, easier for them to credit you a higher amount of loan.



In [10]:
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [20]:
def age_tr(df) : 
    df['Age_Category'] = pd.cut(df['Age'], bins=[18, 30, 40, 50, 60, 100], labels=['18-30', '30-40', '40-50', '50-60', '60+'],include_lowest = True)
    return df

df = age_tr(df)
test_df = age_tr(test_df)


In [25]:
from sklearn.preprocessing import LabelEncoder

# Drop the specified columns from test_df and df
columns_to_drop = ["id", "CustomerId", "Surname"]
test_df.drop(columns_to_drop, axis=1, inplace=True)
df.drop(columns_to_drop, axis=1, inplace=True)


In [26]:
# Fit the LabelEncoder on the "Geography" column
geo_encoder = LabelEncoder()
geo_encoder.fit(pd.concat([df["Geography"], test_df["Geography"]]))

# Transform the "Geography" column in the training and test data
df["Geography"] = geo_encoder.transform(df["Geography"])
test_df["Geography"] = geo_encoder.transform(test_df["Geography"])

# Fit the LabelEncoder on the "Gender" column
gender_encoder = LabelEncoder()
gender_encoder.fit(pd.concat([df["Gender"], test_df["Gender"]]))

# Transform the "Gender" column in the training and test data
df["Gender"] = gender_encoder.transform(df["Gender"])
test_df["Gender"] = gender_encoder.transform(test_df["Gender"])

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# Split the data into features and target variable
X = df.drop('Exited', axis=1)  # Replace 'Exited' with the actual name of your target variable column
y = df['Exited']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the decision tree model
model = DecisionTreeClassifier()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate accuracy and ROC score
accuracy = accuracy_score(y_test, y_pred)
roc_score = roc_auc_score(y_test, y_pred)

accuracy, roc_score


(0.7953161450601388, 0.700858383879258)