# Step 1: Load the Dataset

In [1]:
import pandas as pd

df = pd.read_pickle("churn_prediction_v3.pkl")

In [2]:
df.head()

Unnamed: 0,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,2401,66,Male,0,self_employed,2002,2,755,1458.71,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0.0
1,2648,35,Male,0,self_employed,100000,2,3214,5390.37,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0.0
2,2494,31,Male,0,salaried,146,2,41,3913.16,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0.0
3,2629,90,Male,1,self_employed,1020,2,582,2291.91,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1.0
4,1879,42,Male,2,self_employed,1494,3,388,927.72,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15929 entries, 0 to 15928
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   vintage                         15929 non-null  int64   
 1   age                             15929 non-null  int64   
 2   gender                          15929 non-null  category
 3   dependents                      15929 non-null  int64   
 4   occupation                      15929 non-null  category
 5   city                            15929 non-null  category
 6   customer_nw_category            15929 non-null  category
 7   branch_code                     15929 non-null  category
 8   current_balance                 15929 non-null  float64 
 9   previous_month_end_balance      15929 non-null  float64 
 10  average_monthly_balance_prevQ   15929 non-null  float64 
 11  average_monthly_balance_prevQ2  15929 non-null  float64 
 12  current_month_cred

In [4]:
# average_monthly_balance_prevQ >>> Target varaible

In [7]:
# Steps for Model Building
# 0. Convert your Business problem into a Data Problem
# 1. Load the dataset
# 2. Exploratory Data Analysis:
# 3. Bare minimum data cleaning (e.g. missing value imputation) & 
# preprocessing (e.g. Encoding the cat. variables)

# 4. Create Baseline models

# 5. Data Polishing/Refinement: e.g. outlier treatment, scaling the num. features,
# Feature engineering, feature selection, feature transformation, etc.

# ===> Data is FInalized!!
# 6. Compare and Select the best model(s) for "tuning" 

# 7. Hyper-parameter tuning / Model Refinement

# 8. Re-train you final model on the entire data, one last time!

# 9. Save the model (pickelize)

# 10. Share/Deploy the model

# y = f(X)

# Model = Algo(Data)


# Step 3: Data Cleaning & Basic PRe-processing

In [312]:
X = df.drop('average_monthly_balance_prevQ', axis=1)  # Features matrix
y = df['average_monthly_balance_prevQ']   # Target variable for Regression

In [313]:
X.shape, y.shape

((15929, 18), (15929,))

In [314]:
pd.get_dummies(X['occupation'], drop_first=True)   # OHE the occupation

Unnamed: 0,retired,salaried,self_employed,student
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
15924,0,1,0,0
15925,0,0,1,0
15926,0,1,0,0
15927,0,0,1,0


In [315]:
categorical_cols = df.dtypes[df.dtypes == 'category'].index  # identifying the categorical variable
categorical_cols

Index(['gender', 'occupation', 'city', 'customer_nw_category', 'branch_code',
       'churn'],
      dtype='object')

In [316]:
for col in categorical_cols:  # list the number of unique values in these cat. columns
    print(col, ":", X[col].nunique())

gender : 2
occupation : 5
city : 30
customer_nw_category : 3
branch_code : 2806
churn : 2


In [317]:
X.drop( 'branch_code', axis=1, inplace=True)  # Dropping the columns, as it has just too many unique values

In [318]:
X['gender'] = X['gender'].map({'Male':0, 'Female':1})  # Label Encoding the gender col. This can also be One-hot encoded

In [319]:
nominal_cols = ['occupation', 'city', 'customer_nw_category'] # Selecting out the columns for OHE

In [320]:
X_new = pd.get_dummies(X, columns=nominal_cols) # One-Hot encoding the Nominal columns

In [321]:
X_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15929 entries, 0 to 15928
Data columns (total 52 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   vintage                         15929 non-null  int64   
 1   age                             15929 non-null  int64   
 2   gender                          15929 non-null  category
 3   dependents                      15929 non-null  int64   
 4   current_balance                 15929 non-null  float64 
 5   previous_month_end_balance      15929 non-null  float64 
 6   average_monthly_balance_prevQ2  15929 non-null  float64 
 7   current_month_credit            15929 non-null  float64 
 8   previous_month_credit           15929 non-null  float64 
 9   current_month_debit             15929 non-null  float64 
 10  previous_month_debit            15929 non-null  float64 
 11  current_month_balance           15929 non-null  float64 
 12  previous_month_bal

## Step 4: Create Baseline Model

In [322]:
# Split your data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((12743, 52), (3186, 52))

In [323]:
# All the ML algorithms are referred as "estimators" in the sklearn package...

# and all thse estimators are actually Python "Classes" .... they will have attributes & methods
# Attributes stores some properties/values
# Methods >> functions which will used to "operate" on the data 
# Since all the estimators are Classes... they have to be instantiated, before they can be used.

In [324]:
X_train

Unnamed: 0,vintage,age,gender,dependents,current_balance,previous_month_end_balance,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,...,city_1589,city_2001,city_2002,city_2012,city_2013,city_2014,city_100000,customer_nw_category_1,customer_nw_category_2,customer_nw_category_3
6682,2110,50,1,0,987.17,487.17,1984.86,500.13,428.70,0.13,...,0,0,0,0,0,0,0,1,0,0
1665,2617,62,0,1,10579.41,8357.16,1632.61,2244.06,7572.83,21.81,...,0,0,0,0,0,0,0,0,0,1
13644,2531,62,0,2,1598.32,2108.68,1995.06,0.60,5714.89,1510.74,...,0,1,0,0,0,0,0,0,1,0
7142,2610,64,1,2,1890.72,1754.15,1581.80,142.89,0.03,292.03,...,0,0,0,0,0,0,0,0,1,0
2582,2544,27,0,0,9223.24,9223.24,14710.91,0.34,37478.57,0.34,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13418,2574,69,0,3,22548.28,21727.52,37843.84,1057.12,544.53,554.41,...,0,0,0,0,0,1,0,0,1,0
5390,2561,52,0,1,96.57,854.81,3567.15,0.21,0.21,12282.29,...,0,0,0,0,0,0,0,1,0,0
860,2366,62,0,0,3170.19,3170.19,2828.30,0.53,0.53,0.53,...,0,0,1,0,0,0,0,0,0,1
15795,2657,63,1,0,4127.81,3494.05,2560.00,1702.10,49.79,1429.23,...,0,0,0,0,0,1,0,0,1,0


In [325]:
from sklearn.linear_model import LinearRegression

LR = LinearRegression()  # "untrained" model
LR.fit(X_train, y_train)  # model is trained now  >>> fitting means model training here.
# Always fir your model on the training set

In [326]:
LR.predict(X_test)   # predictions of the model on the test set

array([3058.77170518, 4649.1980646 , 7039.50748672, ..., 1916.29008386,
       5011.83248721, 1890.25483129])

In [327]:
LR.score(X_train, y_train)   # R^2 score of the model on the training set

0.9980574988739739

In [328]:
LR.score(X_test, y_test)   # R^2 score of the model on the TEST set

0.9702114747635675

In [329]:
LR.coef_  # Model Coiefficients / parameters of the model

array([ 4.99222201e-03,  2.23918858e+00,  2.10856158e+01,  1.58166408e+01,
        8.25068376e-02, -7.06788652e-02,  5.21063638e-02, -2.42189208e-03,
       -2.25759258e-02, -6.88786270e-03,  1.58753943e-02,  1.72352175e-01,
        7.60218558e-01,  4.74376341e+02, -1.34755630e+02,  1.15559280e+01,
       -1.26786084e+01,  6.95973208e+01,  6.62809896e+01,  5.52359037e+01,
       -1.69755760e+02,  8.87227192e+01, -3.76462994e+01,  2.66536665e+02,
       -1.36460350e+02,  2.91442926e+02,  1.45085813e+02, -5.58967023e+01,
        1.32313059e+01,  5.43029228e+01, -1.17191367e+02, -2.85439550e+02,
       -7.15245638e+01,  8.52737696e+01, -1.05059368e+02,  1.68154865e+01,
       -1.15614558e+02, -3.34627337e+01,  1.88862641e+02, -9.99805712e+01,
       -2.21416526e+02, -2.43724576e+01,  9.52031466e+01,  2.14796802e+01,
        1.69731653e+00,  3.56453440e+02, -4.17768228e+01, -4.07619931e+01,
       -1.23984112e+02,  1.08144067e+02, -2.58443812e+01, -8.22996859e+01])

In [330]:
LR.intercept_  # the final intercept term

-108.16271925706042

In [331]:
# Set the display option
pd.set_option('display.float_format', '{:.2f}'.format)

In [332]:
model_coef = pd.DataFrame(LR.coef_, index=X_new.columns, columns=['Model Coefficients'])
model_coef

Unnamed: 0,Model Coefficients
vintage,0.0
age,2.24
gender,21.09
dependents,15.82
current_balance,0.08
previous_month_end_balance,-0.07
average_monthly_balance_prevQ2,0.05
current_month_credit,-0.0
previous_month_credit,-0.02
current_month_debit,-0.01


# Step 5: Data Refinement
Let's do scaling and see if there is any further improvement in the model

In [333]:
# Scaling
SC = StandardScaler()

X_train_std = SC.fit_transform(X_train)  # Standardized Training set
X_test_std = SC.transform(X_test) # Standardized Test set

In [334]:
LR_std = LinearRegression()  # "untrained" model
LR_std.fit(X_train_std, y_train)  # model is trained now  on the standardized dataset

In [335]:
LR_std.score(X_train_std, y_train)    # R^2 score of the model on the standardized training set

0.9980574549012659

In [336]:
LR_std.score(X_test_std, y_test)  # R^2 score of the model on the standardized test set

0.9702217632643674

In [337]:
# Concl: There is no change for Linear REgression model. (which is as expected)