In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

from datetime import datetime
from datetime import date

In [36]:
df_referrer = pd.read_csv("../inter_data/referrer.csv")
df_non_referrer = pd.read_csv("../inter_data/non_referrer.csv")

#### adding target variable

In [37]:
df_referrer["target"] = [1]*len(df_referrer)
df_non_referrer["target"] = [0]*len(df_non_referrer)

In [38]:
df_model = pd.concat([df_referrer, df_non_referrer], axis = 0)

In [39]:
df_model = df_model[[col for col in df_model.columns if "Unnamed" not in  col ]]

In [40]:
df_model

Unnamed: 0,cm_cd,cm_mobile,refer_count,activation_count,IncomePA,cm_occup,E2Channel,IsKarvy,KRA YN,age,months_since_last_login,months_since_last_trade,dp_holding,total_logins_one_year,total_trades_one_year,target
0,76556800,6000015546,1,0.0,5_10L,P,CAT,0,N,22.0,1.993668,132.000000,0.0,10.0,0.0,1
1,49282624,6000016229,1,0.0,<=1L,S,CAT,0,Y,22.0,1.040877,1.730829,0.0,35.0,1.0,1
2,60000270,6000027031,1,0.0,<=1L,P,CAT,0,N,22.0,1.336571,1.270861,0.0,4.0,1.0,1
3,64606053,6000031654,31,23.0,<=1L,S,CAT,0,N,32.0,0.515198,3.373573,0.0,208.0,5.0,1
4,86330038,6000037524,3,0.0,<=1L,S,CAT,0,Y,23.0,1.238006,2.847895,0.0,69.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29491,11279751,8272852863,0,0.0,5_10L,S,CAT,0,N,35.0,2.223657,132.000000,0.0,7.0,0.0,0
29492,11279904,8003213638,0,0.0,<=1L,P,No Channel,0,N,25.0,3.964966,132.000000,0.0,19.0,0.0,0
29493,1127JHUN,9785580171,0,0.0,1_5L,S,Franchisee,0,Y,27.0,1.763689,1.040881,0.0,67.0,6.0,0
29494,11280086,7620644208,0,0.0,5_10L,Professional,Franchisee,0,Y,27.0,3.899256,4.260660,0.0,190.0,58.0,0


In [41]:
df_model = df_model.sample(frac=1)


In [42]:
df_model.to_csv("../inter_data/preprocessed.csv")

In [4]:
df_model = pd.read_csv("../inter_data/preprocessed.csv")
df_model = df_model[[col for col in df_model.columns if "Unnamed" not in  col ]]

In [5]:
def feature_engineering_catboost(df):
   
    # handing null values(continuos values)
    del df["cm_mobile"]
    del df["activation_count"]
    del df["refer_count"]
    df["months_since_last_login"] = df["months_since_last_login"].fillna(df["months_since_last_login"].mean())
    df["months_since_last_trade"] = df["months_since_last_trade"].fillna(df["months_since_last_trade"].mean())
    
    
    # categorical data
    df["KRA YN"] = df["KRA YN"].fillna("N")
    
    income_slabs = list(df["IncomePA"].value_counts().index[:5])
    df["IncomePA"] = df["IncomePA"].replace({"5_10L ??": "5_10L", 
                                        "<=1??": "<=1L", 
                                        "1_5??": "1_5L", "<= 1L": "<=1L"})
    df['IncomePA'].loc[~df['IncomePA'].isin(income_slabs)] = 'others'
    
    e2channels = list(df["E2Channel"].value_counts().index[:5])
    df['E2Channel'].loc[~df['E2Channel'].isin(e2channels)] = 'others'
    
    occups = list(df["cm_occup"].value_counts().index[:5])
    df['cm_occup'].loc[~df['cm_occup'].isin(occups)] = 'others'
    
    df['IsKarvy'] = df['IsKarvy'].fillna(0)
    
    
   
    # scaling numerical columns
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    numerical_cols = ["age","months_since_last_login","months_since_last_trade", "total_logins_one_year", 
                      "total_trades_one_year"]

    df[numerical_cols] = df[numerical_cols].fillna(0)

    return df


In [6]:
df_model = feature_engineering_catboost(df_model.copy())

In [7]:
df_model

Unnamed: 0,cm_cd,IncomePA,cm_occup,E2Channel,IsKarvy,KRA YN,age,months_since_last_login,months_since_last_trade,dp_holding,total_logins_one_year,total_trades_one_year,target
0,10501992,1_5L,S,others,0,N,29.0,1.533705,17.796872,,7.0,0.0,0
1,11154521,<=1L,P,CAT,0,N,23.0,132.000000,132.000000,0.0,0.0,0.0,0
2,SHIRAZUI,5_10L,P,CAT,0,Y,21.0,6.133384,0.515198,0.0,13.0,1.0,1
3,11975396,1_5L,S,CAT,0,N,44.0,132.000000,132.000000,0.0,0.0,0.0,0
4,MSHAHRO7,<=1L,S,CAT,0,N,22.0,20.359548,132.000000,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62877,92516982,1_5L,S,CAT,0,N,35.0,132.000000,132.000000,0.0,0.0,1.0,1
62878,17482977,5_10L,P,CAT,0,N,72.0,0.515198,0.745183,0.0,1400.0,56.0,1
62879,70629679,<=1L,S,CAT,0,Y,19.0,0.778038,0.810892,0.0,6.0,1.0,1
62880,114847,others,others,Brp,0,N,121.0,132.000000,132.000000,,0.0,0.0,0


In [None]:
df_model.to_csv("../inter_data/processed_catboost.csv")