# **DATASET**

* **ID : Unique Identifier for a row** 

* **Gender: Gender of the Customer** 

* **Age : Age of the Customer (in Years)**

* **Region_Code : Code of the Region for the customers** 

* **Occupation : Occupation Type for the customer** 

* **Channel_Code : Acquisition Channel Code for the Customer (Encoded)** 

* **Vintage : Vintage for the Customer (in months), i.e. how long the Customer has been associated with the company** 

* **Credit_Product : If the Customer has any active credit product (Home loan, Personal loan, Credit Card etc.)** 

* **Avg_Account_Balance : Average Account Balance for the Customer in last 12 Months**

* **Is_Active : If the Customer is Active in last 3 Months** 

* **Is_Lead (Target) : Whether the Customer is interested in the Credit Card (0 = not interested, 1 = interested)**

* **For the things I learned from this competition, please visit this [topic](https://www.kaggle.com/discussion/242986) and share your feedback**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd

In [None]:
df=pd.read_csv("../input/jobathon-may-2021-credit-card-lead-prediction/train.csv")

In [None]:
df.head()

# **OVERVIEW**

In [None]:
print("***** Shape of dataset is *****")
print()
df.shape   #shape of training data

In [None]:
print("***** Column names present in dataset *****")
print()
df.columns  #Column names

In [None]:
print("***** Checking if null values present or not *****")
print()
df.isnull().sum()   #Checking null values

In [None]:
print("**** Description of data *****")
print()
df.describe()   #Description of data

In [None]:
print("***** Value counts of target feature *****")
print()
df['Is_Lead'].value_counts()

In [None]:
# Categorical columns (plus the target) whose distributions we want to inspect.
columns = [
    'Gender', 'Region_Code', 'Occupation', 'Channel_Code',
    'Credit_Product', 'Is_Active', 'Is_Lead',
]

# For every listed column, print its frequency table followed by its distinct values.
for col_name in columns:
    series = df[col_name]
    print("Value counts of", col_name, "is :")
    print(series.value_counts())
    print()
    print("Unique values of", col_name, "is :", series.unique())
    print()

# **VISUALIZATIONS**

In [None]:
df.head()

### **As we can see, the data is imbalanced**

In [None]:
def with_hue(data, feature, ax):
    """Label every bar of a hue-split count plot with its share of its x category.

    Parameters
    ----------
    data : mapping of column name to a column exposing ``unique()``
        Source data of the plot (a DataFrame in this notebook).
    feature : str
        Column shown on the x axis.
    ax : axes-like object
        Must expose ``patches`` (the drawn bars) and ``text``.

    Notes
    -----
    The bars are assumed to be ordered hue by hue, e.g. with 8 x categories
    and 4 hues, bars 0, 8, 16 and 24 belong to the first x category.
    Bars whose height is NaN (a category/hue pair with no rows) are ignored,
    and a category whose bars sum to zero is left unlabelled instead of
    producing ``nan%`` labels or a division by zero.
    """
    # Number of x categories; `x == x` is False only for NaN, so a missing
    # value is not counted as a category.
    num_of_cat = len([x for x in data[feature].unique() if x == x])

    bars = ax.patches

    for ind in range(num_of_cat):
        # Every num_of_cat-th bar starting at `ind` belongs to x category `ind`.
        hue_bars = bars[ind:][::num_of_cat]
        heights = [bar.get_height() for bar in hue_bars]

        # Total height of the category (denominator of the percentages).
        total = sum(h for h in heights if h == h)
        if not total:
            continue

        # Print the percentage on top of each bar.
        for bar, height in zip(hue_bars, heights):
            if height != height:  # NaN height: nothing to label
                continue
            percentage = '{:.1f}%'.format(100 * height / total)
            ax.text(bar.get_x() + bar.get_width() / 2.0,
                    height,
                    percentage,
                    ha="center", va="bottom", fontweight='bold', fontsize=14)
    

    
def without_hue(data, feature, ax):
    """Annotate each bar of a plain count plot with its share of all rows.

    Parameters
    ----------
    data : sized object
        Source data; only ``len(data)`` is used, as the percentage denominator.
    feature : str
        Accepted for symmetry with ``with_hue``; not read by this function.
    ax : axes-like object
        Must expose ``patches`` (the drawn bars) and ``text``.

    Notes
    -----
    The label handed to ``ax.text`` is the tuple ``(percentage, height)``.
    """
    row_count = float(len(data))

    for patch in ax.patches:
        height = patch.get_height()
        share = '{:.1f}%'.format(100 * height / row_count)
        # Centre the label horizontally over the bar, at the bar's top.
        centre_x = patch.get_x() + patch.get_width() / 2.0
        ax.text(centre_x, height, (share, height),
                ha='center', fontweight='bold', fontsize=14)


In [None]:
sns.set_theme(context="notebook",style="white",font_scale=2)
fig=plt.figure(figsize=(15,7))

#Setting plot and background color
ax = plt.axes() 
ax.set_facecolor("#F2EDD7FF") 
fig.patch.set_facecolor("#F2EDD7FF")

#Dealing with spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.grid(linestyle="--",axis='y',color='gray')

a=sns.countplot(data=df,x='Is_Lead',palette='rocket_r')
without_hue(df,'Is_Lead',a)

In [None]:
# One row per categorical feature: plain counts on the left,
# counts split by Is_Lead on the right.
columns = ['Gender', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active']
nrows = len(columns)  # derived from the list so the grid always matches it
ncols = 2
f, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 34))

f.patch.set_facecolor('#F2EDD7FF')

for i in range(nrows):
    # columns[i] keeps row i aligned with the i-th entry of `columns`.
    feature = columns[i]
    for j in range(ncols):
        panel = ax[i][j]

        # Background colour, hidden spines and a dashed horizontal grid.
        panel.set_facecolor('#F2EDD7FF')
        panel.spines['top'].set_visible(False)
        panel.spines['right'].set_visible(False)
        panel.spines['left'].set_visible(False)
        panel.grid(linestyle="--", axis='y', color='gray')

        if j == 0:
            a1 = sns.countplot(data=df, x=feature, palette='rocket_r', ax=panel)
            without_hue(df, feature, a1)
        elif j == 1:
            a2 = sns.countplot(data=df, x=feature, palette='rocket_r', ax=panel, hue="Is_Lead")
            with_hue(df, feature, a2)



In [None]:
# One row per numeric feature: box plot on the left,
# histogram with KDE split by Is_Lead on the right.
columns = ["Age", "Vintage", "Avg_Account_Balance"]
nrows = len(columns)  # derived from the list so the grid always matches it
ncols = 2
f, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(25, 34))

f.patch.set_facecolor('#F2EDD7FF')

for i in range(nrows):
    # columns[i] keeps row i aligned with the i-th entry of `columns`.
    feature = columns[i]
    for j in range(ncols):
        panel = ax[i][j]

        # Background colour, hidden spines and a dashed horizontal grid.
        panel.set_facecolor('#F2EDD7FF')
        panel.spines['top'].set_visible(False)
        panel.spines['right'].set_visible(False)
        panel.spines['left'].set_visible(False)
        panel.grid(linestyle="--", axis='y', color='gray')

        if j == 0:
            a1 = sns.boxplot(data=df, x=(df[feature]), palette='rocket_r', ax=panel)
        elif j == 1:
            a2 = sns.histplot(data=df, x=(df[feature]), palette='rocket_r', ax=panel, hue="Is_Lead", kde=True)

In [None]:
sns.set_theme(context="notebook",style="white",font_scale=2)
fig=plt.figure(figsize=(15,7))

#Setting plot and background color
ax = plt.axes() 
ax.set_facecolor("#F2EDD7FF") 
fig.patch.set_facecolor("#F2EDD7FF")

#Dealing with spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.grid(linestyle="--",axis='y',color='gray')

plt.text(10,3300,"Log Distribution of\nAvg_Account_Balance",fontweight='bold')
a=sns.histplot(data=df,x=np.log(df['Avg_Account_Balance']),palette='rocket_r',kde=True)


In [None]:
xvars=['Age',"Vintage",'Avg_Account_Balance']
yvars=['Age','Vintage',"Avg_Account_Balance"]
g=sns.pairplot(data=df,x_vars=xvars,y_vars=yvars,palette="rocket_r")
g.fig.set_size_inches(15,15)


# **PREPROCESSING**

In [None]:
#Changing the distribution of 'Avg_Account_Balance' into Log Distribution
df['Avg_Account_Balance']=np.log(df['Avg_Account_Balance'])

#### **Removing outliers using quartile method**

In [None]:

# Disabled IQR-based outlier removal for Avg_Account_Balance, kept for reference
# as a string literal. The final two lines chain the drops on df1 so that both
# the upper and the lower outliers are removed.
'''df_acc=sorted(df['Avg_Account_Balance'])
Q1,Q3=np.percentile(df_acc,[25,75])
IQR= Q3-Q1
lower_range= Q1-(1.5*IQR)
upper_range=Q3+(1.5*IQR)

print("Lower range of outliers : ",lower_range)
print("Upper range of outliers : ",upper_range)
df_lower_outliers=df[df['Avg_Account_Balance']<lower_range]
df_upper_outliers=df[df['Avg_Account_Balance']>upper_range]

print("***** Lower outliers of acc *****")
print()
df_lower_outliers

print(df_upper_outliers.shape)
print()
print("**** Outer outliers of acc *****")
print()
df_upper_outliers

df1=df.drop(df[df['Avg_Account_Balance']>upper_range].index)
df1=df1.drop(df1[df1['Avg_Account_Balance']<lower_range].index)'''

##### **Currently going forward without dropping outliers, because removing them may cause loss of important data**

In [None]:
df1=df.copy()

##### **Dropping "ID" Column**

In [None]:
dropping_columns=['ID']
df1=df1.drop(dropping_columns,axis=1)

#### **Importing necessary libraries**

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import optuna
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score,train_test_split , StratifiedKFold
from sklearn.metrics import roc_auc_score , f1_score , confusion_matrix , classification_report

##### **We will take "nan" value present in "Credit_Product" as another class and replace all "nan" values with "Missing" Keyword**

In [None]:
# Treat a missing Credit_Product as its own "Missing" category
# (alternative considered: impute with the column mode).
# The result is assigned back to the column: an in-place replace on df1[...] is a
# chained assignment, which pandas warns about and which does not modify df1
# when Copy-on-Write is enabled.
df1['Credit_Product'] = df1['Credit_Product'].fillna('Missing')

##### **Will do label encoding of all categorical columns**

In [None]:
#Label Encoding
label_encode=LabelEncoder()
columns1=['Gender', 'Is_Active','Occupation', 'Channel_Code','Credit_Product',"Region_Code"]
df1[columns1]=df1[columns1].apply(label_encode.fit_transform)
df1.head()

In [None]:
#One hot encoding
label_encode=LabelEncoder()
columns1=['Gender', 'Is_Active']
columns2=['Occupation', 'Channel_Code','Credit_Product']
df1[columns1]=df1[columns1].apply(label_encode.fit_transform)
df1=pd.get_dummies(data=df1,columns=columns2)
df1.head()

In [None]:
# Standard scaling of the three continuous columns with a StandardScaler;
# the scaled values overwrite the original columns of df1.
# NOTE(review): the scaler is fitted on all of df1, before the train/test split
# made later in the notebook, so held-out rows contribute to the fitted
# statistics — consider fitting on the training split only.
ss=StandardScaler()
columns3=['Age','Vintage','Avg_Account_Balance']
df1[columns3]=ss.fit_transform(df1[columns3])
df1.head()

#### **There is no major correlation between any two features, so we will go ahead with all features and start from a basic model**

In [None]:
fig=plt.figure(figsize=(20,10))
ax = plt.axes() 
ax.set_facecolor("#F2EDD7FF") 
fig.patch.set_facecolor("#F2EDD7FF")

sns.heatmap(data=df1.corr(),annot=True,linewidth=3)

In [None]:
Y=df1['Is_Lead']
X=df1.drop("Is_Lead",axis=1)

In [None]:
Y.value_counts()

In [None]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=42)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

##### **Undersampling and oversampling code but we will move ahead without implementation of these techniques and observe how our models will work**

In [None]:
# Oversampling: SMOTE is applied to the training split only. It is seeded so the
# synthetic rows are reproducible, matching the random_state=42 used elsewhere.
smt = SMOTE(random_state=42)
x_samp, y_samp = smt.fit_resample(x_train, y_train)

# Undersampling alternative, left disabled. It is written as comments rather than
# inside a triple-quoted string so the cell stays syntactically valid.
# from imblearn.under_sampling import RandomUnderSampler
# us = RandomUnderSampler(random_state=42)

# **LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(solver="liblinear", random_state=42)
cv=StratifiedKFold(n_splits=5,random_state=42,shuffle=True)
ans=cross_val_score(clf,x_train,y_train,cv=cv,n_jobs=2,scoring='roc_auc').mean()
print("Logistic Regression :" , ans)

##### **Training of Model**

In [None]:
model_log=clf.fit(x_train,y_train)

##### **Prediction**

In [None]:
pred_log=model_log.predict_proba(x_test)[:,1]

In [None]:
print("***** roc-auc-score of logistic regression *****")
print()
print(roc_auc_score(y_test,pred_log))

# **Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
def objective(trial):
    """Optuna objective for the decision tree.

    Samples ``max_depth`` from 1..100 and returns the mean ROC AUC of a
    ``DecisionTreeClassifier`` over 5 shuffled, seeded stratified folds of
    ``x_train`` / ``y_train``.
    """
    depth = int(trial.suggest_int('max_depth', 1, 100))
    tree = DecisionTreeClassifier(max_depth=depth)
    splitter = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    fold_scores = cross_val_score(
        tree, x_train, y_train, n_jobs=2, cv=splitter, scoring='roc_auc'
    )
    return fold_scores.mean()

##### **Hyperparameter tuning using optuna**

In [None]:
study = optuna.create_study(direction='maximize',study_name='Decision Trees')
study.optimize(objective, n_trials=15)

In [None]:
trial = study.best_trial
print('## best_value -->',trial.value)
print("## best_parameters -->",trial.params)

In [None]:
model_dt=DecisionTreeClassifier(max_depth=10)

model_dt.fit(x_train,y_train)

In [None]:
y_pred_dt=model_dt.predict_proba(x_test)[:,1]

In [None]:
print("***** roc_auc_score of decision tree classifier *****")
print()
print(roc_auc_score(y_test,y_pred_dt))

# **K NEAREST NEIGHBOURS**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def objective(trial):
    """Optuna objective for k-nearest neighbours.

    Samples ``n_neighbors`` from 1..500 and returns the mean ROC AUC of a
    ``KNeighborsClassifier`` over 5 shuffled, seeded stratified folds of
    ``x_train`` / ``y_train``.
    """
    k = int(trial.suggest_int('n_neighbors', 1, 500))
    knn = KNeighborsClassifier(n_neighbors=k)
    splitter = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    fold_scores = cross_val_score(
        knn, x_train, y_train, n_jobs=2, cv=splitter, scoring='roc_auc'
    )
    return fold_scores.mean()

In [None]:
study = optuna.create_study(direction='maximize',study_name='KNN')
study.optimize(objective, n_trials=15)

In [None]:
trial = study.best_trial
print('## best_values -->',trial.value)
print("## best_parameters -->",trial.params)

In [None]:
model_k=KNeighborsClassifier(n_neighbors=43)

In [None]:
model_k.fit(x_train,y_train)

In [None]:
y_pred_k=model_k.predict_proba(x_test)[:,1]

In [None]:
print(roc_auc_score(y_test,y_pred_k))

# **RANDOM FOREST CLASSIFIER**

In [None]:
def objective(trial):
    """Optuna objective for the random forest.

    Samples ``n_estimators`` (2..200) and ``max_depth`` (1..40), in that
    order, and returns the mean ROC AUC of a ``RandomForestClassifier`` over
    5 shuffled, seeded stratified folds of ``x_train`` / ``y_train``.
    """
    tree_count = trial.suggest_int('n_estimators', 2, 200)
    depth = int(trial.suggest_int('max_depth', 1, 40))
    forest = RandomForestClassifier(n_estimators=tree_count, max_depth=depth)
    splitter = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    fold_scores = cross_val_score(
        forest, x_train, y_train, n_jobs=2, cv=splitter, scoring='roc_auc'
    )
    return fold_scores.mean()

In [None]:
study = optuna.create_study(direction='maximize',study_name='Random Forest')
study.optimize(objective, n_trials=10)

In [None]:
trial = study.best_trial
print('## best_values -->',trial.value)
print("## best_parameters -->",trial.params)

In [None]:
model_rf=RandomForestClassifier(n_estimators=174,max_depth=10)

In [None]:
model_rf.fit(x_train,y_train)

In [None]:
pred_train=model_rf.predict_proba(x_test)[:,1]


In [None]:
print(roc_auc_score(y_test,pred_train))

In [None]:
# Pair every training column with the random forest's importance for it,
# sort from most to least important, and draw the result as a bar plot.
feature_names = np.array(x_train.columns)
feature_importance = np.array(model_rf.feature_importances_)
data = {'feature_names': feature_names, 'feature_importance': feature_importance}
df_plt = pd.DataFrame(data).sort_values(by=['feature_importance'], ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(x=df_plt['feature_importance'], y=df_plt['feature_names'])
plt.xlabel('FEATURE IMPORTANCE')
plt.ylabel('FEATURE NAMES')
plt.show()

# **LGBM**

In [None]:
import lightgbm as lgb

In [None]:
def objective_lgbm(trial):
    """Optuna objective for LightGBM.

    Samples one LGBMClassifier configuration from ``trial`` and scores it with
    cross-validated ROC AUC on ``x_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna trial
        Used to sample every hyperparameter below.

    Returns
    -------
    float
        Mean ROC AUC over the 5 stratified folds.
    """
    n_estimators = trial.suggest_int('n_estimators', 2, 500)
    max_depth = int(trial.suggest_int('max_depth', 2, 50))
    learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 1)
    colsample_bytree = trial.suggest_loguniform("colsample_bytree", 0.1, 1)
    num_leaves = trial.suggest_int('num_leaves', 10, 300)
    reg_alpha = trial.suggest_loguniform('reg_alpha', 0.1, 10)
    reg_lambda = trial.suggest_loguniform('reg_lambda', 0.1, 10)
    min_split_gain = trial.suggest_loguniform('min_split_gain', 0.1, 1)
    # NOTE(review): LightGBM appears to apply `subsample` only when
    # `subsample_freq` is non-zero — confirm whether bagging is intended here.
    subsample = trial.suggest_loguniform('subsample', 0.1, 1)
    clf = lgb.LGBMClassifier(n_estimators=n_estimators, max_depth=max_depth,
                             learning_rate=learning_rate, colsample_bytree=colsample_bytree,
                             num_leaves=num_leaves, reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                             min_split_gain=min_split_gain, subsample=subsample)
    # Pass the shuffled, seeded splitter itself (not a bare fold count) so this
    # objective is scored on the same kind of folds as the other objectives.
    cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    return cross_val_score(clf, x_train, y_train,
                           n_jobs=2, cv=cv, scoring='roc_auc').mean()

In [None]:
study_lgbm= optuna.create_study(direction='maximize',study_name="LGBM")
study_lgbm.optimize(objective_lgbm, n_trials=15)

In [None]:
# Report the best trial of the LGBM study. The study maximises the mean
# cross-validated ROC AUC, so the value is labelled as such.
trial_lgbm = study_lgbm.best_trial
print("## Best ROC AUC --> ", trial_lgbm.value)
print("## Best parameters --> ", trial_lgbm.params)

In [None]:
# Build an LGBMClassifier from the best trial's hyperparameters and fit it on the
# training split, tracking AUC on the evaluation set with early stopping.
# NOTE(review): x_test/y_test serve as the early-stopping eval_set here and are
# also the data the ROC AUC is reported on afterwards, so that score is likely
# optimistic.
# NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords depend on the
# installed LightGBM version (newer releases use callbacks instead) — confirm.
model_lgbm=lgb.LGBMClassifier(**trial_lgbm.params)
model_lgbm.fit(x_train,y_train,eval_metric="auc",eval_set=[(x_test,y_test)],early_stopping_rounds=100,verbose=400)

In [None]:
pred_lgbm=model_lgbm.predict_proba(x_test)[:,1]

In [None]:
print("***** roc_auc_score of LGBM *****")
print()
print(roc_auc_score(y_test,pred_lgbm))

### **Out of all the classifiers, LGBM gives the highest roc_auc_score**
### **If you have any suggestions, please tell me in the comment section**
### **There are many more experiments we can do on this dataset, but I have built a baseline notebook :)**
### **I have also made a topic on the things I learned from this competition; the link is [Here](https://www.kaggle.com/discussion/242986), please have a look and share your feedback**


### **Don't forget to visit my other notebooks too; your feedback is appreciated**

1. [Crime Against Women in India](https://www.kaggle.com/aryanml007/crime-against-women-in-india-2001-to-2014)
2. [Stroke Prediction Analysis](https://www.kaggle.com/aryanml007/stroke-prediction-analysis-auc-0-90)
3. [Heart Attack Analysis](https://www.kaggle.com/aryanml007/heart-attack-analysis-visualizations)

4. [TPS April 2021](https://www.kaggle.com/aryanml007/manual-encoding-optuna-tps-april)

5. [Fetal Health classification](https://www.kaggle.com/aryanml007/fetal-health-classification)

6. [Student performance analysis](https://www.kaggle.com/aryanml007/students-performance-analysis)

7. [Plant Disease detection](https://www.kaggle.com/aryanml007/plant-disease-resnet50)

8. [Vehicle Insurance](https://www.kaggle.com/aryanml007/vehicle-insurance)


In [None]:
def cross_val(X, y, model, params, folds=9):
    """Fit ``model(**params)`` on each stratified fold and print its ROC AUC.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` row selection
        Features and target; sliced with the fold indices.
    model : callable
        Estimator class; instantiated afresh for every fold with ``params``.
    params : dict
        Keyword arguments for ``model``.
    folds : int, default 9
        Number of shuffled stratified splits (seeded with 21).

    Returns
    -------
    The estimator fitted on the last fold.

    Notes
    -----
    Each fold's held-out part is used both as the early-stopping evaluation
    set and as the data the printed score is computed on.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)

    for fit_rows, held_rows in splitter.split(X, y):
        features_fit, target_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        features_held, target_held = X.iloc[held_rows], y.iloc[held_rows]

        alg = model(**params)
        alg.fit(
            features_fit,
            target_fit,
            eval_set=[(features_held, target_held)],
            early_stopping_rounds=100,
            verbose=400,
        )

        # Column 1 of predict_proba is used as the positive-class score.
        held_proba = alg.predict_proba(features_held)
        roc_score = roc_auc_score(target_held, held_proba[:, 1])
        print(f"roc_auc_score: {roc_score}")

    return alg

In [None]:
 lgbm_params= {'learning_rate': 0.045, 
             'n_estimators': 20000, 
             'max_bin': 94,
             'num_leaves': 10, 
             'max_depth': 27, 
             'reg_alpha': 8.457, 
             'reg_lambda': 6.853, 
             'subsample': 0.749}

In [None]:
from lightgbm import LGBMClassifier
lgb_model = cross_val(X,Y,LGBMClassifier, lgbm_params)