In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading Data

In [None]:
# Load the competition train and test CSVs (in that order) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/analytics-vidhya-job-a-thon-may-2021/train_s3TEQDk.csv",
        "/kaggle/input/analytics-vidhya-job-a-thon-may-2021/test_mSzZ8RL.csv",
    )
)

In [None]:
# Print pandas' concise summary of the training frame (columns, non-null counts, dtypes).
train_df.info()

# Summary of Data

In [None]:
# One row per column of train_df: its dtype (column 0), number of distinct values,
# number of nulls, and the values found in the rows labelled 0, 1 and 2.
summary = pd.DataFrame(train_df.dtypes).assign(
    unique=train_df.nunique(),
    null_vals=train_df.isnull().sum(),
    first=train_df.loc[0, :],
    second=train_df.loc[1, :],
    third=train_df.loc[2, :],
)
summary

In [None]:
# Treat a missing Credit_Product as "No" in both the train and the test frame.
for frame in (train_df, test_df):
    frame["Credit_Product"] = frame["Credit_Product"].fillna("No")

Filled the null values of the Credit_Product column with "No": it makes sense that customers who do not have a credit product would leave this field empty, so missing values are treated as "No".

##### Dropping ID column

In [None]:
# The ID column is removed from both frames before analysis and modelling.
train_df, test_df = (frame.drop(columns="ID") for frame in (train_df, test_df))

##### Getting Categorical and Numerical features

In [None]:
# Split the column names by dtype: object-typed columns are categorical, the rest numerical.
cat_vals = list(train_df.select_dtypes(include="object").columns)
num_vals = list(train_df.select_dtypes(exclude="object").columns)

### Univariate Analysis

In [None]:
def get_row_col_idx(idx, n_cols=2):
    """Map a flat plot index to its (row, column) position in a subplot grid.

    Parameters
    ----------
    idx : int
        Zero-based position of the plot, counted row by row.
    n_cols : int, default 2
        Number of columns in the subplot grid. The default matches the
        two-column grids used in this notebook.

    Returns
    -------
    tuple
        ``(row_idx, col_idx)`` suitable for indexing a 2-D axes array.
    """
    # divmod gives (idx // n_cols, idx % n_cols) in one step.
    return divmod(idx, n_cols)

In [None]:
def write_percent(ax, total=None):
    """Write each bar's share of ``total`` (as a percentage) on top of the bar.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes whose ``patches`` are the bars to annotate.
    total : int or float, optional
        Denominator for the percentages. When omitted, the sum of the finite
        bar heights drawn on ``ax`` is used, so the labels always describe the
        data actually plotted on this axes (train or test) instead of the
        size of one fixed DataFrame.

    Returns
    -------
    None
        The axes is annotated in place.
    """
    import math  # local import keeps the helper self-contained

    # Bars with NaN height (e.g. an empty category/hue combination) cannot be labelled.
    bars = [patch for patch in ax.patches if math.isfinite(patch.get_height())]
    if total is None:
        total = sum(patch.get_height() for patch in bars)
    if not total:
        return  # nothing plotted: avoid dividing by zero

    for patch in bars:
        height = patch.get_height()
        x_center = patch.get_x() + patch.get_width() / 2.0
        percent = (height / total) * 100
        ax.text(x_center, height, '{:1.1f}%'.format(percent), ha="center")

In [None]:
# pop() removes "Is_Lead" from train_df in place and returns it as the target Series.
# NOTE: because the column is gone afterwards, re-running this cell without reloading
# the data raises a KeyError.
target = train_df.pop("Is_Lead")

In [None]:
# One count plot per categorical column of the training data on a 3x2 grid,
# each bar labelled through write_percent.
fig, axes = plt.subplots(3, 2, figsize=(20, 20))
for position, column in enumerate(train_df[cat_vals].columns):
    ax = axes.flat[position]  # row-major: same cell as axes[position // 2, position % 2]
    sns.countplot(data=train_df, x=column, palette="Set3", ax=ax)
    write_percent(ax)

From the above visuals we can conclude the following:
1. More of the account holders are male
2. Certain regions account for a large number of the records
3. Most customers are self-employed, and very few are entrepreneurs
4. The most frequently used channel code is X4
5. Few customers (about 30%) have an active credit product (home loan, personal loan, credit card, etc.)
6. Around 30% of customers were active in the last 3 months

#### Test data Distribution

In [None]:
# Same 3x2 grid of count plots as for the training data, drawn from the test frame;
# bars are labelled through write_percent.
fig, axes = plt.subplots(3, 2, figsize=(20, 20))
for position, column in enumerate(test_df[cat_vals].columns):
    ax = axes.flat[position]  # row-major: same cell as axes[position // 2, position % 2]
    sns.countplot(data=test_df, x=column, palette="Set3", ax=ax)
    write_percent(ax)

In [None]:
# Count plot of the target values (the Is_Lead column popped from train_df).
sns.catplot(
    data=train_df,
    x=target,
    kind="count",
    height=4,
    palette="Set3",
)

#### Test and train data have the same distribution

In [None]:
# Histograms (200 bins) of the three numeric features in the test data, side by side.
fig, axes = plt.subplots(1, 3, figsize=(30, 10))
for ax, column in zip(axes, ["Age", "Vintage", "Avg_Account_Balance"]):
    sns.histplot(data=test_df, x=column, bins=200, color="red", ax=ax)

In [None]:
# Histograms (200 bins) of the three numeric features in the training data, side by side.
fig, axes = plt.subplots(1, 3, figsize=(30, 10))
for ax, column in zip(axes, ["Age", "Vintage", "Avg_Account_Balance"]):
    sns.histplot(data=train_df, x=column, bins=200, color="red", ax=ax)

We can see that the account balance is right-skewed. Most of the people in the data are between 25 and 35 years old, and there are comparatively few people above 60. The plot also shows how vintage varies.

# Multivariate Analysis

In [None]:
# Count plots of every categorical training column, split by the target (hue),
# on a 3x2 grid; bars are labelled through write_percent.
fig, axes = plt.subplots(3, 2, figsize=(20, 20))
for position, column in enumerate(train_df[cat_vals].columns):
    ax = axes.flat[position]  # row-major: same cell as axes[position // 2, position % 2]
    sns.countplot(data=train_df, x=column, hue=target, palette="Set3", ax=ax)
    write_percent(ax)

Entrepreneurs have a higher chance of being leads than other occupations, while salaried people have the lowest conversion rate to leads, which makes sense. People who are active are more likely to be leads than non-active people. People with channel code X3 have the highest conversion rate to leads, while X1 has the lowest. Also, people who have credit products are more likely to be leads than those who do not.

In [None]:
# Histograms (200 bins) of the three numeric training features, coloured by the target.
fig, axes = plt.subplots(1, 3, figsize=(30, 10))
for ax, column in zip(axes, ["Age", "Vintage", "Avg_Account_Balance"]):
    sns.histplot(data=train_df, x=column, hue=target, bins=200, color="Pink", ax=ax)

Although people between the ages of 25 and 35 account for more records, their rate of being leads is quite low, while people above that age have a higher chance of being leads. People with a vintage of around 90 have the highest chance of being leads. People with an avg_account_balance between 1 million and 2 million have a higher chance of being leads than others.

#### Dealing with non numeric values

In [None]:
def numerisize(feats, df):
    """Replace each column named in ``feats`` with its integer category codes, in place.

    Every listed column must already have pandas ``category`` dtype, because
    the codes are read through the ``.cat`` accessor. ``df`` itself is
    modified and nothing is returned.
    """
    for column_name in feats:
        categorical = df[column_name]
        df[column_name] = categorical.cat.codes

In [None]:
# Encode train and test with ONE shared category list per column.
# Casting each frame to "category" independently lets every frame derive its own
# categories, so the same label could receive different integer codes in train and
# test whenever their value sets differ. Reusing the train dtype for the test frame
# keeps the mapping identical; test values never seen in train become NaN and are
# therefore encoded as -1 by ``.cat.codes``.
for cols in cat_vals:
    train_df[cols] = train_df[cols].astype("category")
    test_df[cols] = test_df[cols].astype(train_df[cols].dtype)

numerisize(cat_vals, train_df)
numerisize(cat_vals, test_df)

In [None]:
# Annotated heatmap of the pairwise correlations between the training columns.
corrMatrix = train_df.corr()
corr_fig, corr_ax = plt.subplots()
sns.heatmap(corrMatrix, annot=True, ax=corr_ax)
plt.show()

#### Splitting the data

In [None]:
# train_df = train_df.drop("Channel_Code", axis = 1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, confusion_matrix, roc_auc_score, recall_score
from sklearn.preprocessing import StandardScaler

# `plot_roc_curve` was removed in scikit-learn 1.2; importing it unconditionally makes
# this whole cell fail with ImportError on current versions. Fall back to the
# replacement API, which takes the same (estimator, X, y) arguments.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

# Hold out 20% of the training rows for validation, stratified on the target so both
# splits keep the same class balance; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    train_df, target, random_state=0, stratify=target, test_size=0.2
)

In [None]:
# Fit the scaler on the training split only, then apply those same statistics to the
# training split, the validation split and the competition test set.
scaler = StandardScaler().fit(X_train)
X_train, X_test, test_df = (
    scaler.transform(part) for part in (X_train, X_test, test_df)
)

In [None]:
# Baseline: logistic regression with default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# ROC AUC from the positive-class probabilities: training split first, then validation split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    positive_proba = model.predict_proba(features)[:, 1]
    print(roc_auc_score(labels, positive_proba))

# plot_roc_curve(y_train, y_pred_train)
# plot_roc_curve(y_test, y_pred)

In [None]:
# from sklearn.model_selection import GridSearchCV

# parameters = {
#     "n_estimators": [100, 150, 200], 
#     "max_depth": [5, 10, 15, 25],
#     "min_samples_leaf": [3, 5] 
# }

# model_random_forest = RandomForestClassifier(
#     random_state = 1,
#     class_weight='balanced',
# )

# model_random_forest = GridSearchCV(
#     model_random_forest, 
#     parameters, 
#     cv=5,
#     scoring='roc_auc',
#     n_jobs = -1
# )

# model_random_forest.fit(X_train, y_train)

# print('-----')
# print(f'Best parameters {model_random_forest.best_params_}')
# print(
#     f'Mean cross-validated accuracy score of the best_estimator: '+ \
#     f'{model_random_forest.best_score_:.3f}'
# )

In [None]:
# Random forest with fixed hyper-parameters and seed.
model2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=5,
    random_state=0,
    n_jobs=-1,
)
model2.fit(X_train, y_train)

# ROC AUC from the positive-class probabilities: training split first, then validation split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    positive_proba = model2.predict_proba(features)[:, 1]
    print(roc_auc_score(labels, positive_proba))

In [None]:
# Class-probability predictions of the random forest (model2) for the competition test set.
# NOTE(review): predict_proba returns one column per class; a submission presumably needs
# only the positive-class column (result[:, 1]) — confirm before exporting.
result = model2.predict_proba(test_df)