In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the credit-card CSV from the Kaggle input directory, then display the loaded frame.
data_path = '../input/uci-credit-card-2/credit_card (1).csv'
credit = pd.read_csv(data_path)
credit

In [None]:
# Summarise the freshly loaded frame: column dtypes and non-null counts.
credit.info()

In [None]:
# Descriptive statistics for every column (numeric or not), transposed so each column is a row.
credit.describe(include='all').transpose()

In [None]:
# Search for duplicates: flag rows that repeat an earlier row, report how many
# there are, then display them.
duplicates = credit.duplicated(keep='first')
print('Number of duplicate rows = %d' % duplicates.sum())
credit.loc[duplicates]

## Data Pre-processing

In [None]:
# Rename the columns so the monthly fields carry a month suffix.
# The names are assigned by position, so the frame must have exactly 25 columns
# in this order: 6 leading fields, then 6 PAY, 6 BILL_AMT and 6 PAY_AMT columns, then DEFAULT.
month_suffixes = ['SEP', 'AUG', 'JUL', 'JUN', 'MAY', 'APR']
credit.columns = (
    ['ID', 'LIMIT_BAL', 'GENDER', 'EDUCATION', 'MARRIAGE', 'AGE']
    + ['PAY_' + month for month in month_suffixes]
    + ['BILL_AMT_' + month for month in month_suffixes]
    + ['PAY_AMT_' + month for month in month_suffixes]
    + ['DEFAULT']
)

In [None]:
# Summary statistics under the new column names, one row per column.
credit.describe().transpose()

In [None]:
## Convert the categorical columns to object dtype
# One list drives the conversion instead of one hand-written line per column.
categorical_cols = [
    'GENDER', 'EDUCATION', 'MARRIAGE',
    'PAY_SEP', 'PAY_AUG', 'PAY_JUL', 'PAY_JUN', 'PAY_MAY', 'PAY_APR',
    'DEFAULT',
]
for col in categorical_cols:
    credit[col] = credit[col].astype(object)

In [None]:
# Re-inspect the dtypes to confirm the categorical columns are now object.
credit.info()

In [None]:
# Check and treat bad & missing data
# Rows with an implausibly low AGE (below 17).
credit.loc[credit['AGE'] < 17]

In [None]:
# Rows with an implausibly high AGE (above 90).
credit.loc[credit['AGE'] > 90]

In [None]:
# Replace the bad AGE values (-20 and 120) with NaN so they are handled as missing data.
# The result is assigned back to the column rather than calling
# `credit.AGE.replace(..., inplace=True)`: an in-place method on a column pulled out of
# the frame is chained assignment, which warns in pandas 2.2 and leaves `credit`
# unchanged under Copy-on-Write.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
credit['AGE'] = credit['AGE'].replace([-20, 120], np.nan)

In [None]:
# Summary statistics of AGE after the bad values were replaced.
credit['AGE'].describe()

In [None]:
# Side-by-side view of the AGE distribution: histogram with KDE (left), boxplot (right).
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
hist_ax, box_ax = axs
sns.histplot(data=credit, x='AGE', kde=True, ax=hist_ax)
sns.boxplot(data=credit, x='AGE', ax=box_ax)

### The plots above show that the AGE field is right-skewed and has outliers.
### Hence, if we have to impute values, the median is the appropriate statistic to use.

In [None]:
# Count the missing values in each column.
credit.isna().sum()


In [None]:
# Extract the rows whose AGE is missing.
credit.loc[credit['AGE'].isna()]

In [None]:
# Compare the AGE distribution across the DEFAULT classes.
sns.boxplot(data=credit, x='DEFAULT', y='AGE')

### The plot above shows no significant difference in the AGE distribution between DEFAULT=0 and DEFAULT=1. Hence, we can impute the missing values with the overall median rather than using separate medians for DEFAULT=0 and DEFAULT=1.

In [None]:
# Fill the missing AGE values with the overall median, then show any rows that
# are still missing an AGE (expected to be empty).
age_median = credit['AGE'].median()
credit['AGE'] = credit['AGE'].fillna(age_median)
credit.loc[credit['AGE'].isna()]

In [None]:
# Frequency of each MARRIAGE code.
credit['MARRIAGE'].value_counts()

In [None]:
# Frequency of each GENDER code.
credit['GENDER'].value_counts()

### There is no mention of Marital Status (MARRIAGE) being 0. Let's see how this field influences the dependent variable 'DEFAULT'.

In [None]:
# Cross-tabulate DEFAULT (rows) against MARRIAGE (columns) as proportions of all
# records, with row/column totals in the margins.
pd.crosstab(
    index=credit['DEFAULT'],
    columns=credit['MARRIAGE'],
    margins=True,
    normalize=True,
)

### Only 0.18% (54) of the overall records have a MARRIAGE value of 0. If we impute these with the mode value of 2, there will be a slight increase in the proportion of level 2. As we have 30000 records in hand, of which only 54 have the value 0, we can go ahead and drop these rows instead of changing the data.

In [None]:
# Drop the rows whose MARRIAGE code is 0.
# .copy() makes the filtered result an independent frame; without it pandas tracks the
# result as a possible copy of the original, and later in-place edits on `credit`
# can raise SettingWithCopyWarning.
credit = credit[credit['MARRIAGE'] != 0].copy()

In [None]:
# Rows were removed from the dataframe, so the index labels now have gaps.
# Rebuild a continuous index; drop=True discards the old labels instead of
# keeping them as a new column.
credit = credit.reset_index(drop=True)

In [None]:
# First rows of the frame: check that the index starts from 0 after the reset.
credit.head()

In [None]:
# Last rows of the frame: check that the index runs continuously to the end after the reset.
credit.tail()

In [None]:
# Confirm that MARRIAGE no longer contains the value 0.
credit['MARRIAGE'].value_counts()

In [None]:
# Frequency of each EDUCATION code.
credit['EDUCATION'].value_counts()
# 5 and 6 both correspond to 'Unknown', so all 6s can be replaced with 5.
# Only 14 records have the value 0; since we do not currently know what 0 stands for,
# it can be moved into the 'Unknown' category as well. In future, any lookup of
# 'Unknown' education will then also include the records that had the value 0.

In [None]:
# Fold EDUCATION codes 6 and 0 into code 5.
# The result is assigned back to the column rather than calling
# `credit.EDUCATION.replace(..., inplace=True)`: an in-place method on a column pulled
# out of the frame is chained assignment, which warns in pandas 2.2 and leaves
# `credit` unchanged under Copy-on-Write.
credit['EDUCATION'] = credit['EDUCATION'].replace({6: 5, 0: 5})

In [None]:
# Confirm that EDUCATION codes 0 and 6 are gone.
credit['EDUCATION'].value_counts()

In [None]:
# Defining a function to retrieve count of levels within each category
def get_level_counts(i, data=None):
    """Print how often each level occurs in one column.

    Parameters
    ----------
    i : str
        Name of the column whose levels are counted.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        `credit` frame is used, which is the original behaviour.

    Returns
    -------
    None
        The counts are printed, not returned.
    """
    # Fall back to the global frame so existing one-argument calls keep working.
    frame = credit if data is None else data
    print(frame[i].value_counts())

In [None]:
# Names of the six monthly PAY_* columns, ordered from April to September.
PAY_X = ['PAY_' + month for month in ('APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP')]

In [None]:
# Print the level counts of every monthly PAY_* column.
for pay_col in PAY_X:
    get_level_counts(pay_col)

In [None]:
# Share of each DEFAULT class (normalize=True gives proportions rather than counts).
credit['DEFAULT'].value_counts(normalize=True)
# Next step: replace the values of some of the categorical variables with meaningful
# labels, which eases the EDA process and helps in getting insights.

In [None]:
# Replace the GENDER codes with labels: 1 -> 'M', 2 -> 'F'.
# The result is assigned back to the column rather than calling
# `credit.GENDER.replace(..., inplace=True)`: an in-place method on a column pulled out
# of the frame is chained assignment, which warns in pandas 2.2 and leaves `credit`
# unchanged under Copy-on-Write.
credit['GENDER'] = credit['GENDER'].replace({1: 'M', 2: 'F'})

In [None]:
# Replace the EDUCATION codes with labels in a single mapping.
# The result is assigned back to the column rather than calling
# `credit.EDUCATION.replace(..., inplace=True)` five times: an in-place method on a
# column pulled out of the frame is chained assignment, which warns in pandas 2.2 and
# leaves `credit` unchanged under Copy-on-Write.
education_labels = {
    1: 'Grad',
    2: 'Univ',
    3: 'HSchool',
    4: 'Others',
    5: 'Unknown',
}
credit['EDUCATION'] = credit['EDUCATION'].replace(education_labels)

In [None]:
# Replace the MARRIAGE codes with labels in a single mapping.
# The result is assigned back to the column rather than calling
# `credit.MARRIAGE.replace(..., inplace=True)`: an in-place method on a column pulled
# out of the frame is chained assignment, which warns in pandas 2.2 and leaves
# `credit` unchanged under Copy-on-Write.
marriage_labels = {1: 'Married', 2: 'Single', 3: 'Others'}
credit['MARRIAGE'] = credit['MARRIAGE'].replace(marriage_labels)

### Anomalies Check