In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df=pd.read_csv("/kaggle/input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv")

In [None]:
df.head()

In [None]:
df.iloc[0:1,:].transpose()

#### At first glance we can see that PAY_5 and PAY_6 have -2.0 values, which shouldn't be possible as the minimum value they can take is -1
#### Let us explore the dataset more

In [None]:
df.info()

#### It's a relief that there aren't any missing values
#### We can also see that every column is numeric, so the categorical variables (e.g. SEX, EDUCATION, MARRIAGE) are already encoded as numbers


In [None]:
df.describe().transpose()

### From Careful Examination of the Data:
* MARRIAGE has a maximum value of 3.0, which shouldn't be possible
* PAY_0, PAY_2, PAY_3, PAY_4, PAY_5, PAY_6 have a minimum value of -2.0
* BILL_AMT1, BILL_AMT2, BILL_AMT3, BILL_AMT4, BILL_AMT5, BILL_AMT6 contain negative values

### In this project we try to predict in advance whether a person will default on their payment next month
#### We should be able to answer questions like 
* How does the probability of default payment vary by categories of different demographic variables?
* Which variables are the strongest predictors of default payment?


### The feature which we try to predict is  "default.payment.next.month". So lets examine this feature

In [None]:
df["default.payment.next.month"].unique()

### As expected we have 0 and 1
* Where 1 means the person will default on their payment next month
* And 0 means the person will pay their dues next month

In [None]:
df["default.payment.next.month"].value_counts()

In [None]:
fig=plt.figure(figsize=(7,7))
sns.countplot(x="default.payment.next.month",data=df,palette="magma")

#### This dataset looks imbalanced: there are far fewer defaulters (1) than non-defaulters (0)
#### Before we build a model we should make sure that this target is balanced

In [None]:
fig=plt.figure(figsize=(20,15))
axes=fig.add_axes([0,0,0.8,0.8])
sns.heatmap(df.corr(),annot=True,cmap="viridis")

## Now let us perform some EDA to know our Dataset better

 ### **LIMIT_BAL**

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
dist=sns.distplot(df["LIMIT_BAL"],color="purple")
axes.ticklabel_format(style="plain",axis="x")

#### We can see that LIMIT_BAL is mostly concentrated around 100,000 dollars
#### We can also see that very few people have a LIMIT_BAL of 1,000,000 dollars, which makes the distribution skewed. Removing some outliers would be helpful

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
box=sns.boxplot(y="LIMIT_BAL",x="default.payment.next.month",palette="viridis",data=df)


#### Defaulters tend to have a slightly lower LIMIT_BAL

In [None]:
df["LIMIT_BAL"].nunique()

In [None]:
df["LIMIT_BAL"].nunique()

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.countplot(x="LIMIT_BAL",order=df["LIMIT_BAL"].value_counts().head(5).index,data=df,palette="ocean_r")

#### The top 5 amounts of credit given to the customers

In [None]:
df[df["LIMIT_BAL"]> 150000]["default.payment.next.month"].value_counts()

In [None]:
# Use <= so clients with a limit of exactly 150000 are counted here;
# the companion cell uses a strict > and a strict < would leave them out of both splits.
df[df["LIMIT_BAL"]<= 150000]["default.payment.next.month"].value_counts()

 ### **SEX**

#### Let us see the distribution of sex among the data

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.countplot(x="SEX",data=df,palette="cividis")

#### This dataset has more female clients than male clients

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
box=sns.violinplot(y="LIMIT_BAL",x="SEX",palette="rocket",data=df)


#### The LIMIT_BAL seems to be almost similar among the two sexes

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
box=sns.boxplot(y="LIMIT_BAL",x="SEX",palette="cubehelix",data=df,hue="default.payment.next.month")


#### The default payment status and limit balance are almost the same for both sexes
### So, SEX won't help us that much in identifying a defaulter

### **EDUCATION**

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.countplot(x="EDUCATION",data=df,palette="Reds")

In [None]:
df["EDUCATION"].value_counts()

#### There is no class 0 mentioned in the description of the dataset.
#### Classes 5 and 6 both denote unknown. Combining them would be a good idea
#### As there are fewer than 30 such entries, it would be better to drop the rows having 0 as EDUCATION


In [None]:
df.shape

In [None]:
df=df.drop(df[df["EDUCATION"]==0].index)

In [None]:
df.shape

In [None]:
# EDUCATION classes 5 and 6 both denote "unknown", so fold 6 into 5.
# A vectorised replace does the same as the old row-wise helper `com`, and avoids
# depending on a function name that a later cell redefines with different behaviour.
df["EDUCATION"]=df["EDUCATION"].replace({6:5})

In [None]:
df["EDUCATION"].value_counts()

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.countplot(x="EDUCATION",data=df,palette="Reds")

### Now this looks better 

In [None]:
# Percentage of defaulters within each EDUCATION class,
# listed in the order the classes first appear in the data.
li=[
    len(df[(df["EDUCATION"]==level) & (df["default.payment.next.month"]==1)])
    /len(df[df["EDUCATION"]==level])*100
    for level in df["EDUCATION"].unique()
]

In [None]:
ed=pd.DataFrame({"Education":df["EDUCATION"].unique(),"% of Defaulters":li})

In [None]:
ed.set_index("Education")

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.barplot(x="Education",y="% of Defaulters",data=ed,palette="Blues")

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
box=sns.boxplot(y="LIMIT_BAL",x="EDUCATION",palette="bwr",data=df,hue="default.payment.next.month")


 ### **AGE**

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
dist=sns.distplot(df["AGE"],color="green",bins=45)
axes.ticklabel_format(style="plain",axis="x")

### The owners of the credit cards are mostly between 25 and 35 years old.
#### It is interesting to see people over 80 using credit cards

In [None]:
sns.jointplot(y="LIMIT_BAL",x="AGE",data=df)

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
box=sns.boxplot(y="AGE",x="default.payment.next.month",palette="afmhot",data=df)

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
box=sns.boxplot(y="AGE",x="default.payment.next.month",palette="tab20",data=df,hue="EDUCATION")

#### Not much can be inferred from this graph, as AGE is almost the same for both labels. So this feature isn't that helpful

### **PAY_0**

In [None]:
df["PAY_0"].unique()

In [None]:
df["PAY_0"].value_counts()

### Since -1 means the client has paid their bills up to date and 1 means the payment is one month overdue, 0 and -2 are treated as uninformative here, so let us map both 0 and -2 to -1

In [None]:
li='PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	'.split()

In [None]:
# Collapse the repayment codes -2 and 0 into -1 for every PAY_* column named in `li`.
# Done with one vectorised replace instead of redefining the helper `com`
# (which an earlier cell already defined with different behaviour) and applying it row by row.
df[li]=df[li].replace({-2:-1,0:-1})

In [None]:
df["PAY_0"].value_counts()

In [None]:
 sns.countplot(x="PAY_0",data=df)

#### As we saw earlier, there are far fewer defaulters than clients who paid their bills
#### This graph confirms it, as we see that a large number of clients have repayment status -1 (duly paid) in September 2005

In [None]:
# Percentage of defaulters within each PAY_0 status,
# listed in the order the statuses first appear in the data.
li=[
    len(df[(df["PAY_0"]==status) & (df["default.payment.next.month"]==1)])
    /len(df[df["PAY_0"]==status])*100
    for status in df["PAY_0"].unique()
]

In [None]:
ed=pd.DataFrame({"PAY_0":df["PAY_0"].unique(),"% of Defaulters":li})
ed.set_index("PAY_0")

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.barplot(x="PAY_0",y="% of Defaulters",data=ed,palette="twilight")

### As expected, the more months you go without paying your dues, the higher the chance of you becoming a defaulter
#### Now let's see if the trend is the same in the other months (PAY_2, PAY_3, ...)

In [None]:
# Percentage of defaulters per PAY_2 status: the target is 0/1, so its group mean * 100
# is the default rate. This replaces the manual loop and the discarded set_index call.
ed=(df.groupby("PAY_2")["default.payment.next.month"].mean()*100).rename("% of Defaulters").reset_index()

fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.barplot(x="PAY_2",y="% of Defaulters",data=ed,palette="twilight")

In [None]:
# Percentage of defaulters per PAY_3 status: the target is 0/1, so its group mean * 100
# is the default rate. This replaces the manual loop and the discarded set_index call.
ed=(df.groupby("PAY_3")["default.payment.next.month"].mean()*100).rename("% of Defaulters").reset_index()

fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.barplot(x="PAY_3",y="% of Defaulters",data=ed,palette="twilight")

In [None]:
# Percentage of defaulters per PAY_4 status: the target is 0/1, so its group mean * 100
# is the default rate. This replaces the manual loop and the discarded set_index call.
ed=(df.groupby("PAY_4")["default.payment.next.month"].mean()*100).rename("% of Defaulters").reset_index()

fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.barplot(x="PAY_4",y="% of Defaulters",data=ed,palette="twilight")

In [None]:
# Percentage of defaulters per PAY_5 status: the target is 0/1, so its group mean * 100
# is the default rate. This replaces the manual loop and the discarded set_index call.
ed=(df.groupby("PAY_5")["default.payment.next.month"].mean()*100).rename("% of Defaulters").reset_index()

fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.barplot(x="PAY_5",y="% of Defaulters",data=ed,palette="twilight")

In [None]:
# Percentage of defaulters per PAY_6 status: the target is 0/1, so its group mean * 100
# is the default rate. This replaces the manual loop and the discarded set_index call.
ed=(df.groupby("PAY_6")["default.payment.next.month"].mean()*100).rename("% of Defaulters").reset_index()

fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.barplot(x="PAY_6",y="% of Defaulters",data=ed,palette="twilight")

#### These results agree with PAY_0
#### A common trend I could observe in these graphs is the decrease in the percentage of defaulters in class 5 after class 4
#### The reason for this could be that after 4 months of overdue payment the bank starts warning people about their credit card bill, and people might then have paid their bills (just an intuition)

### **MARRIAGE**

In [None]:
df["MARRIAGE"].unique()

In [None]:
df["MARRIAGE"].value_counts()

#### We still don't know what classes 0 and 3 mean

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.countplot(x="MARRIAGE",data=df,palette="icefire")

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.boxplot(x="MARRIAGE",y="LIMIT_BAL",data=df)

### So the class 3 people in MARRIAGE seem to have a much lower LIMIT_BAL compared to the rest of the classes
#### The lower the limit balance, the higher the probability of being a defaulter. So according to this intuition class 3 should have a higher share of defaulters. Let's check this theory

In [None]:
# Percentage of defaulters per MARRIAGE class: the target is 0/1, so its group mean * 100
# is the default rate. This replaces the manual loop and the discarded set_index call.
ed=(df.groupby("MARRIAGE")["default.payment.next.month"].mean()*100).rename("% of Defaulters").reset_index()

fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.barplot(x="MARRIAGE",y="% of Defaulters",data=ed,palette="Wistia")

### As expected, class 3 has the highest percentage of defaulters
#### Now let's compare this feature with age

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.boxplot(x="MARRIAGE",y="AGE",data=df)

#### As expected, class 2 (single people) has a lower average age than class 1 (married), but nothing can be inferred about the meaning of class 3 and class 0
#### Class 3 seems to have a higher average age than the other classes

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.boxplot(x="MARRIAGE",y="BILL_AMT1",data=df)

#### The presence of outliers is obscuring this plot, so let us remove them later to get a clearer view of the graph
#### But there are about 500 instances where "BILL_AMT1" < 0


In [None]:
df[df["BILL_AMT1"]<0]["PAY_0"].value_counts()

### Here we see a remarkable feature
* If "BILL_AMT1" < 0, PAY_0 is only -1 or 1
* The lower the value of PAY_0, the better the chance that the person isn't a defaulter. So does having a negative BILL_AMT1 decrease the chances of a person being a defaulter?
* Let us study this in detail

In [None]:
df.shape

In [None]:
len(df[df["BILL_AMT1"]<0])

In [None]:
# Default rate (%) among clients with a negative BILL_AMT1.
# The `== 1` comparison belongs inside the parentheses: `&` binds tighter than `==`,
# so the old form compared the combined mask with 1 and only worked because the target is 0/1.
(len(df[(df["BILL_AMT1"]<0) & (df["default.payment.next.month"] == 1)])/len(df[df["BILL_AMT1"]<0]))*100

#### Percentage of people with a negative bill amount who default on their payments

In [None]:
# Default rate (%) among clients with a non-negative BILL_AMT1.
# The `== 1` comparison belongs inside the parentheses: `&` binds tighter than `==`,
# so the old form compared the combined mask with 1 and only worked because the target is 0/1.
(len(df[(df["BILL_AMT1"]>=0) & (df["default.payment.next.month"] == 1)])/len(df[df["BILL_AMT1"]>=0]))*100

#### Percentage of people with a non-negative bill amount who default on their payments

#### There isnt a huge difference.Lets examine this feature some more 

In [None]:
# jointplot creates its own figure, so a preceding plt.figure() only leaves an empty
# extra figure in the output; set the size through `height` instead.
sns.jointplot(x="BILL_AMT1",y="LIMIT_BAL",data=df,kind="hex",height=7)

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.boxplot(x="default.payment.next.month",y="BILL_AMT1",data=df)

#### nothing much can be said from this at all
#### let us check if this is similar for other BILL_AMTx(BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6)

In [None]:
bi="BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	".split()

In [None]:
f, axes = plt.subplots(2,3,figsize=(20,10))
# One box plot per monthly bill amount (BILL_AMT1..BILL_AMT6) split by default status,
# filling the 2x3 grid row by row.
for month, ax in enumerate(axes.flat, start=1):
    sns.boxplot(x="default.payment.next.month",y=f"BILL_AMT{month}",data=df,ax=ax)
plt.tight_layout()

#### Everything looks mostly similar and nothing can be distinguished 

### **PAY_AMT1**

In [None]:
df["PAY_AMT1"].value_counts()

In [None]:
len(df[df["PAY_AMT1"]<0])

In [None]:
len(df[(df["PAY_AMT1"]==0) & (df["default.payment.next.month"]==1)])/len(df[(df["PAY_AMT1"]==0)])*100

In [None]:
len(df[(df["PAY_AMT1"]>0) & (df["default.payment.next.month"]==1)])/len(df[(df["PAY_AMT1"]>0)])*100

In [None]:
# For each of PAY_AMT1..PAY_AMT6 print the default rate (%) among clients who paid
# nothing that month, then among clients who paid something.
# A print("\n") separates consecutive months, exactly as before.
for month in range(1,7):
    col=f"PAY_AMT{month}"
    defaulted=df["default.payment.next.month"]==1
    for paid_mask in (df[col]==0, df[col]>0):
        print(len(df[paid_mask & defaulted])/len(df[paid_mask])*100)
    if month<6:
        print("\n")

### A person is almost twice as likely to be a defaulter if they have PAY_AMT1 = 0

In [None]:
f, axes = plt.subplots(2,3,figsize=(20,10))
# One box plot per monthly payment amount (PAY_AMT1..PAY_AMT6) split by default status,
# filling the 2x3 grid row by row.
for month, ax in enumerate(axes.flat, start=1):
    sns.boxplot(x="default.payment.next.month",y=f"PAY_AMT{month}",data=df,ax=ax)
plt.tight_layout()

#### This data is very skewed
#### Let us remove the outliers later so that we can see what the graphs look like without them

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.distplot(df["PAY_AMT1"],color="red")

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
sns.scatterplot(x="PAY_AMT1",y="LIMIT_BAL",data=df)

In [None]:
X=df[['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]

In [None]:
# Select the target by name rather than by position, so this keeps working if columns
# are added or reordered (the model section below also selects it by name).
y = df["default.payment.next.month"]

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# Score every candidate feature in X against the default target y with the ANOVA F-test.
# k="all" keeps every feature: only the scores are used here, nothing is filtered out,
# and it stays valid if the number of columns in X changes.
bestfeatures = SelectKBest(score_func=f_classif, k="all")
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# Put feature names and scores side by side for easier reading
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  # name the two columns
print(featureScores.nlargest(16,'Score'))  # show the 16 highest-scoring features

#taken from https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e

## DATA CLEANING

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
dist=sns.distplot(df["LIMIT_BAL"],color="purple")
axes.ticklabel_format(style="plain",axis="x")

In [None]:
df[df["LIMIT_BAL"]>750000]

In [None]:
df.shape

In [None]:
df.drop((df[df["LIMIT_BAL"]>750000]).index,inplace=True)

In [None]:
df.shape

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
dist=sns.distplot(df["BILL_AMT1"],color="orange")
axes.ticklabel_format(style="plain",axis="x")

In [None]:
df[df["BILL_AMT1"]>620000]

In [None]:
df.shape

In [None]:
df.drop((df[df["BILL_AMT1"]>620000]).index,inplace=True)

In [None]:
df.shape

In [None]:
df[df["BILL_AMT2"]>620000]

In [None]:
df.drop((df[df["BILL_AMT2"]>620000]).index,inplace=True)

In [None]:
df.shape

In [None]:
df[df["BILL_AMT3"]>620000]

In [None]:
df.drop((df[df["BILL_AMT3"]>620000]).index,inplace=True)

In [None]:
df.shape

In [None]:
df.drop((df[df["BILL_AMT4"]>620000]).index,inplace=True)

In [None]:
df.shape

In [None]:
df[df["BILL_AMT5"]>620000]

In [None]:
df.drop((df[df["BILL_AMT5"]>620000]).index,inplace=True)

In [None]:
df.shape

In [None]:
df[df["BILL_AMT6"]>620000]

In [None]:
df.drop((df[df["BILL_AMT6"]>620000]).index,inplace=True)

In [None]:
df.shape

In [None]:
fig=plt.figure(figsize=(10,7))
axes=fig.add_axes([0,0,0.8,0.8])
dist=sns.distplot(df["PAY_AMT1"],color="BLACK")
axes.ticklabel_format(style="plain",axis="x")

In [None]:
df[df["PAY_AMT1"]>400000]

In [None]:
df.drop(df[df["PAY_AMT1"]>400000].index,inplace=True)

In [None]:
df.shape

In [None]:
df.drop(df[df["PAY_AMT2"]>400000].index,inplace=True)

In [None]:
df.shape

In [None]:
df[df["PAY_AMT3"]>400000]

In [None]:
df.drop(df[df["PAY_AMT3"]>400000].index,inplace=True)

In [None]:
df.shape

In [None]:
df[df["PAY_AMT4"]>400000]

In [None]:
df.drop(df[df["PAY_AMT4"]>400000].index,inplace=True)

In [None]:
df.shape

In [None]:
df[df["PAY_AMT5"]>400000]

In [None]:
df.drop(df[df["PAY_AMT5"]>400000].index,inplace=True)

In [None]:
df[df["PAY_AMT6"]>400000]

In [None]:
# Drop the PAY_AMT6 outliers previewed in the previous cell.
# (The old line filtered on PAY_AMT5 again, so PAY_AMT6 outliers were never removed.)
df.drop(df[df["PAY_AMT6"]>400000].index,inplace=True)

In [None]:
df.shape

### CONSTRUCTING A MODEL WITHOUT ONE HOT ENCODING

In [None]:
df.columns

In [None]:
X=df[[ 'LIMIT_BAL',  'SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3','PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']].values

In [None]:
y=df['default.payment.next.month'].values

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
std=StandardScaler()

In [None]:
std.fit(X_train)

In [None]:
X_train=std.transform(X_train)

In [None]:
X_test=std.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.svm import SVC
svc=SVC()

In [None]:
# Fix the seed so the forest, and therefore the reports below, are reproducible
# across runs; 42 matches the seed used for the train/test split.
rfc=RandomForestClassifier(n_estimators=200,random_state=42)

In [None]:
rfc.fit(X_train,y_train)

In [None]:
predict=rfc.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
print(classification_report(y_test,predict))

In [None]:
print(confusion_matrix(y_test,predict))

In [None]:
svc.fit(X_train,y_train)

In [None]:
predict2=svc.predict(X_test)

In [None]:
print(classification_report(y_test,predict2))

In [None]:
print(confusion_matrix(y_test,predict2))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn=KNeighborsClassifier(n_neighbors=1)

In [None]:
knn.fit(X_train,y_train)

In [None]:
pred4=knn.predict(X_test)

In [None]:
print(classification_report(y_test,pred4))

In [None]:
# Misclassification rate on X_test for k = 1..39; error_rate[j] belongs to k = j + 1.
# The per-k predictions get their own name so the loop no longer overwrites `predict`,
# which holds the random-forest predictions used by the earlier report cells.
# NOTE(review): k is being tuned on the test set, so the reported score is optimistic;
# consider a validation split or cross-validation.
error_rate=[]
for i in range(1,40):
    knn=KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    knn_pred=knn.predict(X_test)
    error_rate.append(np.mean(knn_pred!= y_test))

In [None]:
plt.plot(range(1,40),error_rate)

In [None]:
# error_rate[0] corresponds to k=1, so the best k is the position of the minimum plus one
# (the bare list index is off by one relative to n_neighbors).
a=np.min(error_rate)
error_rate.index(a)+1

In [None]:
knn=KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train,y_train)
pred5=knn.predict(X_test)
print(confusion_matrix(y_test,pred5))

In [None]:
print(classification_report(y_test,pred5))