In [40]:
import pandas as pd

In [41]:
# Path of the raw CSV, relative to the notebook's working directory.
filename = "test.csv"
# Parse the file into the DataFrame that the later cells derive their copies from.
df2 = pd.read_csv(filepath_or_buffer=filename)

In [42]:
# Column labels of the loaded frame, as a plain Python list.
df2.columns.tolist()

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area']

In [43]:
# Per-column dtypes; the `object` columns are the ones selected for encoding below.
df2.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object

In [44]:
# The categorical features are stored as strings (dtype `object`). Pull them
# into an independent copy so they can be converted to integer codes.
obj_df2 = df2.select_dtypes(include="object").copy()
obj_df2.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,Urban
1,LP001022,Male,Yes,1,Graduate,No,Urban
2,LP001031,Male,Yes,2,Graduate,No,Urban
3,LP001035,Male,Yes,2,Graduate,No,Urban
4,LP001051,Male,No,0,Not Graduate,No,Urban


In [45]:
# Count of missing entries in each categorical column.
obj_df2.isna().sum()

Loan_ID           0
Gender           11
Married           0
Dependents       10
Education         0
Self_Employed    23
Property_Area     0
dtype: int64

In [46]:
# Give the two columns names that say which category the value 1 stands for
# once they are encoded (Male -> 1, Graduate -> 1).
obj_df2 = obj_df2.rename(
    columns={
        "Gender": "Gender_Male",
        "Education": "Graduate",
    }
)

In [47]:
# Summary of the categorical columns; the `top` row is the most frequent value of each.
obj_df2.describe()

Unnamed: 0,Loan_ID,Gender_Male,Married,Dependents,Graduate,Self_Employed,Property_Area
count,367,356,367,357,367,344,367
unique,367,2,2,4,2,2,3
top,LP001380,Male,Yes,0,Graduate,No,Urban
freq,1,286,233,200,283,307,140


In [48]:
# Encoding legend: for every categorical column, the integer code that replaces
# each of its string categories. Missing values are not part of this mapping.
cleanup_nums = dict(
    Gender_Male={"Male": 1, "Female": 0},
    Married={"Yes": 1, "No": 0},
    Graduate={"Graduate": 1, "Not Graduate": 0},
    Dependents={"0": 0, "1": 1, "2": 2, "3+": 3},
    Self_Employed={"Yes": 1, "No": 0},
    Property_Area={"Semiurban": 0, "Urban": 1, "Rural": 2},
)

In [49]:
# Inspect the raw category labels of one column; this is how the keys of the
# encoding legend were worked out. Swap in any other column of obj_df2
# ("Married", "Graduate", "Dependents", "Self_Employed", "Property_Area")
# to check its labels the same way.
obj_df2["Gender_Male"].value_counts()

Male      286
Female     70
Name: Gender_Male, dtype: int64

In [50]:
# Fill missing categorical entries with each column's most frequent category,
# written directly as its integer code from cleanup_nums:
# Male -> 1, Married "Yes" -> 1, Dependents "0" -> 0, Graduate -> 1,
# Self_Employed "No" -> 0, Property_Area "Urban" -> 1.
values = {
    "Gender_Male": 1,
    "Married": 1,
    "Dependents": 0,
    "Graduate": 1,
    "Self_Employed": 0,
    "Property_Area": 1,
}
obj_df2 = obj_df2.fillna(values)

In [51]:
# Map every remaining string category to its integer code from cleanup_nums.
# The result is assigned back rather than mutated with inplace=True, and each
# encoded column is then cast to int64 explicitly: replace()'s automatic
# object -> int downcasting is deprecated in recent pandas, so without the cast
# the encoded columns can be left as dtype `object`. The cast also raises if a
# label was not covered by the mapping, instead of letting it through silently.
obj_df2 = obj_df2.replace(cleanup_nums).astype({col: "int64" for col in cleanup_nums})
obj_df2.head()

Unnamed: 0,Loan_ID,Gender_Male,Married,Dependents,Graduate,Self_Employed,Property_Area
0,LP001015,1,1,0,1,0,1
1,LP001022,1,1,1,1,0,1
2,LP001031,1,1,2,1,0,1
3,LP001035,1,1,2,1,0,1
4,LP001051,1,0,0,0,0,1


In [52]:
# First rows of the encoded categorical frame.
obj_df2.head(n=5)

Unnamed: 0,Loan_ID,Gender_Male,Married,Dependents,Graduate,Self_Employed,Property_Area
0,LP001015,1,1,0,1,0,1
1,LP001022,1,1,1,1,0,1
2,LP001031,1,1,2,1,0,1
3,LP001035,1,1,2,1,0,1
4,LP001051,1,0,0,0,0,1


In [53]:
# Dtypes of the original frame again: the encoding was applied to the copy
# obj_df2, so df2 still holds its columns as they were loaded.
df2.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object

In [54]:
# The categorical columns are finished; now take the numeric columns
# (int64 and float64) into their own copy for cleaning.
alt_df2 = df2.select_dtypes(include=["int64", "float64"]).copy()
alt_df2.head(n=5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,5720,0,110.0,360.0,1.0
1,3076,1500,126.0,360.0,1.0
2,5000,1800,208.0,360.0,1.0
3,2340,2546,100.0,360.0,
4,3276,0,78.0,360.0,1.0


In [55]:
# Count of missing entries in each numeric column.
alt_df2.isna().sum()

ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
dtype: int64

In [56]:
# Summary statistics of the numeric columns; used to pick the fill values below.
alt_df2.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,367.0,367.0,362.0,361.0,338.0
mean,4805.599455,1569.577657,136.132597,342.537396,0.825444
std,4910.685399,2334.232099,61.366652,65.156643,0.38015
min,0.0,0.0,28.0,6.0,0.0
25%,2864.0,0.0,100.25,360.0,1.0
50%,3786.0,1025.0,125.0,360.0,1.0
75%,5060.0,2430.5,158.0,360.0,1.0
max,72529.0,24000.0,550.0,480.0,1.0


In [57]:
# Fill the numeric gaps with fixed values taken from describe():
# LoanAmount and Loan_Amount_Term get their column mean with the fraction
# dropped (136.13 -> 136, 342.54 -> 342). Credit_History gets 1, which is its
# median, not its mean of about 0.83.
vals = dict(LoanAmount=136, Loan_Amount_Term=342, Credit_History=1)
alt_df2 = alt_df2.fillna(vals)

In [58]:
# Put the cleaned categorical and numeric halves back side by side, matching
# rows on the index both frames carry.
new_df2 = obj_df2.join(other=alt_df2, how="left")

In [59]:
# Last rows of the combined frame, as a quick check of the join.
new_df2.tail(n=5)

Unnamed: 0,Loan_ID,Gender_Male,Married,Dependents,Graduate,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
362,LP002971,1,1,3,0,1,1,4009,1777,113.0,360.0,1.0
363,LP002975,1,1,0,1,0,1,4158,709,115.0,360.0,1.0
364,LP002980,1,0,0,1,0,0,3250,1993,126.0,360.0,1.0
365,LP002986,1,1,0,1,0,2,5000,2393,158.0,360.0,1.0
366,LP002989,1,0,0,1,1,2,9200,0,98.0,180.0,1.0


In [48]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
new_df2.to_csv(path_or_buf="cleaned_test.csv", index=False)