In [2]:
from google.colab import data_table
data_table.enable_dataframe_formatter()

In [1]:
cd "/content/drive/MyDrive/Data Analytics/Datasets"

/content/drive/MyDrive/Data Analytics/Datasets


[Loan Dataset Description](https://drive.google.com/file/d/1DW1c3NA75Q3pDlvB-U2MYSuPOAcVETAx/view)

## Null Values

Problem Statement


Given the loans dataset, your task is to find out the percentage of missing values present in each column.

Expected Output
Print the percentage of the missing values in ascending order by rounding off the values to 2 decimal places.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('loan_data.csv')

print(round((df.isnull().sum()/len(df))*100,2).sort_values())

Loan_ID              0.00
Education            0.00
ApplicantIncome      0.00
CoapplicantIncome    0.00
Property_Area        0.00
Loan_Status          0.00
Married              0.49
Gender               2.12
Loan_Amount_Term     2.28
Dependents           2.44
LoanAmount           3.58
Self_Employed        5.21
Credit_History       8.14
dtype: float64


## Fill Missing Values

After analyzing the missing values present in each column, your task is to replace all the missing values present in the numerical features with the Mean for that particular column and for the categorical features, replace the missing values with the Mode for that particular column.

Expected Output
Print the number of missing values present in each column.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('loan_data.csv')

num = df.select_dtypes(include = 'number')

num.fillna(num.mean(), inplace = True)

df[num.columns] = num

cat = df.select_dtypes(include = 'object')

cat.fillna(cat.mode().iloc[0], inplace = True)

df[cat.columns] = cat

print(df.isnull().sum())

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [None]:
eg = {'a': [2, 1, 1],
      'b': [3, 2, 2]}

d = pd.DataFrame(eg)

d.mode()

Unnamed: 0,a,b
0,1,2


In [None]:
df.select_dtypes(include = 'number')

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,5849,0.0,,360.0,1.0
1,4583,1508.0,128.0,360.0,1.0
2,3000,0.0,66.0,360.0,1.0
3,2583,2358.0,120.0,360.0,1.0
4,6000,0.0,141.0,360.0,1.0
...,...,...,...,...,...
609,2900,0.0,71.0,360.0,1.0
610,4106,0.0,40.0,180.0,1.0
611,8072,240.0,253.0,360.0,1.0
612,7583,0.0,187.0,360.0,1.0


## Encoding Categorical Features

To fit the dataset into a machine learning model requires it to have all the features in numerical format. We need to perform categorical feature encoding on our dataset to make it suitable for training.

Your task is to first drop the Loan_ID column as it does not contain any information about the dataset.

Secondly, produce dummy variables for all the remaining non-numeric features other than the Loan_Status column.

Note
1. Loan_Status should not be converted to a dummy variable as it is a target column.

2. use drop_first=True while using pd.dummies to create the dummy variables.
Expected Output
Print the shape of the dataset after encoding categorical features

1.  Drop Loan Id
2.  Ozhivakk Loan Status
3. use get_dummies with drop_first = True for all categorical variables

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('loan_data_1.csv')

df.drop('Loan_ID', axis = 1, inplace = True)

cat_features = df.select_dtypes(include = "object").iloc[:, :-1]

cat_columns = cat_features.columns.to_list()

cat_features = pd.get_dummies(cat_features, drop_first = True)

df[cat_features.columns] = cat_features

df.drop(cat_columns, axis = 1, inplace = True)

print(df.shape)

(614, 15)


## Value Counts

After converting the categorical features to dummy variables, your task is to perform binary encoding on the Loan_Status target column.

Note
For 'Y' replace it with 1.
For 'N' replace it with 0.
Expected Output
Print the value count for the Loan_Status column after binary encoding

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('loan_data_2.csv')

df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

print(df.Loan_Status.value_counts())

1    422
0    192
Name: Loan_Status, dtype: int64


## Outliers using Z score

After cleaning and encoding the categorical features, the next step is to work on the numerical features.

Following are the numerical features present in the dataset.

numeric_columns = ['ApplicantIncome',
                                 'CoapplicantIncome',
                                 'LoanAmount',
                                 'Loan_Amount_Term',
                                 'Credit_History']
Your task is to find the number of outliers present in each numerical column using the z-score method.

Note
Use threshold = 3 for finding outliers.
Expected Output
Print the column name and the number of outliers separated by space, for each column in a separate line.

column_name num_of_outliers
column_name num_of_outliers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

df = pd.read_csv('loan_data_3.csv')

numeric_columns = ['ApplicantIncome',
                   'CoapplicantIncome',
                   'LoanAmount',
                   'Loan_Amount_Term',
                   'Credit_History']

outliers = df[abs(zscore(df[numeric_columns])) >3]

output = outliers.describe().loc['count', numeric_columns]

for col_name in output.index:
  print(col_name, output.loc[col_name])

ApplicantIncome 8.0
CoapplicantIncome 6.0
LoanAmount 15.0
Loan_Amount_Term 12.0
Credit_History 0.0


In [None]:
#CN Solution
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('loan_data.csv')
numeric_columns = ['ApplicantIncome',
'CoapplicantIncome',
'LoanAmount',
'Loan_Amount_Term',
'Credit_History']
for column in numeric_columns:
    outliers = []
    threshold = 3
    data = df[column]
    data_mean = np.mean(data)
    data_std = np.std(data)
    for d in data:
        z_score = (d - data_mean)/data_std
        if (np.abs(z_score) > threshold):
            outliers.append(d)
    print(column, len(outliers))

## IQR

Now, using the IQR method, you need to remove the outliers for all the features you detected in the precious question.

Expected Output

Print the mean for each of the numerical features and the mode for each of the categorical features.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

df = pd.read_csv('loan_data_3.csv')

numeric_columns = ['ApplicantIncome',
                   'CoapplicantIncome',
                   'LoanAmount',
                   'Loan_Amount_Term',
                   'Credit_History']

#quantiles for numerical features
q1, q3 = df[numeric_columns].quantile(0.25), df[numeric_columns].quantile(0.75)

IQR = q3 - q1

lower_bound = q1 - (1.5 * IQR)
upper_bound = q3 + (1.5 * IQR)

df_new = df[(df[numeric_columns] > lower_bound) & (df[numeric_columns] < upper_bound)]

print(df_new[numeric_columns].mean())

# cat_columns = df.select_dtypes(include = 'object').columns.to_list()

# print(df_new[cat_columns].mode())

# cat_columns

df[numeric_columns] = df_new[numeric_columns]

df.head()

ApplicantIncome      4124.723404
CoapplicantIncome    1289.129060
LoanAmount            129.190345
Loan_Amount_Term             NaN
Credit_History               NaN
dtype: float64


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,5849.0,0.0,146.412162,,,Y,1,0,0,0,0,0,0,0,1
1,4583.0,1508.0,128.0,,,N,1,1,1,0,0,0,0,0,0
2,3000.0,0.0,66.0,,,Y,1,1,0,0,0,0,1,0,1
3,2583.0,2358.0,120.0,,,Y,1,1,0,0,0,1,0,0,1
4,6000.0,0.0,141.0,,,Y,1,0,0,0,0,0,0,0,1


## Imbalance

We also need to work on the imbalance in the dataset present. For this, you need to visualise the frequency of the “loan_status” feature.

Expected Output

Print the number of “Yes” and “No” for loan_status.

In [None]:
import pandas as pd

df = pd.read_csv("loan_data_2.csv")

print(df.Loan_Status.value_counts())

Y    422
N    192
Name: Loan_Status, dtype: int64


## Resampling

You have been provided with the car purchase dataset; check whether the dataset is imbalanced and then find out which type of sampling is beneficial in this context.

Expected Output

After performing resampling, print the frequency of rows for each class.

In [14]:
import pandas as pd
from sklearn.utils import resample

df = pd.read_csv("loan_data_3.csv")

target = df.loc[:, "Loan_Status"]

target = target.map({'Y': 1,
                     'N': 0})

print("Original Values:")
print(target.value_counts())

majority = target[target == 1]
minority = target[target == 0]

minority_upsampled = resample(minority, n_samples = 422, random_state = 0)

target = pd.concat([majority, minority_upsampled])

print("After Upsampling Minority:")
print(target.value_counts())

Original Values:
1    422
0    192
Name: Loan_Status, dtype: int64
After Upsampling Minority:
1    422
0    422
Name: Loan_Status, dtype: int64


## Tuning

You need to do hyperparameter tuning for the loan dataset on the solver parameter. Use the “liblinear” solver and evaluate using the f1 score. Compare the liblinear and lbfgs solvers scores and determine which is better.

Expected Output

Print the f1 score for both solvers.

In [41]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("loan_data_3.csv")

features = df.drop(columns = ['Loan_Status'])
target = df.loc[:, "Loan_Status"]

target = target.map({'Y': 1,
                     'N': 0})

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 0)

#solver = 'liblinear'
model_1 = LogisticRegression(max_iter = 1000, solver = 'liblinear')
model_1.fit(x_train, y_train)

y_pred_1 = model_1.predict(x_test)

# report_1 = classification_report(y_test, y_pred_1, output_dict = True)

# report_1 = pd.DataFrame(report_1)

print("f1-score for 'liblinear' solver:", f1_score(y_test, y_pred_1))


#solver = 'lbfgs'
model_2 = LogisticRegression(max_iter = 1000, solver = 'lbfgs')
model_2.fit(x_train, y_train)

y_pred_2 = model_2.predict(x_test)

# report_2 = classification_report(y_test, y_pred_2, output_dict = True)

# report_2 = pd.DataFrame(report_2)

print("f1-score for 'liblinear' solver:", f1_score(y_test, y_pred_2))

f1-score for 'liblinear' solver: 0.897119341563786
f1-score for 'liblinear' solver: 0.897119341563786


## Evaluate

You need to evaluate the model trained using the “liblinear” solver. Find out the f1, recall and precision for the loan dataset.

Expected Output

Print the precision.
Print the recall.
Print the f1 score.

In [43]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

df = pd.read_csv("loan_data_3.csv")

features = df.drop(columns = ['Loan_Status'])
target = df.loc[:, "Loan_Status"]

target = target.map({'Y': 1,
                     'N': 0})

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 0)

#solver = 'liblinear'
model_1 = LogisticRegression(max_iter = 1000, solver = 'liblinear')
model_1.fit(x_train, y_train)

y_pred_1 = model_1.predict(x_test)

report_1 = classification_report(y_test, y_pred_1, output_dict = True)

report_1 = pd.DataFrame(report_1)

report_1["weighted avg"]

precision      0.849026
recall         0.837662
f1-score       0.818453
support      154.000000
Name: weighted avg, dtype: float64

## Visualize

After evaluating the logistic regression model on various parameters, you now need to visualise the ROC/AUC curve on the loan dataset.

Expected Output

Print the percentage of AUC.

In [45]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("loan_data_3.csv")

features = df.drop(columns = ['Loan_Status'])
target = df.loc[:, "Loan_Status"]

target = target.map({'Y': 1,
                     'N': 0})

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 0)

#solver = 'liblinear'
model = LogisticRegression(max_iter = 1000, solver = 'liblinear')
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(round(roc_auc_score(y_test, y_pred), 2))

0.72


## Classification Report

You have been provided with a dataset of the admission of students in a university, which contains three labels called “Selected”, “Not Selected” and “Waiting”.

You need to train a logistic regression model using default parameters and show the classification report in such a way that all the label names are printed.

Expected Output

Print the classification report

In [23]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

df = pd.read_csv("loan_data_3.csv")

features = df.drop(columns = ['Loan_Status'])
target = df.loc[:, "Loan_Status"]

target = target.map({'Y': 1,
                     'N': 0})

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 0)

model = LogisticRegression(max_iter = 1000)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

report = classification_report(y_test, y_pred, output_dict = True)

report = pd.DataFrame(report)

report

                   0           1  accuracy   macro avg  weighted avg
precision   0.909091    0.825758  0.837662    0.867424      0.849026
recall      0.465116    0.981982  0.837662    0.723549      0.837662
f1-score    0.615385    0.897119  0.837662    0.756252      0.818453
support    43.000000  111.000000  0.837662  154.000000    154.000000


## Best Parameters

Using grid search CV, you need to train a logistic regression model on the admission dataset, by making a dictionary of parameters: [“penalty”, “dual”, “C”, “solver”].

After performing a grid search CV, you need to find the best parameters.

Expected Output

 Print the best parameters.

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

df = pd.read_csv("loan_data_3.csv")

features = df.drop(columns = ['Loan_Status'])
target = df.loc[:, "Loan_Status"]

target = target.map({'Y': 1,
                     'N': 0})

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 0)

model = LogisticRegression(max_iter = 1000)

params = {"penalty": ['l1', 'l2', 'elasticnet', None],
          "dual": [True, False],
          "C": range(10),
          "solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}

find_params = GridSearchCV(model, params)

find_params.fit(x_train, y_train)

print(find_params.best_params_)

In [30]:
print(find_params.best_params_)

{'C': 1, 'dual': False, 'penalty': 'l2', 'solver': 'newton-cg'}


In [25]:
print(range(10))

range(0, 10)


## Weighted Average

On the admission dataset provided, you need to create a decision tree classifier and do hyperparameter tuning using grid search CV. After performing all the steps find out the best parameters.
Expected Output

Print the Weighted Average F1 score.

In [36]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

df = pd.read_csv("loan_data_3.csv")

features = df.drop(columns = ['Loan_Status'])
target = df.loc[:, "Loan_Status"]

target = target.map({'Y': 1,
                     'N': 0})

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 0)

model = DecisionTreeClassifier()

params = {"criterion": ["gini", "entropy", "log_loss"],
          "splitter": ["best", "random"]}

find_params = GridSearchCV(model, params)

find_params.fit(x_train, y_train)

# print(find_params.best_params_)

best_params = find_params.best_params_

model = DecisionTreeClassifier(criterion = best_params['criterion'], splitter = best_params['splitter'])
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

report = classification_report(y_test, y_pred, output_dict = True)

report = pd.DataFrame(report)

print(report.loc['f1-score', 'weighted avg'])

{'criterion': 'log_loss', 'splitter': 'random'}
0.7291322314049588


## Best Parameters

For the “admission dataset”, you need to create a KNeighbors Classifier and do hyperparameter tuning on:

[
“algorithm”: [‘ball_tree’, ‘kd_tree’, ‘brute’],
“p”: [1, 2, 5],
“weights”: [‘uniform’, ‘distance’],
“n_neighbors”: [3, 5, 7]
]
Expected Output

Print the best parameters by using the f1 score.

In [38]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

df = pd.read_csv("loan_data_3.csv")

features = df.drop(columns = ['Loan_Status'])
target = df.loc[:, "Loan_Status"]

target = target.map({'Y': 1,
                     'N': 0})

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 0)

params = {"algorithm": ['ball_tree', 'kd_tree', 'brute'],
          'p': [1, 2, 5],
          'weights': ['uniform', 'distance'],
          'n_neighbors': [3, 5, 7]}

find_param = GridSearchCV(KNeighborsClassifier(), params)
find_param.fit(x_train, y_train)

print(find_param.best_params_)

{'algorithm': 'ball_tree', 'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}
