In [760]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import dmba
from pathlib import Path
import matplotlib.pylab as plt
from dmba import classificationSummary

# We start with importing necessary pacakages 

In [761]:
#Question1 : Partition the data into training (75%) and validation (25%) sets.
bank_df = dmba.load_data('UniversalBank.csv')
bank_df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0


In [762]:
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [763]:
bank_df = pd.get_dummies(bank_df, columns = ['Education'], drop_first = False, dtype = int)

We're changing the 'Education' section of the 'bank_df' data into numbers using method one-hot encoding, making it easier for our analysis.

In [764]:
bank_df.drop(['ID','ZIP Code'], axis = 1, inplace = True)

'ID' and 'Zip Code' are unnecessary, they are being dropped

In [765]:
X = bank_df.drop('Personal Loan', axis = 1)
y = bank_df[['Personal Loan']]

In [766]:
# Splitting the data into train and test as per the Question 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [767]:
#Question2 : Consider the following customer for classification: Age = 40, Experience = 10,
#Income = 84, Family = 2, CCAvg = 2, Education_1 = 0, Education_2 = 1, Education_3 = 0, Mortgage = 0, 
#Securities Account = 1, CD Account = 1, Online = 1, and Credit Card = 1

features = {'Age':40, 'Experience':10, 'Income':84, 
            'Family':2, 'CCAvg':2, 'Mortgage':0, 
            'Securities Account':1, 'CD Account':1, 
            'Online':1, 'CreditCard':1,'Education_1':0, 
            'Education_2':1, 'Education_3':0, }

In [768]:
features_df = pd.DataFrame(features,index=[0])

Using these features, we can predict whether a customer is likely to accept a loan offer using a KNN model.

In [769]:
#Question3 : Standardize all the data sets using mean and standard deviations.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
#This function works on the principle of Z scores, which is "Z = (X - Mean)/Standard Deviation,"
#essentially scaling the units of data, improving model performance.

In [770]:
# We are scaling the data here so that various units of measurements could be aptly comprehended by the model.

X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)

X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

scaled_customer_data = pd.DataFrame(scaler.transform(features_df), columns=features_df.columns)

When our datasets include measurements in various units, it's essential to apply feature scaling. Without it, our models may work unfairly, favoring some data over others. By scaling features, we level the playing field, improving the model's fairness and its ability to manage skewed data and outliers that might otherwise skew its predictions.

In [771]:
#Question4  : Perform a k-NN classification with all predictors except ID and ZIP code using k = 1.

KNN_model = NearestNeighbors(n_neighbors=1)
KNN_model.fit(X_train, y_train)

spacing, indices = KNN_model.kneighbors(scaled_customer_data)

print(y_train.loc[y_train.index[indices[0]]])
indices

      Personal Loan
1322              0


array([[2004]])

In [772]:
X_train.take([2004])

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Securities Account,CD Account,Online,CreditCard,Education_1,Education_2,Education_3
1322,-1.171065,-1.325634,-0.56305,-0.33917,-0.157005,-0.554029,2.926829,3.894072,0.818312,1.56419,-0.851429,1.599336,-0.652576


In [773]:
y_train.take([2004])

Unnamed: 0,Personal Loan
1322,0


The K-Nearest Neighbors (KNN) algorithm employs the Euclidean metric to identify the closest data points. With k set to 1, we are seeking the single nearest neighbor to the new customer instance. 
The training data point with the index 2004 emerges as the nearest neighbor to the new customer. 
This particular data point corresponds to a Personal Loan value of 0. Consequently, the new customer will be assigned the class label 0, indicating that they are likely to decline the loan offer.

In [774]:
#Question5 : Now find the optimal value of k using the validation data set. What is the optimal k?

from sklearn.metrics import accuracy_score

#KNN models, predict, and calculate accuracy together
accuracy_results = [
    {
        'k': k,
        'accuracy': accuracy_score(y_test, KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).predict(X_test))
    }
    for k in range(1, 15)
]

# Transforming the list of results into a DataFrame for analysis
k_accuracy = pd.DataFrame(accuracy_results)
print(k_accuracy)

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


     k  accuracy
0    1    0.9640
1    2    0.9536
2    3    0.9616
3    4    0.9568
4    5    0.9624
5    6    0.9568
6    7    0.9608
7    8    0.9536
8    9    0.9576
9   10    0.9536
10  11    0.9576
11  12    0.9528
12  13    0.9536
13  14    0.9520


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


The best k value is 1, achieving an accuracy rate of 0.9640.

In [775]:
#Question6 : Print the confusion matrix for the validation data that results from using the optimal k
from sklearn.metrics import accuracy_score, confusion_matrix

# Training the K-Nearest Neighbors model
model_knn_1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Predicting the test set outcomes
test_predictions = model_knn_1.predict(X_test)

# Computing the accuracy of the model
model_accuracy = accuracy_score(y_test, test_predictions)

confusion_matrix_results = confusion_matrix(y_test, test_predictions)

classificationSummary(y_test, model_knn_1.predict(X_test))
print("Confusion Matrix for optimal k =", model_accuracy)

  return self._fit(X, y)


Confusion Matrix (Accuracy 0.9640)

       Prediction
Actual    0    1
     0 1108   17
     1   28   97
Confusion Matrix for optimal k = 0.964


The model precisely identified 1108 instances where customers were accurately predicted not to pursue a personal loan, reflecting its strong capability in recognizing true negatives (TN). In 17 instances, however, the model incorrectly anticipated that customers would secure a personal loan, marking these as false positives (FP). There were 28 occurrences where the model failed to predict an actual personal loan uptake, indicating false negatives (FN). Conversely, the model successfully pinpointed 97 scenarios where customers indeed opted for a personal loan, showcasing its effectiveness with true positives (TP). With a precision rate reaching 96.40%, the model exhibits a significant proficiency in discerning customers' likelihood to acquire a personal loan.

In [776]:
#Question7 : Classify the customer specified in Question 2 using the best k.
#Prediction for the potential loan customer
customer_prediction = model_knn_1.predict(scaled_customer_data)
print(f"Classification outcome for the potential customer: {customer_prediction}")

Classification outcome for the potential customer: [0]


In [777]:
#Question8 : Now repartition the data into three parts: training, validation, and test sets (50%, 30%, and 20%).

def split_dataset_train_validate_test_split(dataframe, train_ratio=0.5, validation_ratio=0.3, random_seed=123):
    np.random.seed(random_seed)
    shuffled_indices = np.random.permutation(dataframe.index)
    total_size = len(dataframe.index)
    
    end_of_train = int(train_ratio * total_size)
    end_of_validation = int(validation_ratio * total_size) + end_of_train
    
    training_set = dataframe.iloc[shuffled_indices[:end_of_train]]
    validation_set = dataframe.iloc[shuffled_indices[end_of_train:end_of_validation]]
    testing_set = dataframe.iloc[shuffled_indices[end_of_validation:]]
    
    return training_set, validation_set, testing_set

train_new, validate_new, test_new = split_dataset_train_validate_test_split(bank_df)
print("Training Data:",train_new.shape)
print("Validation Data:", validate_new.shape)
print("Testing Data:", test_new.shape)

Training Data: (2500, 14)
Validation Data: (1500, 14)
Testing Data: (1000, 14)


In [778]:
# Isolating features and target variable for the training subset
features_train = train_new.drop(columns=['Personal Loan'])
target_train = train_new['Personal Loan'].to_frame()

# Preparing the test subset in a similar fashion
features_test = test_new.drop(columns=['Personal Loan'])
target_test = test_new['Personal Loan'].to_frame()

# And doing the same for the validation subset
features_validate = validate_new.drop(columns=['Personal Loan'])
target_validate = validate_new['Personal Loan'].to_frame()

# Applying scaling to the training features
scaler.fit(features_train)

In [779]:
# Scaling the training features and preserving the DataFrame structure
features_train = pd.DataFrame(scaler.transform(features_train),
                                     columns=features_train.columns,
                                     index=features_train.index)

# Applying the same scaling transformation to the test features
features_test = pd.DataFrame(scaler.transform(features_test),
                                    columns=features_test.columns,
                                    index=features_test.index)

# And scaling the validation features accordingly
features_validate = pd.DataFrame(scaler.transform(features_validate),
                                        columns=features_validate.columns,
                                        index=features_validate.index)

In [780]:
#Question9 : Apply the k-NN method with the optimal k chosen above
features = bank_df.columns.difference(['Personal Loan'])
features

Index(['Age', 'CCAvg', 'CD Account', 'CreditCard', 'Education_1',
       'Education_2', 'Education_3', 'Experience', 'Family', 'Income',
       'Mortgage', 'Online', 'Securities Account'],
      dtype='object')

In [781]:

# Training the K-Nearest Neighbors model
model_knn_1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Predicting the test set outcomes
test_predictions = model_knn_1.predict(X_test)

# Computing the accuracy of the model
model_accuracy = accuracy_score(y_test, test_predictions)

confusion_matrix_results = confusion_matrix(y_test, test_predictions)

classificationSummary(y_test, model_knn_1.predict(X_test))
print("Model accuracy for k 1 =", model_accuracy)

Confusion Matrix (Accuracy 0.9640)

       Prediction
Actual    0    1
     0 1108   17
     1   28   97
Model accuracy for k 1 = 0.964


  return self._fit(X, y)


In [782]:
#Question10 : Compare the confusion matrix of the test set with that of the training and validationsets. 
#Comment on the differences and their reason.

# Computing the confusion matrix for the training data predictions
conf_matrix_training = confusion_matrix(target_train, model_knn_1.predict(features_train))

In [783]:
print("Training Data : ")

classificationSummary(target_train, model_knn_1.predict(features_train))

Training Data : 
Confusion Matrix (Accuracy 0.9824)

       Prediction
Actual    0    1
     0 2244   18
     1   26  212


In [784]:
conf_matrix_validation = confusion_matrix(target_validate, model_knn_1.predict(features_validate))

In [785]:
print("Validation Data : ")
classificationSummary(target_validate, model_knn_1.predict(features_validate))

Validation Data : 
Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual    0    1
     0 1348    0
     1    0  152


In [786]:
conf_matrix_test = confusion_matrix(target_test, model_knn_1.predict(features_test))

In [787]:
print("Testing Data : ")
classificationSummary(target_test, model_knn_1.predict(features_test))

Testing Data : 
Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual   0   1
     0 910   0
     1   0  90


Achieving a perfect accuracy score of 1.0 on both validation and test sets is quite unusual and may suggest that the model is overfitting, especially with a KNN model using k=1. Typically, such high accuracy on unseen data could indicate that the test and validation datasets might have inadvertently been contaminated with training data or there’s a leak of information from the training set.


The root cause of these differences in performance is likely overfitting and impact of hyperparameters, where the model has learned the training data too well, capturing noise and patterns specific to the training set that don't generalize well to other data. This overfitting is evident from the stark contrast between the training accuracy and the lower scores on the validation and test sets. Also, Selecting the right hyperparameters, such as the number of neighbors (k), is crucial for a model's performance, as improper choices can hamper its ability to generalize, necessitating careful experimentation to optimize accuracy.