# Libraries for Handling Data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplot
import statsmodels.api as sm
import pandas as pd

# Importing Classification Models

In [2]:
from sklearn.linear_model import LogisticRegression as logReg
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda
from sklearn.naive_bayes import GaussianNB as nb
from sklearn.neighbors import KNeighborsClassifier as knn

ModuleNotFoundError: No module named 'sklearn'

# Libraries for splitting data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Load the dataset from 'Default.csv' (path is relative to the working
# directory) and preview its first rows.
data = pd.read_csv('Default.csv')
data.head()

In [None]:
# List the distinct labels present in the 'student' column before encoding it.
data['student'].unique()

In [None]:
# Encode the 'student' flag as integers (Yes -> 1, No -> 0).
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on the `data['student']` selection: under
# pandas copy-on-write an in-place change made through a column selection
# never reaches `data`. The explicit `.astype(int)` guarantees a numeric
# dtype for the models below, however `replace` treats object columns.
# Re-running this cell is safe: already-encoded 0/1 values pass through.
data['student'] = data['student'].replace({'Yes': 1, 'No': 0}).astype(int)
data.head()

In [None]:
# Encode the 'default' target as integers (Yes -> 1, No -> 0).
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on the `data['default']` selection: under
# pandas copy-on-write an in-place change made through a column selection
# never reaches `data`. The explicit `.astype(int)` guarantees a numeric
# dtype for the models below, however `replace` treats object columns.
# Re-running this cell is safe: already-encoded 0/1 values pass through.
data['default'] = data['default'].replace({'Yes': 1, 'No': 0}).astype(int)
data.head()

In [None]:

def analyze_column(data, column, target_column):
  """Plot and summarise one feature column against the target column.

  Shows three figures in turn -- a scatter plot of `column` against
  `target_column`, a box plot of `column` grouped by `target_column`, and a
  histogram (with KDE overlay) of `column` -- then prints the correlation
  between the two columns as computed by `DataFrame.corr()`.

  Args:
    data: DataFrame containing both columns.
    column: Name of the feature column to inspect.
    target_column: Name of the target column.
  """
  figure_size = (8, 6)
  comparison_title = f'{column} vs {target_column}'

  # Scatter plot: feature on the x-axis, target on the y-axis.
  _, scatter_ax = plt.subplots(figsize=figure_size)
  sns.scatterplot(data=data, x=column, y=target_column, ax=scatter_ax)
  scatter_ax.set_title(comparison_title)
  scatter_ax.set_xlabel(column)
  scatter_ax.set_ylabel(target_column)
  plt.show()

  # Box plot: one box of the feature per target value.
  _, box_ax = plt.subplots(figsize=figure_size)
  sns.boxplot(data=data, x=target_column, y=column, ax=box_ax)
  box_ax.set_title(comparison_title)
  box_ax.set_xlabel(target_column)
  box_ax.set_ylabel(column)
  plt.show()

  # Histogram of the feature on its own, with a KDE curve.
  _, hist_ax = plt.subplots(figsize=figure_size)
  sns.histplot(data[column], kde=True, ax=hist_ax)
  hist_ax.set_title(f'{column} Distribution')
  hist_ax.set_xlabel(column)
  hist_ax.set_ylabel('Frequency')
  plt.show()

  # Off-diagonal entry of the 2x2 correlation matrix of the two columns.
  pair_corr = data[[column, target_column]].corr()
  correlation = pair_corr.iloc[0, 1]
  print(f'Correlation between {column} and {target_column}: {correlation}')

def analyze_all_columns(data, target_column):
  """Run `analyze_column` for every column of `data` except the target.

  Args:
    data: DataFrame whose columns are analysed.
    target_column: Name of the target column; it is skipped as a feature and
      passed through as the target for each call.
  """
  for name in data.columns:
    if name == target_column:
      continue
    analyze_column(data, name, target_column)

def residuals_plot(model):
  """Scatter a fitted model's residuals against its fitted values.

  Draws the points on a new 8x8 figure and adds a dashed horizontal line at
  zero as a visual reference.

  Args:
    model: Fitted results object exposing `fittedvalues` and `resid`
      attributes (presumably a statsmodels regression result -- confirm
      against the caller).
  """
  # Go through `plt.subplots`: a bare `subplots` name is not among the
  # visible imports (only `subplot` is), so calling it raises NameError.
  _, ax = plt.subplots(figsize=(8, 8))
  ax.scatter(model.fittedvalues, model.resid)
  ax.set_xlabel('Fitted value')
  ax.set_ylabel('Residual')
  ax.axhline(0, c='k', ls='--')


In [None]:
# Produce the per-column plots and correlation printout for every column
# other than 'default', which is used as the target.
analyze_all_columns(data, 'default')

In [None]:
# Separate the response ('default') from the remaining predictor columns.
y = data['default']
X = data.drop(columns='default')


# Splitting data into Train and Test

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Shapes, one per line: train X, train y, test X, test y.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

# Logistic Regression

In [None]:
# Build the design matrices for the statsmodels fit below by passing each
# split through sm.add_constant.
X_train_logReg, X_test_logReg = (
    sm.add_constant(split) for split in (X_train, X_test)
)

# Shapes after adding the constant: train first, then test.
print(X_train_logReg.shape, X_test_logReg.shape, sep='\n')

In [None]:
# Preview the training design matrix returned by sm.add_constant.
X_train_logReg.head()

In [None]:
# Specify the logistic regression (response first, design matrix second) and
# fit it in one step; `.fit()` returns the fitted results object.
logRegModel = sm.Logit(endog=y_train, exog=X_train_logReg).fit()
print(logRegModel.summary())

In [None]:
# Predicted values from the fitted model for the test design matrix; show
# the first five.
y_hat = logRegModel.predict(exog=X_test_logReg)
print(y_hat.head(5))

In [None]:
# Turn the predicted values into class labels: 1 when strictly above the
# 0.5 cut-off, otherwise 0.
preds = np.asarray(y_hat > 0.5).astype(int)
print(preds[:5])

In [None]:
# First five true labels, for comparison with the predictions above.
print(y_test.head(5))

In [None]:
# Share of test rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=preds))

In [None]:
# Confusion matrix of true labels against predicted labels on the test split.
print(confusion_matrix(y_true=y_test, y_pred=preds))

# Linear Discriminant Analysis

In [None]:
# Create the LDA estimator (asking it to store its covariance estimate),
# then train it on the training split.
lda_model = lda(store_covariance=True)
lda_model.fit(X_train, y_train)

# Predict the test split and report accuracy followed by the confusion matrix.
preds = lda_model.predict(X_test)
print(accuracy_score(y_test, preds), confusion_matrix(y_test, preds), sep='\n')

# Naive Bayes

In [None]:
 #create the model
nb_model = nb()
nb_model.fit(X_train, y_train)  # train on the training split

# Predict the test split and report accuracy followed by the confusion matrix.
preds = nb_model.predict(X_test)
print(accuracy_score(y_test, preds), confusion_matrix(y_test, preds), sep='\n')

# K Nearest Neighbours

Two disadvantages
: We need to decide the number of neighbours to consider.
: It is highly sensitive to outliers; always remove them before fitting the model.

In [None]:
# 3-nearest-neighbours classifier: build and train in one expression
# (`fit` returns the estimator itself), then evaluate on the test split.
knn_model1 = knn(n_neighbors=3).fit(X_train, y_train)
preds = knn_model1.predict(X_test)
print(accuracy_score(y_test, preds), confusion_matrix(y_test, preds), sep='\n')

In [None]:
# 5-nearest-neighbours classifier: build and train in one expression
# (`fit` returns the estimator itself), then evaluate on the test split.
knn_model2 = knn(n_neighbors=5).fit(X_train, y_train)
preds = knn_model2.predict(X_test)
print(accuracy_score(y_test, preds), confusion_matrix(y_test, preds), sep='\n')

In [None]:
# 7-nearest-neighbours classifier: build and train in one expression
# (`fit` returns the estimator itself), then evaluate on the test split.
knn_model3 = knn(n_neighbors=7).fit(X_train, y_train)
preds = knn_model3.predict(X_test)
print(accuracy_score(y_test, preds), confusion_matrix(y_test, preds), sep='\n')

In [None]:
# Sweep the odd neighbour counts 1, 3, ..., 19, refitting on the training
# split each time and reporting test-split accuracy and confusion matrix.
for k in range(1, 20, 2):
  knn_model = knn(n_neighbors=k).fit(X_train, y_train)
  preds = knn_model.predict(X_test)
  print(
    f'Accuracy score when k = {k} : {accuracy_score(y_test, preds)}',
    f'Confusion Matrix when k = {k} : \n{confusion_matrix(y_test, preds)}\n',
    sep='\n',
  )



1.   Select the best KNN model and Justify your selection
2.   Select the best classification model for the Default dataset and Justify your answer. Note: You want to select the model that can balance both the default and not default predictions




ICP Answers: Nithin Songala 16344141



1.   In my view, the best KNN model is the one with **n_neighbors = 5**. Comparing its confusion matrix with those of the other models, KNN with 5 neighbours is better balanced between positives and negatives. It has the largest number of correct predictions as well as a reasonable ratio of wrong predictions.

2.   The best classification model for the Default dataset is **Naive Bayes**. Compared to the other models, Naive Bayes has more correct predictions and also a reasonable ratio of wrong predictions. This model can balance both default and non-default predictions.
The second-best model would be **Logistic Regression**.

