## Capstone - EDA

##### Reference - https://analyse.kmi.open.ac.uk/open_datasetb

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)


## Read in the CSV file

In [2]:
# read in the .csv file

data_df = pd.read_csv("filtered_df2.csv")

In [3]:
data_df.head(5)

Unnamed: 0,id_student,gender,highest_education,age_band,studied_credits,final_result,tenure
0,11391,0,HE Qualification,55<=,240,Pass,6
1,28400,1,HE Qualification,35-55,60,Pass,19
2,30268,1,A Level or Equivalent,35-55,60,Withdrawn,14
3,31604,1,A Level or Equivalent,35-55,60,Pass,10
4,32885,1,Lower Than A Level,0-35,60,Pass,7


## One-Hot Encoding of Categorical Data

1. Reference: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [4]:
from sklearn.preprocessing import OneHotEncoder
from numpy import asarray

In [5]:
# select the columns for one-hot encoding

columns_to_encode = ['highest_education', 'age_band', 'final_result']

In [6]:
# apply one-hot encoding using pd.get_dummies()

one_hot_encoded_df = pd.get_dummies(data_df, columns=columns_to_encode)

In [7]:
# concatenate the one-hot encoded DataFrame with the original DataFrame

data_df_encoded = pd.concat([data_df, one_hot_encoded_df], axis=1)

In [8]:
# remove the original columns that have been one-hot encoded

data_df_encoded.drop(columns=columns_to_encode, inplace=True)

In [9]:
# check the new df to make sure the one-hot encoding has worked 

#print(data_df_encoded)

data_df_encoded.head(3)

Unnamed: 0,id_student,gender,studied_credits,tenure,id_student.1,gender.1,studied_credits.1,tenure.1,highest_education_A Level or Equivalent,highest_education_HE Qualification,highest_education_Lower Than A Level,highest_education_Post Graduate Qualification,age_band_0-35,age_band_35-55,age_band_55<=,final_result_Distinction,final_result_Fail,final_result_Pass,final_result_Withdrawn
0,11391,0,240,6,11391,0,240,6,0,1,0,0,0,0,1,0,0,1,0
1,28400,1,60,19,28400,1,60,19,0,1,0,0,0,1,0,0,0,1,0
2,30268,1,60,14,30268,1,60,14,1,0,0,0,0,1,0,0,0,0,1


In [10]:
# convert the dtype of the recently one-hot encoded columns to int64 from uint8 dtype

columns_to_convert = [
    'highest_education_A Level or Equivalent',
    'highest_education_HE Qualification',
    'highest_education_Lower Than A Level',
    'highest_education_Post Graduate Qualification',
    'age_band_0-35',
    'age_band_35-55',
    'age_band_55<=',
    'final_result_Distinction',
    'final_result_Fail',
    'final_result_Pass',
    'final_result_Withdrawn'
]

# Convert the selected columns to int64 dtype
data_df_encoded[columns_to_convert] = data_df_encoded[columns_to_convert].astype('int64')

In [11]:
data_df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 19 columns):
 #   Column                                         Non-Null Count  Dtype
---  ------                                         --------------  -----
 0   id_student                                     383 non-null    int64
 1   gender                                         383 non-null    int64
 2   studied_credits                                383 non-null    int64
 3   tenure                                         383 non-null    int64
 4   id_student                                     383 non-null    int64
 5   gender                                         383 non-null    int64
 6   studied_credits                                383 non-null    int64
 7   tenure                                         383 non-null    int64
 8   highest_education_A Level or Equivalent        383 non-null    int64
 9   highest_education_HE Qualification             383 non-null    int64
 10  hi

## Logistic Regression code

In [12]:
# models  and classifiers for Supervised Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression

# libraries for the confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# create tables
from tabulate import tabulate

In [13]:
# drop any duplicate columns that might be present
# and stop the code being turned into a numpy array

data_df_encoded = data_df_encoded.loc[:, ~data_df_encoded.columns.duplicated()]


In [14]:
# divide the dataset into features (X) and the target variable (y)
X = data_df_encoded.drop(columns=['gender'])
y = data_df_encoded['gender']

In [15]:
# Use .values to ensure y is 1-dimensional
# removes ValueError: y should be a 1d array, got an array of shape (306, 2) instead.

#y = y.values


In [16]:
# spliting the dataframe into Test and Train data for Algorithm 1
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [17]:
# creating a new classifer using logistic regression

logreg_clf = LogisticRegression()

#### Hyperparam Training

In [18]:
# define the hyperparameter grid to search

param_grid = {
    'C': [0.01, 0.1, 1, 10],  # Inverse of regularization strength
    'penalty': ['l1', 'l2']    # Regularization type (L1 or L2)
}

In [19]:
# perform Grid Search with 10-fold cross-validation

grid_search = GridSearchCV(logreg_clf, param_grid, cv=10)

grid_search.fit(X_Train, y_Train)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.64365591]


In [20]:
# print the best hyperparameters and corresponding accuracy

print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Best Hyperparameters: {'C': 0.01, 'penalty': 'l2'}
Best Accuracy: 0.6436559139784945


In [21]:
# training the classifier
# X_Train and y_Train values are created in the above cells

#logreg_clf.fit(X_Train, y_Train)

In [22]:
# train the classifier using the best hyperparameters on the entire training data

best_logreg_clf = grid_search.best_estimator_

best_logreg_clf.fit(X_Train, y_Train)

In [23]:
# evaluate the final model on the test data

test_accuracy = best_logreg_clf.score(X_Test, y_Test)

print("Test accuracy:", test_accuracy)

Test accuracy: 0.5714285714285714


In [24]:
# Predicting the result based on the test data

#y_pred = logreg_clf.predict(X_Test)
y_pred = best_logreg_clf.predict(X_Test)

In [25]:
# CONFUSION MATRIX ......to check the accuracy of the classification

y_pred_train = best_logreg_clf.predict(X_Train)

#print(y_pred_train)

In [26]:
# training the confusion matrix, based on predicted data and training data

cm_Train = confusion_matrix(y_pred_train, y_Train)

#print(cm_Train)

In [27]:
# testing the confusion matrix results, based on predicted and test data

cm_Test = confusion_matrix(y_pred, y_Test)

#print(cm_Test)

In [28]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_test = ((cm_Test[0][0] + cm_Test[1][1])/len(y_Test))

print('Accuracy for training set for Logistic Regression = {:.2%}'.format(logR_test))

Accuracy for training set for Logistic Regression = 57.14%


In [29]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_train = (cm_Train[0][0] + cm_Train[1][1])/len(y_Train)

print('Accuracy for training set for Logistic Regression = {:.2%}'.format(logR_train))

Accuracy for training set for Logistic Regression = 62.09%


## References 

1. https://analyse.kmi.open.ac.uk/open_dataset, accessed 21 July 2023
2. https://stackoverflow.com/questions/58030352/csv-file-transpose-column-to-row-in-python, accessed on 25 July 2023
3. 