## Capstone - EDA

##### Reference - https://analyse.kmi.open.ac.uk/open_datasetb

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)


## Read in the CSV file

In [8]:
# read in the .csv file

data_df = pd.read_csv("C:/Users/sinea/OneDrive/Documents OneDrive/06 - CCT Masters in DA/Capstone - 2023/Capstone_Project_2023/Python workings  notebooks/filtered_df2.csv")


In [9]:
data_df.head(5)

Unnamed: 0,id_student,gender,highest_education,age_band,studied_credits,final_result,tenure
0,11391,0,HE Qualification,55<=,240,Pass,6
1,28400,1,HE Qualification,35-55,60,Pass,19
2,30268,1,A Level or Equivalent,35-55,60,Withdrawn,14
3,31604,1,A Level or Equivalent,35-55,60,Pass,10
4,32885,1,Lower Than A Level,0-35,60,Pass,7


In [10]:
# check for nan values and replace with zero

data_df.fillna(0, inplace=True)

## One-Hot Encoding of Categorical Data

1. Reference: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [11]:
from sklearn.preprocessing import OneHotEncoder
from numpy import asarray

In [12]:
# select the columns for one-hot encoding

columns_to_encode = ['highest_education', 'age_band', 'final_result']

In [13]:
# apply one-hot encoding using pd.get_dummies()

one_hot_encoded_df = pd.get_dummies(data_df, columns=columns_to_encode)

In [14]:
# concatenate the one-hot encoded DataFrame with the original DataFrame

data_df_encoded = pd.concat([data_df, one_hot_encoded_df], axis=1)

In [15]:
# remove the original columns that have been one-hot encoded

data_df_encoded.drop(columns=columns_to_encode, inplace=True)

In [16]:
# check the new df to make sure the one-hot encoding has worked 

#print(data_df_encoded)

data_df_encoded.head(3)

Unnamed: 0,id_student,gender,studied_credits,tenure,id_student.1,gender.1,studied_credits.1,tenure.1,highest_education_A Level or Equivalent,highest_education_HE Qualification,highest_education_Lower Than A Level,highest_education_Post Graduate Qualification,age_band_0-35,age_band_35-55,age_band_55<=,final_result_Distinction,final_result_Fail,final_result_Pass,final_result_Withdrawn
0,11391,0,240,6,11391,0,240,6,0,1,0,0,0,0,1,0,0,1,0
1,28400,1,60,19,28400,1,60,19,0,1,0,0,0,1,0,0,0,1,0
2,30268,1,60,14,30268,1,60,14,1,0,0,0,0,1,0,0,0,0,1


In [17]:
# convert the dtype of the recently one-hot encoded columns to int64 from uint8 dtype

columns_to_convert = [
    'highest_education_A Level or Equivalent',
    'highest_education_HE Qualification',
    'highest_education_Lower Than A Level',
    'highest_education_Post Graduate Qualification',
    'age_band_0-35',
    'age_band_35-55',
    'age_band_55<=',
    'final_result_Distinction',
    'final_result_Fail',
    'final_result_Pass',
    'final_result_Withdrawn'
]

# Convert the selected columns to int64 dtype
data_df_encoded[columns_to_convert] = data_df_encoded[columns_to_convert].astype('int64')

In [18]:
data_df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 19 columns):
 #   Column                                         Non-Null Count  Dtype
---  ------                                         --------------  -----
 0   id_student                                     383 non-null    int64
 1   gender                                         383 non-null    int64
 2   studied_credits                                383 non-null    int64
 3   tenure                                         383 non-null    int64
 4   id_student                                     383 non-null    int64
 5   gender                                         383 non-null    int64
 6   studied_credits                                383 non-null    int64
 7   tenure                                         383 non-null    int64
 8   highest_education_A Level or Equivalent        383 non-null    int64
 9   highest_education_HE Qualification             383 non-null    int64
 10  hi

## Logistic Regression code

In [19]:
# models  and classifiers for Supervised Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression

# libraries for the confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# create tables
from tabulate import tabulate

In [20]:
# drop any duplicate columns that might be present
# and stop the code being turned into a numpy array

data_df_encoded = data_df_encoded.loc[:, ~data_df_encoded.columns.duplicated()]


In [21]:
# divide the dataset into features (X) and the target variable (y)

# Target variable = gender
X = data_df_encoded.drop(columns=['gender'])
y = data_df_encoded['gender']

# Target Variable = studied_credits
X2 = data_df_encoded.drop(columns=['studied_credits'])
y2 = data_df_encoded['studied_credits']

# Target Variable = tenure
X3 = data_df_encoded.drop(columns=['tenure'])
y3 = data_df_encoded['tenure']

In [22]:
# Use .values to ensure y is 1-dimensional
# removes ValueError: y should be a 1d array, got an array of shape (306, 2) instead.

#y = y.values


In [23]:
# spliting the dataframe into Test and Train data for Algorithm 1

# target variable = gender
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# target variable = studied_credits
X_Train2, X_Test2, y_Train2, y_Test2 = train_test_split(X2, y2, test_size = 0.2, random_state = 0)

# target variable = tenure
X_Train3, X_Test3, y_Train3, y_Test3 = train_test_split(X3, y3, test_size = 0.2, random_state = 0)


In [24]:
# creating a new classifer using logistic regression

logreg_clf = LogisticRegression()

#### Train and fit model using variable 1 - 'gender'

In [25]:
# define the hyperparameter grid to search

param_grid = {
    'C': [0.01, 0.1, 1, 10],  # Inverse of regularization strength
    'penalty': ['l1', 'l2']    # Regularization type (L1 or L2)
}

In [26]:
# perform Grid Search with 5-fold cross-validation

grid_search = GridSearchCV(logreg_clf, param_grid, cv=10)

grid_search.fit(X_Train, y_Train)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.64365591]


In [27]:
# print the best hyperparameters and corresponding accuracy

print("Best Hyperparameters for gender:", grid_search.best_params_)
print("Best Accuracy for gender:", grid_search.best_score_)

Best Hyperparameters for gender: {'C': 0.01, 'penalty': 'l2'}
Best Accuracy for gender: 0.6436559139784945


In [28]:
# training the classifier
# X_Train and y_Train values are created in the above cells

#logreg_clf.fit(X_Train, y_Train)

In [29]:
# train the classifier using the best hyperparameters on the entire training data

best_logreg_clf = grid_search.best_estimator_

best_logreg_clf.fit(X_Train, y_Train)

In [30]:
# evaluate the final model on the test data

test_accuracy = best_logreg_clf.score(X_Test, y_Test)

print("Test accuracy for gender:", test_accuracy)

Test accuracy for gender: 0.5714285714285714


In [31]:
# Predicting the result based on the test data

#y_pred = logreg_clf.predict(X_Test)
y_pred = best_logreg_clf.predict(X_Test)

In [32]:
# CONFUSION MATRIX ......to check the accuracy of the classification

y_pred_train = best_logreg_clf.predict(X_Train)

#print(y_pred_train)

In [33]:
# training the confusion matrix, based on predicted data and training data

cm_Train = confusion_matrix(y_pred_train, y_Train)

#print(cm_Train)

In [34]:
# testing the confusion matrix results, based on predicted and test data

cm_Test = confusion_matrix(y_pred, y_Test)

#print(cm_Test)

In [35]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_test = ((cm_Test[0][0] + cm_Test[1][1])/len(y_Test))

print('Accuracy for training set for Logistic Regression on gender = {:.2%}'.format(logR_test))

Accuracy for training set for Logistic Regression on gender = 57.14%


In [36]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_train = (cm_Train[0][0] + cm_Train[1][1])/len(y_Train)

print('Accuracy for training set for Logistic Regression on gender = {:.2%}'.format(logR_train))

Accuracy for training set for Logistic Regression on gender = 62.09%


#### Train and fit model using variable 2 - 'studied_credits'

In [37]:
# perform Grid Search with 10-fold cross-validation

grid_search2 = GridSearchCV(logreg_clf, param_grid, cv=10)

grid_search2.fit(X_Train2, y_Train2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinea\Anaconda3\lib\site

In [38]:
# print the best hyperparameters and corresponding accuracy

print("Best Hyperparameters for studied credits:", grid_search.best_params_)
print("Best Accuracy for studied credits:", grid_search.best_score_)

Best Hyperparameters for studied credits: {'C': 0.01, 'penalty': 'l2'}
Best Accuracy for studied credits: 0.6436559139784945


In [39]:
# train the classifier using the best hyperparameters on the entire training data

best_logreg_clf2 = grid_search.best_estimator_

best_logreg_clf2.fit(X_Train2, y_Train2)

In [40]:
# evaluate the final model on the test data

test_accuracy2 = best_logreg_clf2.score(X_Test2, y_Test2)

print("Test accuracy for studied credits :", test_accuracy2)

Test accuracy for studied credits : 0.7012987012987013


In [41]:
# Predicting the result based on the test data

#y_pred = logreg_clf.predict(X_Test)
y_pred2 = best_logreg_clf2.predict(X_Test2)

In [42]:
# CONFUSION MATRIX ......to check the accuracy of the classification

y_pred_train2 = best_logreg_clf2.predict(X_Train2)

#print(y_pred_train)

In [43]:
# training the confusion matrix, based on predicted data and training data

cm_Train2 = confusion_matrix(y_pred_train2, y_Train2)

#print(cm_Train)

In [44]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_test2 = ((cm_Test[0][0] + cm_Test[1][1])/len(y_Test))

print('Accuracy for training set for Logistic Regression = {:.2%}'.format(logR_test))

Accuracy for training set for Logistic Regression = 57.14%


In [45]:
# testing the confusion matrix results, based on predicted and test data

cm_Test2 = confusion_matrix(y_pred2, y_Test2)

#print(cm_Test)

In [46]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_train2 = (cm_Train2[0][0] + cm_Train2[1][1])/len(y_Train2)

print('Accuracy for training set for Logistic Regression using studied credits = {:.2%}'.format(logR_train2))

Accuracy for training set for Logistic Regression using studied credits = 66.99%


#### Train and fit model using variable 2 - 'tenure'

In [47]:
# perform Grid Search with 10-fold cross-validation

grid_search3 = GridSearchCV(logreg_clf, param_grid, cv=10)

grid_search3.fit(X_Train3, y_Train3)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.07204301]


In [48]:
# print the best hyperparameters and corresponding accuracy

print("Best Hyperparameters for tenure:", grid_search3.best_params_)
print("Best Accuracy for tenure:", grid_search.best_score_)

Best Hyperparameters for tenure: {'C': 0.01, 'penalty': 'l2'}
Best Accuracy for tenure: 0.6436559139784945


In [57]:
# train the classifier using the best hyperparameters on the entire training data

best_logreg_clf3 = grid_search.best_estimator_

best_logreg_clf3.fit(X_Train3, y_Train3)

In [58]:
# evaluate the final model on the test data

test_accuracy3 = best_logreg_clf.score(X_Test3, y_Test3)

print("Test accuracy for tenure:", test_accuracy3)

Test accuracy for tenure: 0.09090909090909091


In [59]:
# Predicting the result based on the test data

#y_pred = logreg_clf.predict(X_Test)
y_pred3 = best_logreg_clf.predict(X_Test3)

In [60]:
# CONFUSION MATRIX ......to check the accuracy of the classification

y_pred_train3 = best_logreg_clf.predict(X_Train3)

#print(y_pred_train)

In [61]:
# training the confusion matrix, based on predicted data and training data

cm_Train3 = confusion_matrix(y_pred_train3, y_Train3)

#print(cm_Train)

In [62]:
# testing the confusion matrix results, based on predicted and test data

cm_Test3 = confusion_matrix(y_pred3, y_Test3)

#print(cm_Test)

In [63]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_test3 = ((cm_Test3[0][0] + cm_Test3[1][1])/len(y_Test3))

print('Accuracy for training set for Logistic Regression using tenure = {:.2%}'.format(logR_test3))

Accuracy for training set for Logistic Regression using tenure = 9.09%


In [64]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_train3 = (cm_Train3[0][0] + cm_Train3[1][1])/len(y_Train3)

print('Accuracy for training set for Logistic Regression using tenure = {:.2%}'.format(logR_train3))

Accuracy for training set for Logistic Regression using tenure = 7.52%


## References 

1. https://analyse.kmi.open.ac.uk/open_dataset, accessed 21 July 2023
2. https://stackoverflow.com/questions/58030352/csv-file-transpose-column-to-row-in-python, accessed on 25 July 2023
3. 