In [1]:
# Import libraries

import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)


### Read in the dataset

In [2]:
# read in the .csv file - of filtered student_info

data_df = pd.read_csv("test_data.csv")

In [3]:
#data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749 entries, 0 to 748
Data columns (total 16 columns):
 #   Column                                         Non-Null Count  Dtype
---  ------                                         --------------  -----
 0   id_student                                     749 non-null    int64
 1   gender                                         749 non-null    int64
 2   studied_credits                                749 non-null    int64
 3   tenure                                         749 non-null    int64
 4   highest_education_A Level or Equivalent        749 non-null    int64
 5   highest_education_HE Qualification             749 non-null    int64
 6   highest_education_Lower Than A Level           749 non-null    int64
 7   highest_education_No Formal quals              749 non-null    int64
 8   highest_education_Post Graduate Qualification  749 non-null    int64
 9   age_band_0-35                                  749 non-null    int64
 10  ag

In [4]:
# check for nan values and replace with zero

data_df.fillna(0, inplace=True)

### Algorithm 1 - Logistic Regression 

In [6]:
# models  and classifiers for Supervised Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression

# libraries for the confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# create tables
from tabulate import tabulate

In [9]:
# drop any duplicate columns that might be present
# and stop the code being turned into a numpy array

data_df = data_df.loc[:, ~data_df.columns.duplicated()]


In [10]:
# divide the dataset into features (X) and the target variable (y)

# Target variable = gender
X = data_df.drop(columns=['gender'])
y = data_df['gender']

# Target Variable = studied_credits
X2 = data_df.drop(columns=['studied_credits'])
y2 = data_df['studied_credits']

# Target Variable = tenure
X3 = data_df.drop(columns=['tenure'])
y3 = data_df['tenure']

In [11]:
# spliting the dataframe into Test and Train data for Algorithm 1

# target variable = gender
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# target variable = studied_credits
X_Train2, X_Test2, y_Train2, y_Test2 = train_test_split(X2, y2, test_size = 0.2, random_state = 0)

# target variable = tenure
X_Train3, X_Test3, y_Train3, y_Test3 = train_test_split(X3, y3, test_size = 0.2, random_state = 0)


In [12]:
# creating a new classifer using logistic regression

logreg_clf = LogisticRegression()

#### Train and fit model using variable 1 - 'gender'

In [13]:
# define the hyperparameter grid to search

param_grid = {
    'C': [0.01, 0.1, 1, 10],  # Inverse of regularization strength
    'penalty': ['l1', 'l2']    # Regularization type (L1 or L2)
}

In [14]:
# perform Grid Search with 5-fold cross-validation

grid_search = GridSearchCV(logreg_clf, param_grid, cv=10)

grid_search.fit(X_Train, y_Train)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.8080226]


In [15]:
# print the best hyperparameters and corresponding accuracy

print("Best Hyperparameters for gender:", grid_search.best_params_)
print("Best Accuracy for gender:", grid_search.best_score_)

Best Hyperparameters for gender: {'C': 0.01, 'penalty': 'l2'}
Best Accuracy for gender: 0.8080225988700566


In [16]:
# train the classifier using the best hyperparameters on the entire training data

best_logreg_clf = grid_search.best_estimator_

best_logreg_clf.fit(X_Train, y_Train)

In [17]:
# evaluate the final model on the test data

test_accuracy = best_logreg_clf.score(X_Test, y_Test)

print("Test accuracy for gender:", test_accuracy)

Test accuracy for gender: 0.7933333333333333


In [18]:
# Predicting the result based on the test data

#y_pred = logreg_clf.predict(X_Test)
y_pred = best_logreg_clf.predict(X_Test)

In [21]:
# CONFUSION MATRIX ......to check the accuracy of the classification

y_pred_train = best_logreg_clf.predict(X_Train)

#print(y_pred_train)

In [22]:
# training the confusion matrix, based on predicted data and training data

cm_Train = confusion_matrix(y_pred_train, y_Train)

#print(cm_Train)

In [23]:
# testing the confusion matrix results, based on predicted and test data

cm_Test = confusion_matrix(y_pred, y_Test)

#print(cm_Test)

In [24]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_test = ((cm_Test[0][0] + cm_Test[1][1])/len(y_Test))

print('Accuracy for training set for Logistic Regression on gender = {:.2%}'.format(logR_test))

Accuracy for training set for Logistic Regression on gender = 79.33%


In [25]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_train = (cm_Train[0][0] + cm_Train[1][1])/len(y_Train)

print('Accuracy for training set for Logistic Regression on gender = {:.2%}'.format(logR_train))

Accuracy for training set for Logistic Regression on gender = 80.80%


In [27]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_test = ((cm_Test[0][0] + cm_Test[1][1])/len(y_Test))

print('Accuracy for test set for Logistic Regression on gender = {:.2%}'.format(logR_test))

Accuracy for test set for Logistic Regression on gender = 79.33%


### Train and fit model using variable 2 - 'studied_credits'

In [28]:
# perform Grid Search with 10-fold cross-validation

grid_search2 = GridSearchCV(logreg_clf, param_grid, cv=10)

grid_search2.fit(X_Train2, y_Train2)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.93322034]


In [29]:
# print the best hyperparameters and corresponding accuracy

print("Best Hyperparameters for studied credits:", grid_search.best_params_)
print("Best Accuracy for studied credits:", grid_search.best_score_)

Best Hyperparameters for studied credits: {'C': 0.01, 'penalty': 'l2'}
Best Accuracy for studied credits: 0.8080225988700566


In [30]:
# train the classifier using the best hyperparameters on the entire training data

best_logreg_clf2 = grid_search.best_estimator_

best_logreg_clf2.fit(X_Train2, y_Train2)

In [31]:
# evaluate the final model on the test data

test_accuracy2 = best_logreg_clf2.score(X_Test2, y_Test2)

print("Test accuracy for studied credits :", test_accuracy2)

Test accuracy for studied credits : 0.92


In [32]:
# Predicting the result based on the test data

#y_pred = logreg_clf.predict(X_Test)
y_pred2 = best_logreg_clf2.predict(X_Test2)

In [33]:
# CONFUSION MATRIX ......to check the accuracy of the classification

y_pred_train2 = best_logreg_clf2.predict(X_Train2)

#print(y_pred_train)

In [34]:
# training the confusion matrix, based on predicted data and training data

cm_Train2 = confusion_matrix(y_pred_train2, y_Train2)

#print(cm_Train)

In [38]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_test2 = ((cm_Test[0][0] + cm_Test[1][1])/len(y_Test))

print('Accuracy for test set for Logistic Regression = {:.2%}'.format(logR_test))

Accuracy for test set for Logistic Regression = 79.33%


In [36]:
# testing the confusion matrix results, based on predicted and test data

cm_Test2 = confusion_matrix(y_pred2, y_Test2)

#print(cm_Test)

In [37]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_train2 = (cm_Train2[0][0] + cm_Train2[1][1])/len(y_Train2)

print('Accuracy for training set for Logistic Regression using studied credits = {:.2%}'.format(logR_train2))

Accuracy for training set for Logistic Regression using studied credits = 93.32%


### Train and fit model using variable 2 - 'tenure'

In [39]:
# perform Grid Search with 10-fold cross-validation

grid_search3 = GridSearchCV(logreg_clf, param_grid, cv=10)

grid_search3.fit(X_Train3, y_Train3)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\sinea\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.07011299]


In [40]:
# print the best hyperparameters and corresponding accuracy

print("Best Hyperparameters for tenure:", grid_search3.best_params_)
print("Best Accuracy for tenure:", grid_search.best_score_)

Best Hyperparameters for tenure: {'C': 0.01, 'penalty': 'l2'}
Best Accuracy for tenure: 0.8080225988700566


In [41]:
# train the classifier using the best hyperparameters on the entire training data

best_logreg_clf3 = grid_search.best_estimator_

best_logreg_clf3.fit(X_Train3, y_Train3)

In [42]:
# evaluate the final model on the test data

test_accuracy3 = best_logreg_clf.score(X_Test3, y_Test3)

print("Test accuracy for tenure:", test_accuracy3)

Test accuracy for tenure: 0.02666666666666667


In [43]:
# Predicting the result based on the test data

#y_pred = logreg_clf.predict(X_Test)
y_pred3 = best_logreg_clf.predict(X_Test3)

In [44]:
# CONFUSION MATRIX ......to check the accuracy of the classification

y_pred_train3 = best_logreg_clf.predict(X_Train3)

#print(y_pred_train)

In [45]:
# training the confusion matrix, based on predicted data and training data

cm_Train3 = confusion_matrix(y_pred_train3, y_Train3)

#print(cm_Train)

In [46]:
# testing the confusion matrix results, based on predicted and test data

cm_Test3 = confusion_matrix(y_pred3, y_Test3)

#print(cm_Test)

In [52]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_test3 = ((cm_Test3[0][0] + cm_Test3[1][1])/len(y_Test3))

print('Accuracy for test set for Logistic Regression using tenure = {:.2%}'.format(logR_test3))

Accuracy for test set for Logistic Regression using tenure = 0.00%


In [53]:
# print the Accuracy value of for test data 
# output is formatted in line with values take from the start of the 2d cm_Train array , 
# and position 1 of the cm_Train array, divided by the length of the y_Train 

logR_train3 = (cm_Train3[0][0] + cm_Train3[1][1])/len(y_Train3)

print('Accuracy for training set for Logistic Regression using tenure = {:.2%}'.format(logR_train3))

Accuracy for training set for Logistic Regression using tenure = 0.00%


### Display results of the 3 variables

In [51]:
algorithm_4_results = {'gender': [logR_test], 
        'studied_credits': [logR_test2], 
        'Tenure':[logR_test3]}

print(tabulate(algorithm_4_results, headers='keys', tablefmt='fancy_grid'))

╒══════════╤═══════════════════╤══════════╕
│   gender │   studied_credits │   Tenure │
╞══════════╪═══════════════════╪══════════╡
│ 0.793333 │          0.793333 │        0 │
╘══════════╧═══════════════════╧══════════╛
