In [1]:
# Import libraries

import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation

%matplotlib inline 
sns.set(color_codes=True)

### Import filtered Student's file 

In [2]:
# read in the .csv file - of filtered student_info

data_df = pd.read_csv("C:/Users/sinea/OneDrive/Documents OneDrive/06 - CCT Masters in DA/Capstone - 2023/Capstone_Project_2023/Python workings  notebooks/filtered_df2.csv")


In [3]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id_student         383 non-null    int64 
 1   gender             383 non-null    int64 
 2   highest_education  383 non-null    object
 3   age_band           383 non-null    object
 4   studied_credits    383 non-null    int64 
 5   final_result       383 non-null    object
 6   tenure             383 non-null    int64 
dtypes: int64(4), object(3)
memory usage: 21.1+ KB


In [4]:
# check for nan values and replace with zero

data_df.fillna(0, inplace=True)

### Encoding categorical data

Using One-Hot Encoding

In [5]:
# One-Hot Encoding for 'highest_education', 'age_band', 'final_result', 'tenure_band'

#data_df_encoded = pd.get_dummies(data_df, columns=['highest_education', 'age_band', 'final_result', 'tenure_band'])
data_df_encoded = pd.get_dummies(data_df, columns=['highest_education', 'age_band', 'final_result'])

In [6]:
data_df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 15 columns):
 #   Column                                         Non-Null Count  Dtype
---  ------                                         --------------  -----
 0   id_student                                     383 non-null    int64
 1   gender                                         383 non-null    int64
 2   studied_credits                                383 non-null    int64
 3   tenure                                         383 non-null    int64
 4   highest_education_A Level or Equivalent        383 non-null    uint8
 5   highest_education_HE Qualification             383 non-null    uint8
 6   highest_education_Lower Than A Level           383 non-null    uint8
 7   highest_education_Post Graduate Qualification  383 non-null    uint8
 8   age_band_0-35                                  383 non-null    uint8
 9   age_band_35-55                                 383 non-null    uint8
 10  ag

In [7]:
# convert the dtype of the recently one-hot encoded columns to int64 from uint8 dtype

columns_to_convert = [
    'highest_education_A Level or Equivalent',
    'highest_education_HE Qualification',
    'highest_education_Lower Than A Level',
    'highest_education_Post Graduate Qualification',
    'age_band_0-35',
    'age_band_35-55',
    'age_band_55<=',
    'final_result_Distinction',
    'final_result_Fail',
    'final_result_Pass',
    'final_result_Withdrawn', 
#    'tenure_band_0-5',
#    'tenure_band_11-15',
#    'tenure_band_16 or more',
#    'tenure_band_5-10'
]

# Convert the selected columns to int64 dtype
data_df_encoded[columns_to_convert] = data_df_encoded[columns_to_convert].astype('int64')

In [8]:
#data_df_encoded.info()

## Algorithm 2 - Decision Tree - Classifier

#### https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression_multioutput.html#sphx-glr-auto-examples-tree-plot-tree-regression-multioutput-py

In [9]:
# import libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [10]:
# divide the dataset into features (X) and the target variable (y)

# variable 1 = gender
X = data_df_encoded.drop(columns=['gender'])  # drop target variable from the dataframe
y = data_df_encoded['gender']        # id target variable 

# variable 2 = studied_credits
X2 = data_df_encoded.drop(columns=['studied_credits'])
y2 = data_df_encoded['studied_credits']

# variable 3 = tenure
X3 = data_df_encoded.drop(columns=['tenure'])
y3 = data_df_encoded['tenure']

In [21]:
# split the data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=0)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.2, random_state=0)

In [12]:
# create Decision Tree with criterion entropy

#classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)

### Fit and tune with model for variable 1 = gender 

Hyperparm tuning completed with GridSearch CV

In [13]:
# define the parameter values that should be searched
# instance, max_depth parameter can be any integer from 1 to 20
# max_features can be auto, sqrt, log2 or None

# Test 1
#param_grid = {'max_depth': list(range(1, 21)), 'max_features': ['auto', 'sqrt', 'log2', None]}
param_grid = {'max_depth': list(range(1, 21)), 'max_features': ['auto', 'sqrt', 'log2', None]}

In [14]:
# instantiate the grid for data search
grid = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy')

# fit the grid with data
grid.fit(X_train, y_train)












In [15]:
# show the best model and retrain classifier 

print("The best accuracy score using GridSearchCV is"grid.best_score_)
print("The best parameters using GridSearchCV are", grid.best_params_)

0.623978494623656
{'max_depth': 4, 'max_features': None}


In [16]:
# train the model using the training sets

classifier = DecisionTreeClassifier(criterion='entropy', 
                                    max_depth=grid.best_params_['max_depth'], 
                                    max_features=grid.best_params_['max_features'], 
                                    random_state=0
                                   )


In [17]:
classifier.fit(X_train, y_train)

In [18]:
# make predictions on the test set

y_pred = classifier.predict(X_test)

In [24]:
# calculate Model Accuracy

print("The score for accuracy using the model is:", accuracy_score(y_test, y_pred))

The score for accuracy using the model is: 0.5194805194805194


### Fit and tune with model for variable 2 -  studied_credits

In [22]:
# instantiate the grid for data search
grid2 = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy')

# fit the grid with data
grid2.fit(X_train2, y_train2)














In [23]:
# show the best model and retrain classifier 

print("The best accuracy score using GridSearchCV is", grid.best_score_)
print("The best parameters using GridSearchCV are", grid.best_params_)

The best accuracy score using GridSearchCV is 0.623978494623656
The best parameters using GridSearchCV are {'max_depth': 4, 'max_features': None}


In [29]:
# train the model using the training sets

classifier2 = DecisionTreeClassifier(criterion='entropy', 
                                    max_depth=grid2.best_params_['max_depth'], 
                                    max_features=grid2.best_params_['max_features'], 
                                    random_state=0
                                   )


In [30]:
classifier2.fit(X_train2, y_train2)

In [31]:
# make predictions on the test set

y_pred2 = classifier2.predict(X_test2)

Feature names unseen at fit time:
- tenure
Feature names seen at fit time, yet now missing:
- studied_credits



In [None]:
# calculate Model Accuracy

print("The score for accuracy using the studied_credits is:", accuracy_score2(y_test2, y_pred2))

### Fit and tune with model for variable 2 -  studied_credits

In [None]:
# define the parameter values that should be searched
# instance, max_depth parameter can be any integer from 1 to 20
# max_features can be auto, sqrt, log2 or None

# Test 1
#param_grid = {'max_depth': list(range(1, 21)), 'max_features': ['auto', 'sqrt', 'log2', None]}
param_grid = {'max_depth': list(range(1, 21)), 'max_features': ['auto', 'sqrt', 'log2', None]}

In [None]:
# instantiate the grid for data search
grid3 = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy')

# fit the grid with data
grid3.fit(X_train3, y_train3)


In [None]:
# show the best model and retrain classifier 

print("The best accuracy score using GridSearchCV is"grid3.best_score_)
print("The best parameters using GridSearchCV are", grid3.best_params_)

In [None]:
# train the model using the training sets

classifier3 = DecisionTreeClassifier(criterion='entropy', 
                                    max_depth=grid3.best_params_['max_depth'], 
                                    max_features=grid3.best_params_['max_features'], 
                                    random_state=0
                                   )


In [None]:
classifier3.fit(X_train3, y_train3)

In [None]:
# make predictions on the test set

y_pred3 = classifier3.predict(X_test3)

In [None]:
# calculate Model Accuracy

print("The score for accuracy using tenure is:", accuracy_score3(y_test3, y_pred3))

#### References

1. https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression_multioutput.html#sphx-glr-auto-examples-tree-plot-tree-regression-multioutput-py
2. https://medium.com/towards-data-science/decision-tree-classifier-explained-in-real-life-picking-a-vacation-destination-6226b2b60575
3. 