In [34]:
# Import libraries

import pandas as pd
import numpy as np

import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation

%matplotlib inline 
sns.set(color_codes=True)

# create tables
from tabulate import tabulate

### Import test file data

In [2]:
# read in the .csv file - of filtered student_info

data_df = pd.read_csv("test_data.csv")

In [3]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749 entries, 0 to 748
Data columns (total 16 columns):
 #   Column                                         Non-Null Count  Dtype
---  ------                                         --------------  -----
 0   id_student                                     749 non-null    int64
 1   gender                                         749 non-null    int64
 2   studied_credits                                749 non-null    int64
 3   tenure                                         749 non-null    int64
 4   highest_education_A Level or Equivalent        749 non-null    int64
 5   highest_education_HE Qualification             749 non-null    int64
 6   highest_education_Lower Than A Level           749 non-null    int64
 7   highest_education_No Formal quals              749 non-null    int64
 8   highest_education_Post Graduate Qualification  749 non-null    int64
 9   age_band_0-35                                  749 non-null    int64
 10  ag

In [4]:
# check for nan values and replace with zero

data_df.fillna(0, inplace=True)

In [5]:
#data_df.info()

### Algorithm 2 - Decision Tree - Classifier

In [6]:
# import libraries

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [7]:
# divide the dataset into features (X) and the target variable (y)

# variable 1 = gender
X = data_df.drop(columns=['gender'])  # drop target variable from the dataframe
y = data_df['gender']        # id target variable 

# variable 2 = studied_credits
X2 = data_df.drop(columns=['studied_credits'])
y2 = data_df['studied_credits']

# variable 3 = tenure
X3 = data_df.drop(columns=['tenure'])
y3 = data_df['tenure']

In [8]:
# split the data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=0)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.2, random_state=0)

In [9]:
# create Decision Tree with criterion entropy

classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)

### Fit and tune with model for variable 1 = gender

Hyperparm tuning completed with GridSearch CV


In [10]:
# apply parmameter grid to identify best param's 

param_grid = {'max_depth': list(range(1, 21)), 'max_features': ['auto', 'sqrt', 'log2', None]}

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
# instantiate the grid for data search
grid = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy')

# fit the grid with data
grid.fit(X_train, y_train)














In [13]:
# show the best model and retrain classifier 

print("The best accuracy score using GridSearchCV is",grid.best_score_)
print("The best parameters using GridSearchCV are", grid.best_params_)

The best accuracy score using GridSearchCV is 0.8080225988700566
The best parameters using GridSearchCV are {'max_depth': 1, 'max_features': 'auto'}


In [14]:
# train the model using the best param's identified sets

classifier = DecisionTreeClassifier(criterion='entropy', 
                                    max_depth=grid.best_params_['max_depth'], 
                                    max_features=grid.best_params_['max_features'], 
                                    random_state=0
                                   )


In [15]:
classifier.fit(X_train, y_train)



In [16]:
# make predictions on the test set

y_pred = classifier.predict(X_test)

In [17]:
# calculate Model Accuracy

print("The score for accuracy using the model is:", accuracy_score(y_test, y_pred))

The score for accuracy using the model is: 0.7933333333333333


### Fit and tune with model for variable 2 - studied_credits

In [18]:
# instantiate the grid for data search
grid2 = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy')

# fit the grid with data
grid2.fit(X_train2, y_train2)













In [19]:
# show the best model and retrain classifier 

print("The best accuracy score using GridSearchCV is", grid.best_score_)
print("The best parameters using GridSearchCV are", grid.best_params_)

The best accuracy score using GridSearchCV is 0.8080225988700566
The best parameters using GridSearchCV are {'max_depth': 1, 'max_features': 'auto'}


In [20]:
# train the model using the training sets

classifier2 = DecisionTreeClassifier(criterion='entropy', 
                                    max_depth=grid2.best_params_['max_depth'], 
                                    max_features=grid2.best_params_['max_features'], 
                                    random_state=0
                                   )


In [21]:
classifier2.fit(X_train2, y_train2)



In [22]:
# make predictions on the test set

y_pred2 = classifier2.predict(X_test2)

In [23]:
# calculate Model Accuracy

print("The score for accuracy using the studied_credits is:", accuracy_score(y_test2, y_pred2))

The score for accuracy using the studied_credits is: 0.92


### Fit and tune with model for variable 3 - tenure

In [24]:
# define the parameter values that should be searched

param_grid = {'max_depth': list(range(1, 21)), 'max_features': ['auto', 'sqrt', 'log2', None]}

In [25]:
# instantiate the grid for data search
grid3 = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy')

# fit the grid with data
grid3.fit(X_train3, y_train3)














In [26]:
# show the best model and retrain classifier 

print("The best accuracy score using GridSearchCV is", grid3.best_score_)
print("The best parameters using GridSearchCV are", grid3.best_params_)

The best accuracy score using GridSearchCV is 0.06847457627118644
The best parameters using GridSearchCV are {'max_depth': 1, 'max_features': 'auto'}


In [27]:
# train the model using the training sets

classifier3 = DecisionTreeClassifier(criterion='entropy', 
                                    max_depth=grid3.best_params_['max_depth'], 
                                    max_features=grid3.best_params_['max_features'], 
                                    random_state=0
                                   )


In [28]:
classifier3.fit(X_train3, y_train3)



In [29]:
# make predictions on the test set

y_pred3 = classifier3.predict(X_test3)

In [30]:
# calculate Model Accuracy

print("The score for accuracy using tenure is:", accuracy_score(y_test3, y_pred3))

The score for accuracy using tenure is: 0.04


### Display the results for the 3 tests

In [37]:
algorithm_1_results = {'gender': [accuracy_score(y_test, y_pred)], 
        'studied_credits': [accuracy_score(y_test2, y_pred2)], 
        'Tenure':[accuracy_score(y_test3, y_pred3)]}

print(tabulate(algorithm_1_results, headers='keys', tablefmt='fancy_grid'))

╒══════════╤═══════════════════╤══════════╕
│   gender │   studied_credits │   Tenure │
╞══════════╪═══════════════════╪══════════╡
│ 0.793333 │              0.92 │     0.04 │
╘══════════╧═══════════════════╧══════════╛
