## Importing libraries

In [1]:
import numpy as np
import pandas as pd

## Importing dataset

In [2]:
# Read the modelling dataset from disk.
dataset = pd.read_csv('../../data/data-6-model.csv')

# Features: every column except the last four.
# Target: the column sitting fourth from the end.
feature_frame = dataset.iloc[:, :-4]
target_column = dataset.iloc[:, -4]
X = feature_frame.values
y = target_column.values

# Show the first sample and its label, then preview the frame.
print(X[0])
print(y[0])
dataset.head()

[6 'Mathematics' 'Multiplication' 'Art' 50]
Visual


Unnamed: 0,grade,subject,lesson,class_interests,class_average_mark,teaching_aid_category,teaching_aid,how_to_use,time
0,6,Mathematics,Multiplication,Art,50,Visual,Math Models,Create visual models to understand multiplicat...,20
1,6,Mathematics,Circles,Travelling,73,Kinesthetic,Math Trail,Go on a math trail to explore circle propertie...,30
2,6,Mathematics,Circles,Technology,85,Digital,Interactive Software,Use interactive software to visualize and mani...,25
3,6,Mathematics,Circles,Games,30,Kinesthetic,Team Quiz,Organize a quiz game where students answer cir...,45
4,6,Mathematics,Positional Value,Games,53,Kinesthetic,Number Relay,Participate in a relay race that requires arra...,50


## Encode Independent variables

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Positions of the columns to one-hot encode; every other column is passed
# through unchanged and appended after the encoded columns.
categorical_positions = [1, 2, 3]
one_hot = OneHotEncoder(sparse_output = False)
ct = ColumnTransformer(
    transformers=[('encoder', one_hot, categorical_positions)],
    remainder='passthrough',
)

# Fit the transformer and replace X with its encoded form.
X = ct.fit_transform(X)
print(X[0])

[1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 6 50]


## Encode Dependent variable

In [4]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary from y, then map each label to its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)
print(y[0])

4


## Split dataset into Test set and Train set

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size = 0.2, random_state = 1)
X_train, X_test, y_train, y_test = split_parts

## Feature scaling

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise only the last feature column (the class average mark).
# The column transformer appends the passthrough columns after the one-hot
# columns, so the mark is always the final column. Selecting it with `-1:`
# rather than a hard-coded `15:` keeps this cell correct if the number of
# one-hot categories (and therefore the matrix width) changes.
# The grade column (second to last) is left unscaled, matching the previous
# behaviour of this cell.
sc = StandardScaler()
X_train[:, -1:] = sc.fit_transform(X_train[:, -1:])  # statistics come from the training set only
X_test[:, -1:] = sc.transform(X_test[:, -1:])        # reuse the training statistics on the test set
print(X_train[0])

[1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 6
 -0.3707623099578751]


## Train Random Forest model on the Train set

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 10 trees, entropy split criterion, fixed seed for repeatability.
forest_settings = {
    'n_estimators': 10,
    'criterion': 'entropy',
    'random_state': 0,
}
classifier = RandomForestClassifier(**forest_settings)
classifier.fit(X_train, y_train)

## Predict the Test set results

In [8]:
# Predict the encoded class for every held-out sample.
y_pred = classifier.predict(X = X_test)

## Create Confusion Matrix

In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)

# Fraction of test samples classified correctly.
test_accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
print(test_accuracy)

[[11  0  0  0]
 [ 0  3  0  0]
 [ 1  0 19  3]
 [ 0  0  0 37]]
0.9459459459459459


## Applying K-Fold Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the training set.
accuracies = cross_val_score(classifier, X_train, y_train, cv = 10)

# Report mean and spread of the fold accuracies, scaled to percentages.
mean_pct = accuracies.mean()*100
std_pct = accuracies.std()*100
print(f"Accuracy : {mean_pct:.2f}")
print(f"Standard Deviation : {std_pct:.2f}")

Accuracy : 93.17
Standard Deviation : 5.30


## Applying GridSearch to find the best hyperparameters

In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each Random Forest hyperparameter; the search tries
# every combination with 10-fold cross-validation on the training set.
parameters = [dict(
    n_estimators=list(range(10, 101, 10)),
    criterion=['gini', 'entropy'],
    max_depth=[1, 2, 3, 4, 5, 6, 7, 8, None],
    min_samples_split=[2, 5],
    max_features=['sqrt', 'log2'],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)]

# n_jobs = -1 runs the search on all available CPU cores.
grid_search = GridSearchCV(classifier, parameters, cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)

# Best mean cross-validated score and the parameter combination that achieved it.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(f"Best Accuracy : {best_accuracy*100:.2f}")
print("Best parameters : ", best_parameters)

Best Accuracy : 94.21
Best parameters :  {'bootstrap': True, 'criterion': 'gini', 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 40}
