### Description

> Andrea Becerra Bolaños & David Aguilar Castilleja

The selected dataset is a compilation of students' performance in three different areas (reading, writing and math). It also includes extra information that may or may not affect a student's performance, such as parents' education, lunch (nutrition), gender, race/ethnicity and whether they completed the study course.  

The objective of this decision tree implementation is to predict whether a student is going to pass or fail, based on the average of the three main scores.

We normalize the scores into roughly 10 categories (each score is integer-divided by 10) instead of using the original 0 - 100 scale, and we create a new column with binary information (0 = failed, 1 = passed) that we use as labels for the training.


In [None]:
# Dependencies
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
import sklearn
import graphviz

pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Print the full path of every file found under the Kaggle input directory
import os

input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)


In [None]:
# Create the Classifier: a decision tree that picks its splits by entropy,
# with a fixed random_state; every other setting is passed as None / 'best'.
classifier_settings = {
    "class_weight": None,
    "criterion": 'entropy',
    "max_depth": None,
    "max_features": None,
    "max_leaf_nodes": None,
    "random_state": 0,
    "splitter": 'best',
}
students_classifier = tree.DecisionTreeClassifier(**classifier_settings)

# Import Data: load the students' exam results from the Kaggle dataset
data = pd.read_csv(
    "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
)

In [None]:
# Average the three scores and flag each student as passed (1) or failed (0)
score_columns = ['math score', 'writing score', 'reading score']

# skipna=False keeps a missing score propagating as NaN, exactly like `+` does
average_score = data[score_columns].sum(axis=1, skipna=False) / 3

# A student passes when the average is above 59.9
data['passed'] = average_score.gt(59.9).astype(int)

# Randomize the row order
data = shuffle(data)

In [None]:
# Partition of the relevant data columns: 80% for training, 20% for testing
split_data = train_test_split(data, test_size=.20)

# Take explicit copies so the in-place edits below operate on independent
# frames instead of relying on pandas' chained-assignment warning being
# silenced globally.
train_data, test_data = (subset.copy() for subset in split_data)

# The binary `passed` column is the label the classifier learns to predict
train_labels = train_data["passed"].tolist()
test_labels = test_data["passed"].tolist()

# Normalize Data: bucket every score by integer division by 10
score_columns = ("writing score", "math score", "reading score")
for subset in (train_data, test_data):
    for column in score_columns:
        subset[column] = subset[column] // 10

# `passed` is the label, so it must not remain among the features
train_data = train_data.drop(columns="passed")
test_data = test_data.drop(columns="passed")



In [None]:
# Format Data: one-hot encode the categorical columns of both subsets
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data, drop_first=True)

# The two subsets are encoded independently, so a category value that happens
# to be absent from one of them would yield a different set (or order) of
# columns. Align the test features with the training features: columns the
# test set lacks are filled with 0, and columns unknown to training are dropped.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

print("Columns of sample")
print(train_data.columns)
print("------------- First Row ---------------------")
print(train_data.iloc[0])



In [None]:
# Training: fit the decision tree on the training features and labels.
# `fit` returns the estimator itself, so `trained` refers to the fitted
# `students_classifier`.
students_classifier.fit(X=train_data, y=train_labels)
trained = students_classifier

In [None]:
# Test Data and Accuracy
# Predict on both subsets first, then score each set of predictions
test_predictions = trained.predict(test_data).tolist()
train_predictions = trained.predict(train_data).tolist()

# Accuracy on data the tree has never seen
ID3TestAccuracy = sklearn.metrics.accuracy_score(
    y_true=test_labels,
    y_pred=test_predictions,
)
print('ID3 Testing accuracy: ', ID3TestAccuracy)

# Accuracy on the data the tree was fitted on
ID3TrainAccuracy = sklearn.metrics.accuracy_score(
    y_true=train_labels,
    y_pred=train_predictions,
)
print('ID3 Training accuracy: ', ID3TrainAccuracy)

In [None]:
# Plot with Matplotlib
fig, ax = plt.subplots(figsize=(70, 40))
tree.plot_tree(
    trained,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    feature_names=list(train_data.columns),
    max_depth=10,
    fontsize=18,
    ax=ax,  # draw on the axes created above instead of the implicit current axes
)
plt.show()

In [None]:
# Plot with GraphViz
dot_data = tree.export_graphviz(
    trained,
    out_file=None,  # return the DOT source as a string instead of writing a file
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    feature_names=list(train_data.columns),
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
# Render the graph to disk using "Students" as the output file name
graph.render("Students")
# Leave the graph as the cell's last expression so the notebook displays it
graph