# 09_05: A taste of machine learning

In [None]:
import math
import collections
import dataclasses
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Seed NumPy's global random number generator so that repeated runs of the
# notebook draw the same sequence of random numbers.
RANDOM_SEED = 43
np.random.seed(RANDOM_SEED)

In [None]:
gm = pd.read_csv('gapminder.csv')

In [None]:
gm2023 = gm[gm.year == 2023]
gm2023.head()

In [None]:
x = gm2023[['population', 'life_expectancy', 'age5_surviving', 'babies_per_woman', 'gdp_per_capita']]

In [None]:
y = gm2023['region'].astype('category')

In [None]:
z = y.cat.codes
z.head(10)

In [None]:
# Hold out 30% of the rows for testing and train on the rest.
# An explicit random_state pins the shuffle, so re-running this cell on its own
# (or running cells out of order) reproduces the same partition instead of
# depending on how far the global NumPy generator has advanced.
x_train, x_test, z_train, z_test = train_test_split(
    x, z, test_size=0.3, random_state=43
)

In [None]:
tree = DecisionTreeClassifier()

In [None]:
tree.fit(x_train, z_train)

In [None]:
tree_pred = tree.predict(x_test)
tree_pred

In [None]:
pd.DataFrame({'label': y.cat.categories[z_test],
              'predicted': y.cat.categories[tree_pred]}).head(20)

In [None]:
accuracy_score(z_test, tree_pred)

In [None]:
np.sum(y.cat.categories[z_test] == y.cat.categories[tree_pred]) / len(z_test)

In [None]:
# Per-category precision/recall/F1 for the decision tree.
# `labels` lists every category code explicitly so the rows always line up with
# `target_names`; without it the report raises a ValueError when some category
# appears in neither the test labels nor the predictions.
tree_report_codes = np.arange(len(y.cat.categories))
print(classification_report(z_test, tree_pred,
                            labels=tree_report_codes,
                            target_names=y.cat.categories))

In [None]:
# Confusion matrix for the decision tree, each row normalized over the true
# category (normalize='true'): rows are true categories, columns are predictions.
# The codes are derived from the categorical rather than hard-coded, and passed
# as `labels` so the matrix keeps one row/column per category (and the ticks
# stay aligned) even if a category is missing from the test split.
tree_cm_codes = np.arange(len(y.cat.categories))
pp.imshow(confusion_matrix(z_test, tree_pred, labels=tree_cm_codes, normalize='true'))
pp.xticks(tree_cm_codes, y.cat.categories); pp.yticks(tree_cm_codes, y.cat.categories)
pp.xlabel('predicted'); pp.ylabel('true')
pp.colorbar();

In [None]:
forest = RandomForestClassifier()

In [None]:
forest.fit(x_train, z_train)

In [None]:
forest_pred = forest.predict(x_test)

In [None]:
# Per-category precision/recall/F1 for the random forest.
# `labels` lists every category code explicitly so the rows always line up with
# `target_names`; without it the report raises a ValueError when some category
# appears in neither the test labels nor the predictions.
forest_report_codes = np.arange(len(y.cat.categories))
print(classification_report(z_test, forest_pred,
                            labels=forest_report_codes,
                            target_names=y.cat.categories))

In [None]:
# Confusion matrix for the random forest, each row normalized over the true
# category (normalize='true'): rows are true categories, columns are predictions.
# The codes are derived from the categorical rather than hard-coded, and passed
# as `labels` so the matrix keeps one row/column per category (and the ticks
# stay aligned) even if a category is missing from the test split.
forest_cm_codes = np.arange(len(y.cat.categories))
pp.imshow(confusion_matrix(z_test, forest_pred, labels=forest_cm_codes, normalize='true'))
pp.xticks(forest_cm_codes, y.cat.categories); pp.yticks(forest_cm_codes, y.cat.categories)
pp.xlabel('predicted'); pp.ylabel('true')
pp.colorbar();