In [42]:
import pandas as pd
import numpy as np

In [43]:
# Load the sales records from sales.csv in the current working directory.
df = pd.read_csv('./sales.csv')
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,Age,Income,Gender,MaritialStatus,Buys
0,<21,High,Male,Single,No
1,<21,High,Male,Married,No
2,21-35,High,Male,Single,Yes
3,>35,Medium,Male,Single,Yes
4,>35,Low,Female,Single,Yes
5,>35,Low,Female,Married,No
6,21-35,Low,Female,Married,Yes
7,<21,Medium,Male,Single,No
8,<21,Low,Female,Married,Yes
9,>35,Medium,Female,Single,Yes


In [44]:
# Hold out the last row as the single test sample; train on every row before it.
train_df = df.iloc[:-1].copy()
# Slicing with -1: (rather than indexing with -1) keeps the sample as a one-row DataFrame.
test_df = df.iloc[-1:].copy()

In [47]:
class Node1:
	"""Internal node of a binary decision tree over one categorical feature.

	A sample is routed to ``yes`` when its value for ``feature`` is one of
	``values`` and to ``no`` otherwise.

	Attributes are documented inline in ``__init__``.
	"""

	def __init__(self, feature, value):
		"""Create a node with both branches unset.

		Args:
			feature: Name of the column this node tests.
			value: Collection of category values that select the ``yes`` branch.
		"""
		# Branches start empty; the tree builder fills them in afterwards.
		self.yes = None
		self.no = None
		# Note the attribute is plural (``values``) although the parameter is ``value``.
		self.values = value
		self.feature = feature

	def __str__(self):
		"""Return a one-line description of the tested feature and its value set."""
		# Read ``self.values`` (the attribute that __init__ actually sets);
		# ``self.value`` does not exist and used to raise AttributeError.
		return f"Feature: {self.feature}, Values: {self.values}"


class DT1:
	"""Binary decision tree for categorical features, split by Gini impurity.

	Every internal node tests whether a sample's value for one feature belongs
	to a chosen subset of that feature's categories. The subset is found by
	exhaustively trying every non-empty proper subset of the categories seen
	in ``fit``. Labels are treated as booleans (truthy = positive class).
	"""

	def __init__(self, verbose=True):
		"""Create an unfitted tree.

		Args:
			verbose: When true (the default), print the impurity of every
				candidate split while fitting.
		"""
		self.tree = None
		self.verbose = verbose

	def fit(self, X, y):
		"""Build the tree from training data.

		Args:
			X: DataFrame of categorical feature columns.
			y: Sequence of truthy/falsy labels aligned positionally with the rows of ``X``.
		"""
		y = np.asarray(y)
		self.set_feature_values(X)
		self.tree = self.build_tree(X, y)
		if self.tree is None:
			# No usable split at the root: fall back to a single majority leaf
			# so that predict() still works.
			self.tree = self._majority_label(y, True)

	def set_feature_values(self, X):
		"""Record the sorted distinct values of every column of ``X``."""
		self.unique_feature_values = {
			feature: np.unique(list(X[feature])) for feature in X.columns
		}

	@staticmethod
	def _majority_label(y, default):
		"""Return the majority boolean label in ``y``; ``default`` on a tie or empty input."""
		positives = int(np.count_nonzero(y))
		negatives = len(y) - positives
		if positives == negatives:
			return default
		return positives > negatives

	def build_tree(self, X, y, parent_impurity = 100):
		"""Recursively grow a subtree for the given partition.

		Args:
			X: DataFrame holding the remaining feature columns of this partition.
			y: Labels aligned positionally with the rows of ``X``.
			parent_impurity: Weighted Gini impurity of the split that produced
				this partition; a split is only kept if it is strictly lower.

		Returns:
			A ``Node1`` for the best split, or ``None`` when no columns or rows
			remain or no split improves on ``parent_impurity``.
		"""
		y = np.asarray(y)
		# An empty partition has nothing to split on (and would divide by zero).
		if len(X.columns) == 0 or len(y) == 0:
			return None
		best_features, best_values, impurity = self.select_best_feature(X, y)
		if self.verbose:
			print(best_features, best_values, impurity)
		if impurity >= parent_impurity:
			return None
		node = Node1(best_features, best_values)

		X_yes, y_yes = self.filterdata(X, y, best_features, best_values, True)
		X_no, y_no = self.filterdata(X, y, best_features, best_values, False)

		node.yes = self.build_tree(X_yes, y_yes, impurity)
		node.no = self.build_tree(X_no, y_no, impurity)

		# A branch that could not be split further becomes a leaf labelled with
		# the majority class of its own samples. Ties and empty branches fall
		# back to this node's majority, then to the historical defaults
		# (True for the yes branch, False for the no branch).
		if node.yes is None:
			node.yes = self._majority_label(y_yes, self._majority_label(y, True))
		if node.no is None:
			node.no = self._majority_label(y_no, self._majority_label(y, False))
		return node

	def select_best_feature(self, X, y):
		"""Pick the feature and value subset with the lowest weighted Gini impurity.

		Ties go to the later column because the comparison is ``<=``.

		Returns:
			Tuple ``(feature, values, impurity)``. ``feature`` and ``values``
			are ``None`` and ``impurity`` is 100 when ``X`` has no columns.
		"""
		best_impurity = 100
		best_feature = None
		best_values = None
		for feature in X.columns:
			values, impurity = self.get_feature_impurity(list(X[feature]), feature, y)
			if self.verbose:
				print(values, impurity)
			if impurity <= best_impurity:
				best_impurity = impurity
				best_feature = feature
				best_values = values
		return best_feature, best_values, best_impurity

	def get_feature_impurity(self, X, feature, y):
		"""Find the best binary partition of one feature's categories.

		Args:
			X: List of this feature's values for the current partition.
			feature: Column name, used to look up the categories recorded by
				``set_feature_values``.
			y: Labels aligned positionally with ``X``.

		Returns:
			Tuple ``(values, impurity)``. When the feature has fewer than two
			known categories there is nothing to split on, and the result is
			an empty array with impurity 100.
		"""
		values = self.unique_feature_values[feature]
		# Masks 1 .. 2**k - 2 cover every non-empty proper subset of k categories.
		n = 2 ** len(values) - 1
		best_impurity = 100
		best_values = values[:0]
		for i in range(1, n):
			idx = self.parse(i, len(values))
			val_subset = values[idx]
			impurity = self.get_impurity(X, y, val_subset)
			if self.verbose:
				print("impurity = ", impurity, "feature = ", feature, " subset = ", val_subset)
			# `<=` keeps the last subset among equally good candidates.
			if impurity <= best_impurity:
				best_impurity = impurity
				best_values = val_subset
		return best_values, best_impurity

	def parse(self, X, n):
		"""Return the positions of the 1-bits of ``X`` in its ``n``-digit binary form.

		Position 0 is the most significant of the ``n`` digits.
		"""
		bits = bin(X)[2:].zfill(n)
		return [pos for pos, bit in enumerate(bits) if bit == '1']

	def get_impurity(self, X, y, values):
		"""Compute the weighted Gini impurity of splitting ``X`` by membership in ``values``.

		Args:
			X: Sequence of one feature's values.
			y: Labels aligned positionally with ``X``.
			values: Categories that send a sample to the "yes" group.
		"""
		# Each counter is [negative_count, positive_count].
		yes_count = [0, 0]
		no_count = [0, 0]
		for x_val, label in zip(X, y):
			counts = yes_count if x_val in values else no_count
			counts[1 if label else 0] += 1
		return self.gini(yes_count, no_count)

	@staticmethod
	def _group_gini(counts, total):
		"""Return the Gini impurity of one group; an empty group counts as pure (0)."""
		if total == 0:
			return 0
		return 1 - (counts[0]/total)**2 - (counts[1]/total)**2

	def gini(self, yes_count, no_count):
		"""Return the size-weighted Gini impurity of two groups.

		Args:
			yes_count: ``[negatives, positives]`` in the "yes" group.
			no_count: ``[negatives, positives]`` in the "no" group.

		Returns:
			The weighted impurity, or 0.0 when both groups are empty.
		"""
		yes_total = yes_count[0] + yes_count[1]
		no_total = no_count[0] + no_count[1]
		total = yes_total + no_total
		if total == 0:
			return 0.0
		gini_yes = self._group_gini(yes_count, yes_total)
		gini_no = self._group_gini(no_count, no_total)
		return (yes_total * gini_yes + no_total * gini_no)/total

	def filterdata(self, X, y, feature, values, flag):
		"""Select the rows on one side of a split and drop the split column.

		Args:
			X: DataFrame of the current partition.
			y: Labels aligned positionally with the rows of ``X``.
			feature: Column the split tests.
			values: Categories belonging to the "yes" side.
			flag: ``True`` to keep the "yes" side, ``False`` to keep the "no" side.

		Returns:
			Tuple ``(X_filtered, y_filtered)``; ``X_filtered`` has a fresh
			0..n-1 index and no ``feature`` column.
		"""
		# A positional boolean mask keeps X and y aligned whatever X's index
		# labels are.
		mask = X[feature].isin(values).to_numpy() == flag
		X_filtered = X.loc[mask].drop(columns=[feature]).reset_index(drop=True)
		y_filtered = np.asarray(y)[mask]
		return X_filtered, y_filtered

	def predict(self, X):
		"""Predict a boolean label for every row of ``X``.

		Returns:
			Boolean NumPy array with one entry per row (empty for empty input).
		"""
		preds = [self.make_prediction(X.iloc[i], self.tree) for i in range(len(X))]
		# Return only after all rows have been processed.
		return np.array(preds, dtype=bool)

	def make_prediction(self, X, node):
		"""Walk one sample down the tree from ``node`` and return the leaf label.

		Args:
			X: Mapping-like row (e.g. a pandas Series) indexed by feature name.
			node: Subtree root, either a node object or a boolean leaf.
		"""
		while not isinstance(node, (bool, np.bool_)):
			node = node.yes if X[node.feature] in node.values else node.no
		return node


# Separate the feature columns from the target; the target becomes a boolean
# array that is True where Buys == "Yes".
x_train = train_df.drop(columns='Buys')
y_train = (train_df['Buys'] == "Yes").to_numpy()
x_test = test_df.drop(columns='Buys')

# Fit the tree on the training rows and classify the held-out row.
clf = DT1()
clf.fit(x_train, y_train)
print(clf.predict(x_test))
# NOTE(review): the label says "Root" but this reads the root's `no` child - confirm intent.
print(f'Root: {clf.tree.no.feature}, Values: {clf.tree.no.values}')

impurity =  0.4230769230769231 feature =  Age  subset =  ['>35']
impurity =  0.31923076923076926 feature =  Age  subset =  ['<21']
impurity =  0.3418803418803419 feature =  Age  subset =  ['<21' '>35']
impurity =  0.3418803418803419 feature =  Age  subset =  ['21-35']
impurity =  0.31923076923076926 feature =  Age  subset =  ['21-35' '>35']
impurity =  0.4230769230769231 feature =  Age  subset =  ['21-35' '<21']
['21-35' '>35'] 0.31923076923076926
impurity =  0.4115384615384615 feature =  Income  subset =  ['Medium']
impurity =  0.4230769230769231 feature =  Income  subset =  ['Low']
impurity =  0.39316239316239315 feature =  Income  subset =  ['Low' 'Medium']
impurity =  0.39316239316239315 feature =  Income  subset =  ['High']
impurity =  0.4230769230769231 feature =  Income  subset =  ['High' 'Medium']
impurity =  0.4115384615384615 feature =  Income  subset =  ['High' 'Low']
['High'] 0.39316239316239315
impurity =  0.3626373626373626 feature =  Gender  subset =  ['Male']
impurity =