In [29]:
import numpy as np
import pandas as pd
import csv
from collections import Counter, defaultdict
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [30]:
class DecisionTree:
    """Binary decision-tree classifier trained with Gini impurity.

    Every feature is used as a yes/no test: samples whose value equals the
    string "y" go to the left child, every other value goes to the right
    child.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum depth of the tree. ``None`` means "no limit": the tree keeps
        growing until no split lowers the impurity any further.

    Attributes (set by ``fit``)
    ---------------------------
    n_classes_ : number of distinct labels in the training target.
    label_ : the distinct labels, as returned by ``np.unique``.
    n_features_ : number of feature columns.
    tree_ : root ``Node`` of the fitted tree.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def _best_split(self, X, y):
        """Find the feature whose "y"/other split gives the lowest Gini.

        Parameters
        ----------
        X : 2-D NumPy array of feature values.
        y : 1-D NumPy array of class labels, one per row of ``X``.

        Returns
        -------
        (best_gini, best_idx)
            ``best_idx`` is the column index of the best feature, or ``None``
            when no split improves on the impurity of the node itself (in
            which case ``best_gini`` is the node's own impurity).
            ``(None, None)`` is returned when there are fewer than two rows.
        """
        # Need at least two elements to split a node.
        m = y.size
        if m <= 1:
            return None, None

        # Gini of the current node: a split must beat this to be accepted.
        best_gini = 1.0 - sum((n / m) ** 2 for n in Counter(y).values())
        best_idx = None

        # Position of every known class inside the per-side count lists, so
        # that problems with more than two classes are counted correctly.
        class_position = {cls: pos for pos, cls in enumerate(self.label_)}

        # Loop through all features and keep the one with the lowest
        # weighted Gini impurity.
        for idx in range(self.n_features_):
            # Per-class sample counts on each side of the split.
            num_left = [0] * self.n_classes_
            num_right = [0] * self.n_classes_
            # No sorting is needed for a categorical test; iterating directly
            # also avoids comparing values of different types.
            for value, cls in zip(X[:, idx], y):
                side = num_left if value == "y" else num_right
                side[class_position[cls]] += 1

            total_left = sum(num_left)
            total_right = sum(num_right)
            # A split that leaves one side empty separates nothing.
            if total_left == 0 or total_right == 0:
                continue

            gini_left = 1.0 - sum((k / total_left) ** 2 for k in num_left)
            gini_right = 1.0 - sum((k / total_right) ** 2 for k in num_right)
            # Average of both sides, weighted by their sample counts.
            gini = (total_left * gini_left + total_right * gini_right) / m
            if gini < best_gini:
                best_gini = gini
                best_idx = idx
        return best_gini, best_idx

    def fit(self, X, y):
        """Build the decision tree from training data.

        Parameters
        ----------
        X : 2-D NumPy array of feature values.
        y : 1-D NumPy array of class labels, one per row of ``X``.
        """
        self.label_ = np.unique(y)
        self.n_classes_ = len(self.label_)  # number of distinct outputs
        self.n_features_ = X.shape[1]
        self.tree_ = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the rows in ``X``/``y``.

        Returns the ``Node`` for this subset. The node is split further only
        while ``depth`` is below ``max_depth`` (no limit when ``max_depth`` is
        ``None``) and a split exists that lowers the Gini impurity.
        """
        class_counts = Counter(y)
        num_samples_per_class = list(class_counts.values())
        total = sum(num_samples_per_class)
        node = Node(
            gini=1.0 - sum((n / total) ** 2 for n in num_samples_per_class),
            num_samples=y.size,
            num_samples_per_class=num_samples_per_class,
            predicted_class=class_counts.most_common(1)[0][0],
        )

        # ``None`` means unlimited depth; comparing an int with None would
        # raise TypeError, so it is checked explicitly.
        if self.max_depth is None or depth < self.max_depth:
            best_gini, idx = self._best_split(X, y)
            if idx is not None and best_gini < node.gini:
                indices_left = (X[:, idx] == "y")
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_idx = idx
                node.left = self._build_tree(X_left, y_left, depth + 1)
                node.right = self._build_tree(X_right, y_right, depth + 1)
        return node

    def predict(self, X):
        """Return a list with the predicted class for every row of ``X``."""
        return [self._predictor(inputs) for inputs in X]

    def _predictor(self, inputs):
        """Walk the fitted tree for one sample and return its leaf's class."""
        node = self.tree_
        # Internal nodes always have both children, so checking ``left`` is
        # enough to detect a leaf.
        while node.left is not None:
            if inputs[node.feature_idx] == "y":
                node = node.left
            else:
                node = node.right
        return node.predicted_class
    
    

In [31]:
class Node:
    """A single node of the decision tree.

    Parameters
    ----------
    gini : Gini impurity of the samples that reached this node.
    num_samples : number of samples that reached this node.
    num_samples_per_class : list with the sample count of each class.
    predicted_class : majority class, returned when this node is a leaf.

    Attributes
    ----------
    feature_idx : column index of the feature tested at this node. It keeps
        its initial value of 0 until the tree builder assigns a split.
    feature_index : alias of ``feature_idx`` kept for backward compatibility.
    left, right : child nodes, both ``None`` for a leaf.
    """

    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        # DecisionTree reads and writes ``feature_idx``; store it under that
        # name so the attribute always exists and reflects the real split.
        self.feature_idx = 0
        self.left = None
        self.right = None

    @property
    def feature_index(self):
        """Alias of ``feature_idx`` (the original attribute name)."""
        return self.feature_idx

    @feature_index.setter
    def feature_index(self, value):
        self.feature_idx = value
        

In [32]:



# Names of the sixteen vote columns used as model features.
feature_cols = [
    "handicapped-infants",
    "water-project-cost-sharing",
    "adoption-of-the-budget-resolution",
    "physician-fee-freeze",
    "el-salvador-aid",
    "religious-groups-in-schools",
    "anti-satellite-test-ban",
    "aid-to-nicaraguan-contras",
    "mx-missile",
    "immigration",
    "synfuels-corporation-cutback",
    "education-spending",
    "superfund-right-to-sue",
    "crime",
    "duty-free-exports",
    "export-administration-act-south-africa",
]
# Full column layout of the CSV: the party label first, then the votes.
fields = ["Class Name", *feature_cols]

# Column names are supplied explicitly; the first line of the file is skipped.
df = pd.read_csv("house-votes-84.csv", names=fields, skiprows=1)
X = df[feature_cols]  # Features
y = df["Class Name"]  # Target variable

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1
)

# DecisionTree works on NumPy arrays, so convert the training split.
data = X_train.to_numpy()
data2 = y_train.to_numpy()
X = data[:, :]
y = data2[:]

# Train a tree limited to a depth of 5.
model = DecisionTree(5)
model.fit(X, y)


In [33]:
# Test-set features as a NumPy array, the input format DecisionTree.predict
# iterates over.
data = X_test.to_numpy()
Z = data[:, :]
# Labels of the held-out rows, aligned row by row with Z.
y = y_test.to_numpy()

In [34]:
# Predict one class label per row of the test feature matrix.
y_pred = model.predict(Z)

In [35]:
# Display the full list of predicted labels.
y_pred

['democrat',
 'democrat',
 'republican',
 'republican',
 'republican',
 'republican',
 'democrat',
 'democrat',
 'republican',
 'democrat',
 'republican',
 'democrat',
 'republican',
 'democrat',
 'democrat',
 'republican',
 'democrat',
 'republican',
 'republican',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'republican',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'republican',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'republican',
 'democrat',
 'republican',
 'democrat',
 'democrat',
 'democrat',
 'republican',
 'republican',
 'democrat',
 'democrat',
 'democrat',
 'republican',
 'democrat',
 'democrat',
 'republican',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'republican',
 'democrat',
 'democrat',
 'democrat',
 'democrat',
 'republican',
 'republican',
 'democrat',
 'republican',
 'democrat',
 'republican',
 'democrat',
 'democrat',
 'democrat',
 'rep

In [36]:
# Number of test samples predicted for each class.
Counter(y_pred)

Counter({'democrat': 91, 'republican': 40})

In [37]:

# Number of test samples that actually belong to each class, for comparison
# with the predicted counts.
Counter(y_test)

Counter({'democrat': 89, 'republican': 42})

In [38]:
# Fraction of test samples whose predicted label matches the true label.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9389312977099237


For this homework, I decided to work with the 1984 Congressional Voting Records dataset and create a model for predicting party affiliation. I created and trained my decision tree model on the training data, which is 70% of the total dataset, and then tested the model several times on the remaining 30%. My decision tree is about 94% accurate in predicting party affiliation (Democrat/Republican) when I use 5 as the maximum depth of the tree. The accuracy fluctuates based on the depth of the decision tree used to create the model.