In [None]:
pip install pandas numpy scikit-learn



In [None]:
# @title Library Imports
import math
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# @title Decision Tree Helper Methods and Entropy/Density Calculations
def findUniqueVals(df):
  attributes = df.columns[:-1]
  uniqueAttributeValues = {}
  for attr in attributes:
    uniqueAttributeValues[attr] = {}
    counts = df[attr].value_counts().index.tolist()
    for idx, name in enumerate(counts):
      uniqueAttributeValues[attr][name] = df[attr].value_counts()[idx]
  return uniqueAttributeValues

def findSpecificAttrCounts(df, attrName):
  counts = df[attrName].value_counts().index.tolist()
  attrVals = {name: df[attrName].value_counts()[idx] for idx, name in enumerate(counts)}
  return attrVals

def calcRootEntropy(df, classAttrName):
  classes = findSpecificAttrCounts(df, classAttrName)
  totalVals = len(df)
  entropy = 0
  for val in classes:
    entropy += (-1 * classes[val] / totalVals) * (math.log2(classes[val] / totalVals))
  return entropy

def calcAttributeEntropy(df, attrName, classAttrName):
  attrVals = findSpecificAttrCounts(df, attrName)
  classes = findSpecificAttrCounts(df, classAttrName)
  totalVals = sum([attrVals[val] for val in attrVals])
  entropy = 0
  for val in attrVals:
    valCount = len(df[df[attrName] == val])
    specificValEntropy = 0
    for classVal in classes:
      instanceCount = len(df[(df[attrName] == val) & (df[classAttrName] == classVal)])
      if not instanceCount == 0:
        specificValEntropy += (-1 * instanceCount / valCount) * math.log2(instanceCount / valCount)
    entropy += (valCount / len(df)) * specificValEntropy
  return entropy

def calcSplitInfo(df, attrName):
  attrVals = findSpecificAttrCounts(df, attrName)
  totalVals = len(df)
  splitInfo = 0
  for val in attrVals:
    splitInfo += (-1 * attrVals[val] / totalVals) * math.log2(attrVals[val] / totalVals)
  return splitInfo

def calcGainRatio(df, attrName, classAttrName):
  return (calcRootEntropy(df, classAttrName) - calcAttributeEntropy(df, attrName, classAttrName)) / calcSplitInfo(df, attrName)

def isPureState(df, attrName, val, classAttrName):
  newDf = df[df[attrName] == val]
  classes = findSpecificAttrCounts(newDf, classAttrName)
  if len(classes) == 1:
    return True, [key for key in classes]
  return False, []

# Density Estimation Function
def estimate_density(df, attrName, radius=14.0):
  data = df[[attrName]].values
  nbrs = NearestNeighbors(radius=radius).fit(data)
  densities = np.array([len(nbrs.radius_neighbors([point], radius=radius, return_distance=False)[0]) for point in data])
  return densities.mean()

# Modified Gain Calculation with Density Adjustment
def calc_adjusted_score(df, attrName, classAttrName, weight1=0.8, weight2=0.2, radius=1.0):
  info_gain = calcRootEntropy(df, classAttrName) - calcAttributeEntropy(df, attrName, classAttrName)
  density = estimate_density(df, attrName, radius)
  return weight1 * info_gain + weight2 * density

In [None]:
# @title Control Decision Tree Building
def buildDecisionTree(df, decisionTree, classAttrName):
    attributes = df.columns[:-1]
    if len(attributes) == 0 or len(df[classAttrName].unique()) == 1:
        majorityClass = df[classAttrName].mode()[0]
        decisionTree[0] = majorityClass
        decisionTree[1] = {}
        return decisionTree
    maxGainRatio = calcGainRatio(df, attributes[0], classAttrName)
    maxAttr = attributes[0]
    for attr in attributes[1:]:
        gainRatio = calcGainRatio(df, attr, classAttrName)
        if gainRatio > maxGainRatio:
            maxAttr = attr
            maxGainRatio = gainRatio
    decisionTree[0] = maxAttr
    decisionTree[1] = {}
    attrVals = findSpecificAttrCounts(df, maxAttr)
    for val in attrVals:
        subset = df[df[maxAttr] == val]
        if subset.empty:
            majorityClass = df[classAttrName].mode()[0]
            decisionTree[1][val] = [majorityClass, {}]
        else:
            decisionTree[1][val] = ["", {}]
            isPure, classVal = isPureState(df, maxAttr, val, classAttrName)
            if isPure:
                decisionTree[1][val][0] = classVal[0]
            else:
                newSubset = subset.drop(columns=[maxAttr])
                buildDecisionTree(newSubset, decisionTree[1][val], classAttrName)
    return decisionTree

In [None]:
# @title Adaptive Decision Tree Building
# Adaptive Decision Tree Building with Inclusion of Density Estimation with KNN
def buildAdaptiveDecisionTree(df, decisionTree, classAttrName, maxDepth=10, currentDepth=0, minDensity=2, radius=1.0):
  if currentDepth >= maxDepth or len(df[classAttrName].unique()) == 1:
    majorityClass = df[classAttrName].mode()[0]
    decisionTree[0] = majorityClass
    decisionTree[1] = {}
    return decisionTree
  attributes = df.columns[:-1]
  max_score = float('-inf')
  best_attr = None
  for attr in attributes:
    score = calc_adjusted_score(df, attr, classAttrName, radius=radius)
    if score > max_score:
      best_attr = attr
      max_score = score

  decisionTree[0] = best_attr
  decisionTree[1] = {}
  attr_vals = findSpecificAttrCounts(df, best_attr)
  for val in attr_vals:
    subset = df[df[best_attr] == val]
    if subset.empty or len(subset) < minDensity:
      majorityClass = df[classAttrName].mode()[0]
      decisionTree[1][val] = [majorityClass, {}]
    else:
      decisionTree[1][val] = ["", {}]
      isPure, classVal = isPureState(df, best_attr, val, classAttrName)
      if isPure:
        decisionTree[1][val][0] = classVal[0]
      else:
        newSubset = subset.drop(columns=[best_attr])
        buildAdaptiveDecisionTree(newSubset, decisionTree[1][val], classAttrName, maxDepth, currentDepth + 1, minDensity)

  return decisionTree

In [None]:
# @title Tree to Text and Value Prediction
def convertTreeToText(decisionTree, text, stringClassName, tabCount):
  attr = decisionTree[0]
  keys = [key for key in decisionTree[1]]
  for key in keys:
    newText = "\n" + "  " * tabCount + "if " + attr + " is "  + key + ": "
    if decisionTree[1][key][1] == {}:
      newText += stringClassName + " is " + decisionTree[1][key][0]
    else:
      newText += convertTreeToText(decisionTree[1][key], "", stringClassName, tabCount + 1)
    text += newText
  return text

def predictValue(decisionTree, df, instanceNum, majorityCase):
  if decisionTree[1] == {}:
      return decisionTree[0]
  attrVal = df.iloc[instanceNum][decisionTree[0]]
  if attrVal not in decisionTree[1]:
      return majorityCase
  return predictValue(decisionTree[1][attrVal], df, instanceNum, majorityCase)

In [None]:
# @title Traffic Dataset Preprocessing
df = pd.read_csv("/content/Traffic_Accident_Prediction.csv")
x = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.30, stratify=y, random_state=3)
train = pd.concat([X_train, Y_train], axis=1)
test = pd.concat([X_test, Y_test], axis=1)
train.to_csv('/content/trafficTrain.csv', index=False)
test.to_csv('/content/trafficTest.csv', index=False)
traindf = pd.read_csv("/content/trafficTrain.csv")
traindf = traindf.replace(0, "0")
traindf = traindf.replace(1, "1")
traindf = traindf.replace(2, "2")
traindf = traindf.replace(3, "3")
traindf = traindf.replace(4, "4")
testdf = pd.read_csv("/content/trafficTest.csv")
testdf = testdf.replace(0, "0")
testdf = testdf.replace(1, "1")
testdf = testdf.replace(2, "2")
testdf = testdf.replace(3, "3")
testdf = testdf.replace(4, "4")

In [None]:
# @title Traffic Dataset Control Training
majorityClass = traindf['Accident_Severity'].mode()[0]
decisionTree = {}
decisionTree["root"] = ["", {}]
buildDecisionTree(traindf, decisionTree["root"], "Accident_Severity")
# print(convertTreeToText(decisionTree["root"], "", "Accident Severity", 0))

['Weather',
 {'0': ['Driver_Experience',
   {'2': ['Driver_Age',
     {'2': ['Road_Type',
       {'0': ['Road_Condition',
         {'0': ['Vehicle_Type',
           {'2': ['Time_of_Day',
             {'2': ['Accident', {'0': ['Moderate', {}], '1': ['Low', {}]}],
              '1': ['Low', {}],
              '0': ['Low', {}],
              '3': ['Low', {}]}],
            '3': ['Moderate', {}],
            '1': ['Moderate', {}]}],
          '2': ['Vehicle_Type',
           {'2': ['Moderate', {}],
            '1': ['Time_of_Day', {'1': ['Low', {}], '2': ['Moderate', {}]}],
            '3': ['Moderate', {}]}],
          '3': ['Time_of_Day',
           {'1': ['Low', {}], '2': ['Moderate', {}], '3': ['High', {}]}],
          '1': ['Moderate', {}]}],
        '1': ['Vehicle_Type',
         {'2': ['Traffic_Density',
           {'2': ['Road_Condition',
             {'0': ['Time_of_Day',
               {'2': ['Speed_Limit',
                 {'0': ['Number_of_Vehicles',
                   {'0': ['

In [None]:
# @title Traffic Dataset Control Testing
correct = 0
l_l, l_m, l_h = 0, 0, 0
m_l, m_m, m_h = 0, 0, 0
h_l, h_m, h_h = 0, 0, 0
for i in range(len(traindf)):
    predicted = predictValue(decisionTree["root"], traindf, i, majorityClass)
    actual = traindf.iloc[i]['Accident_Severity']
    if predicted == actual:
      correct += 1
      if predicted == "Low": l_l += 1
      if predicted == "Moderate": m_m += 1
      if predicted == "High": h_h += 1
    elif predicted == "Low":
      if actual == "Moderate":
        l_m += 1
      else:
        l_h += 1
    elif predicted == "Moderate":
      if actual == "Low":
        m_l += 1
      else:
        m_h += 1
    else:
      if actual == "Low":
        h_l += 1
      else:
        h_m += 1
accuracy = correct / len(traindf)
print("Control Decision Tree Info:")
print(f"Training Accuracy: {accuracy:.3f}")
print(f"Training Confusion Matrix:\n[[{l_l}, {m_l}, {h_l}]\n[{l_m}, {m_m}, {h_m}]\n[{l_h}, {m_h}, {h_h}]]")
print()

correct = 0
l_l, l_m, l_h = 0, 0, 0
m_l, m_m, m_h = 0, 0, 0
h_l, h_m, h_h = 0, 0, 0
for i in range(len(testdf)):
    predicted = predictValue(decisionTree["root"], testdf, i, majorityClass)
    actual = testdf.iloc[i]['Accident_Severity']
    if predicted == actual:
      correct += 1
      if predicted == "Low": l_l += 1
      if predicted == "Moderate": m_m += 1
      if predicted == "High": h_h += 1
    elif predicted == "Low":
      if actual == "Moderate":
        l_m += 1
      else:
        l_h += 1
    elif predicted == "Moderate":
      if actual == "Low":
        m_l += 1
      else:
        m_h += 1
    else:
      if actual == "Low":
        h_l += 1
      else:
        h_m += 1
accuracy = correct / len(testdf)
print(f"Testing Accuracy: {accuracy:.3f}")
print(f"Testing Confusion Matrix:\n[[{l_l}, {m_l}, {h_l}]\n[{l_m}, {m_m}, {h_m}]\n[{l_h}, {m_h}, {h_h}]]")

Control Decision Tree Info:
Training Accuracy: 0.995
Training Confusion Matrix:
[[333, 0, 1]
[2, 167, 0]
[0, 0, 55]]

Testing Accuracy: 0.492
Testing Confusion Matrix:
[[92, 34, 18]
[40, 25, 7]
[15, 8, 1]]


In [None]:
# @title Traffic Dataset Adaptive Training
majorityClass = testdf['Accident_Severity'].mode()[0]
decisionTree = {}
decisionTree["root"] = ["", {}]
buildAdaptiveDecisionTree(testdf, decisionTree["root"], "Accident_Severity", maxDepth=10, radius=14.0)
# print(convertTreeToText(decisionTree["root"], "", "Accident Severity", 0))

['Road_Light_Condition',
 {'1': ['Driver_Alcohol',
   {'0': ['Accident',
     {'0': ['Speed_Limit',
       {'0': ['Number_of_Vehicles',
         {'0': ['Vehicle_Type',
           {'2': ['Traffic_Density',
             {'1': ['Weather',
               {'1': ['Driver_Experience',
                 {'2': ['Low', {}], '1': ['Low', {}], '0': ['Low', {}]}],
                '0': ['Road_Type',
                 {'0': ['Time_of_Day',
                   {'1': ['Moderate', {}], '2': ['Low', {}]}],
                  '1': ['Low', {}]}],
                '4': ['Low', {}]}],
              '2': ['Road_Type',
               {'1': ['Driver_Age',
                 {'3': ['Low', {}], '0': ['Low', {}], '2': ['Low', {}]}],
                '2': ['Time_of_Day', {'0': ['Low', {}], '2': ['Low', {}]}],
                '0': ['Low', {}]}],
              '0': ['Road_Condition',
               {'0': ['Low', {}], '2': ['Low', {}], '1': ['Low', {}]}]}],
            '3': ['Road_Condition',
             {'0': ['Low', {}], '

In [None]:
# @title Traffic Dataset Adaptive Testing
correct = 0
l_l, l_m, l_h = 0, 0, 0
m_l, m_m, m_h = 0, 0, 0
h_l, h_m, h_h = 0, 0, 0
for i in range(len(traindf)):
    predicted = predictValue(decisionTree["root"], traindf, i, majorityClass)
    actual = traindf.iloc[i]['Accident_Severity']
    if predicted == actual:
      correct += 1
      if predicted == "Low": l_l += 1
      if predicted == "Moderate": m_m += 1
      if predicted == "High": h_h += 1
    elif predicted == "Low":
      if actual == "Moderate":
        l_m += 1
      else:
        l_h += 1
    elif predicted == "Moderate":
      if actual == "Low":
        m_l += 1
      else:
        m_h += 1
    else:
      if actual == "Low":
        h_l += 1
      else:
        h_m += 1
accuracy = correct / len(traindf)
print("Adaptive Decision Tree Info:")
print(f"Training Accuracy: {accuracy:.3f}")
print(f"Training Confusion Matrix:\n[[{l_l}, {m_l}, {h_l}]\n[{l_m}, {m_m}, {h_m}]\n[{l_h}, {m_h}, {h_h}]]")
print()

correct = 0
l_l, l_m, l_h = 0, 0, 0
m_l, m_m, m_h = 0, 0, 0
h_l, h_m, h_h = 0, 0, 0
for i in range(len(testdf)):
    predicted = predictValue(decisionTree["root"], testdf, i, majorityClass)
    actual = testdf.iloc[i]['Accident_Severity']
    if predicted == actual:
      correct += 1
      if predicted == "Low": l_l += 1
      if predicted == "Moderate": m_m += 1
      if predicted == "High": h_h += 1
    elif predicted == "Low":
      if actual == "Moderate":
        l_m += 1
      else:
        l_h += 1
    elif predicted == "Moderate":
      if actual == "Low":
        m_l += 1
      else:
        m_h += 1
    else:
      if actual == "Low":
        h_l += 1
      else:
        h_m += 1
accuracy = correct / len(testdf)
print(f"Testing Accuracy: {accuracy:.3f}")
print(f"Testing Confusion Matrix:\n[[{l_l}, {m_l}, {h_l}]\n[{l_m}, {m_m}, {h_m}]\n[{l_h}, {m_h}, {h_h}]]")

Adaptive Decision Tree Info:
Training Accuracy: 0.520
Training Confusion Matrix:
[[269, 43, 22]
[139, 18, 12]
[47, 5, 3]]

Testing Accuracy: 0.733
Testing Confusion Matrix:
[[131, 6, 7]
[38, 31, 3]
[9, 1, 14]]
