In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation


# Location of the categorical accidents dataset.
# NOTE(review): this is an absolute path on one machine; point DATA_PATH at
# your own copy of the CSV before running the notebook elsewhere.
DATA_PATH = "C:/Users/Lenovo/Desktop/sem5/da/project/Accidents_categorical.csv"
data = pd.read_csv(DATA_PATH)


In [15]:
def preprop(data, to_drop=(), na_thresh = 0.25):
    """Drop, impute and encode the columns of a DataFrame.

    Steps, in order:
      1. Drop every column named in ``to_drop``.
      2. Classify the remaining columns with ``getStats``.
      3. For each classified column:
         * drop it when its missing-value ratio is >= ``na_thresh``;
         * otherwise, for a numerical column with gaps, fill the gaps with
           the column mean;
         * otherwise, for a categorical column, replace every non-missing
           value with the integer code produced by ``getEncodings``.
           Missing values in a categorical column are left as NaN.
      Columns that ``getStats`` reports as "uncertain" are returned unchanged.

    The caller's DataFrame is never modified; all work happens on a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame.
    to_drop : iterable of str, optional
        Column names to drop unconditionally. A missing name raises
        ``KeyError`` (from ``DataFrame.drop``).
    na_thresh : float, optional
        Missing-value ratio (0..1) at or above which a column is dropped.

    Returns
    -------
    tuple
        ``(cleaned_data, log, replace_index)`` where ``log`` is a list of
        human-readable strings describing each change and ``replace_index``
        maps each encoded column name to its ``{original value: code}`` dict.
    """
    to_drop = list(to_drop)
    log = []

    # Always work on a new frame so the column assignments below cannot leak
    # into the caller's DataFrame (the original mutated it when to_drop was empty).
    data = data.drop(columns=to_drop) if to_drop else data.copy()
    for col in to_drop:
        log.append("Dropped column " + col + " as requested")

    report = getStats(data)
    replace_index = {}
    for col, info in report.items():
        # report["uncertain"] is a bookkeeping list, and the columns it names
        # have a report entry without a "type" key; leave both alone.
        if col == "uncertain" or "type" not in info:
            continue

        col_type = info["type"]
        na_ratio = info["na"]

        # Too many gaps: drop the column regardless of its type.
        if na_ratio >= na_thresh:
            data = data.drop(columns=col)
            log.append("Dropped column " + col + " as Nan/Na ratio (" + str(na_ratio) + ") >= " + str(na_thresh))
            continue

        if col_type == 'num':
            # Few gaps in a numerical column: fill them with the column mean.
            if na_ratio > 0:
                data[col] = data[col].fillna(data[col].mean())
                log.append(col + " : replaced Nan/Na with mean")
            continue

        if col_type == 'cat':
            # Encode only the observed values: NaN cannot be sorted together
            # with strings, and map() leaves missing entries as NaN.
            encoding = getEncodings(data[col].dropna())
            replace_index[col] = encoding
            data[col] = data[col].map(encoding)
            log.append("Encoded column " + col)
            if na_ratio > 0:
                log.append(col + " : Nan/Na values left unencoded")

    return data, log, replace_index # Returning cleaned data, log of changes made and categorical encodings dict (if any)



def getEncodings(o):
    """Map each distinct non-missing value of ``o`` to an integer code.

    Codes are assigned in sorted order of the distinct values, starting at 0.
    Missing values (``None``, NaN, ``pd.NA``) are skipped: they cannot be
    ordered against strings, so including them made ``sorted`` raise
    ``TypeError`` for any string column that contained a gap.

    Parameters
    ----------
    o : iterable
        Values to encode (e.g. a pandas Series or a list). The non-missing
        values must be hashable and mutually comparable.

    Returns
    -------
    dict
        ``{value: code}`` for every distinct non-missing value; empty when
        ``o`` has no non-missing values.
    """
    classes = sorted({value for value in o if not pd.isna(value)})
    return {cls: ind for ind, cls in enumerate(classes)}


def getStats(data, cat_ratio=0.25):
    """Summarise each column of ``data`` as numerical, categorical or uncertain.

    A column is treated as non-numerical when ``Series.describe()`` reports a
    ``"unique"`` statistic for it. The check looks at the index labels of the
    describe result rather than its printed text, so a numerical column whose
    *name* happens to contain "unique" is no longer misclassified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    cat_ratio : float, optional
        A non-numerical column counts as categorical when its number of
        distinct values is strictly below ``cat_ratio * len(data)``.

    Returns
    -------
    dict
        ``report["uncertain"]`` is a list of non-numerical column names with
        too many distinct values to be called categorical. Every column also
        gets ``report[column]``, a dict with:
          * ``"na"``   - fraction of missing values (always present);
          * ``"type"`` - ``"num"`` or ``"cat"`` (absent for uncertain columns);
          * ``"uniq"`` / ``"uniq_no"`` - distinct values and their count
            (categorical columns only).

    NOTE(review): a column literally named "uncertain" would collide with the
    bookkeeping key of the same name; this is not handled here.
    """
    n_rows = len(data)
    report = {}
    report['uncertain'] = []
    for col in data.columns:
        series = data[col]
        info = {"na": series.isna().mean()}
        report[col] = info

        if "unique" in series.describe().index:
            uniques = series.unique()
            if len(uniques) < cat_ratio * n_rows:
                info["type"] = "cat"
                info["uniq"] = uniques
                info["uniq_no"] = len(uniques)
            else:
                report["uncertain"].append(col)
        else:
            info["type"] = "num"
    return report


# Accident_Index and Datetime are dropped up front: per the original author
# they provide no useful information for the model.
columns_to_drop = ["Accident_Index", "Datetime"]
prepro_data, logs, encs = preprop(data, to_drop=columns_to_drop)
# Uncomment to inspect what the preprocessing changed:
# print("\n".join(logs))

In [10]:
# Name of the column the classifier should predict.
TARGET_COL = "Accident_Severity"

cols = list(prepro_data.columns)
# Pick the features by name rather than "every column except the last": the
# positional rule leaks the target into X (and drops a real feature) whenever
# the target is not the final column.
feature_cols = [col for col in cols if col != TARGET_COL]
#print(feature_cols)
X = prepro_data[feature_cols] # Features
y = prepro_data[TARGET_COL] # Target variable

# 70% training and 30% test; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Create Decision Tree classifier object. The tree permutes features at each
# split, so it needs its own seed for the accuracy below to be repeatable.
clf = DecisionTreeClassifier(random_state=1)

# Train Decision Tree classifier
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6694506949040371
