In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score as acc
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn import tree
from graphviz import Source as src
from sklearn.tree import DecisionTreeClassifier as dtc

In [None]:
# Read the raw benchmark CSV
csv_data = pd.read_csv("RAM_Benchmarks_megalist.csv")

# The columns we're interested in ("gen" is the label, the rest are features)
interested_cols = ["gen","latency","readUncached","write","price"]

# Keep only those columns, drop every ROW that has a missing value in any of
# them, and renumber the index so it is contiguous again.
# Built as one chain (no inplace mutation), so `dataset` is a new DataFrame
# and re-running this cell always starts again from `csv_data`.
dataset = (
    csv_data[interested_cols]
    .dropna(subset=interested_cols)
    .reset_index(drop=True)
)

# Map a "gen" label (string) to its integer code
def change_gen_data(x):
    """Return the integer code for a DDR generation label.

    "DDR5" -> 3, "DDR4" -> 2, "DDR3" -> 1, "DDR2" -> 0.
    Any value that matches none of these labels yields None.
    """
    label_codes = (("DDR5", 3), ("DDR4", 2), ("DDR3", 1), ("DDR2", 0))
    for label, code in label_codes:
        if x == label:
            return code
    # No label matched: fall through to None.
    return None
# Apply the mapping to the "gen" column.
# The column is replaced as a whole instead of being written through
# `dataset.loc[:, "gen"] = ...`: on pandas >= 2.0 a `.loc[:, col]` assignment
# sets the values in place, which would leave "gen" as an object column that
# merely holds Python ints instead of a numeric column.
dataset["gen"] = dataset["gen"].apply(change_gen_data)

# Convert a thousands-separated number (e.g. "12,345.6") to float
def change_to_float(x):
    """Convert a value to float, ignoring comma thousands separators.

    Parameters
    ----------
    x : str or number
        A string such as "12,345.6" (commas are stripped before parsing),
        or a value that is already numeric.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If a string is not a valid number once the commas are removed.
    """
    if isinstance(x, str):
        return float(x.replace(',', ''))
    # Already numeric (the column was parsed as numbers, or this cell was
    # re-run on converted data): there is nothing to strip.
    return float(x)
# Convert the thousands-separated string columns to floats.
# Each column is replaced as a whole instead of being written through
# `dataset.loc[:, col] = ...`: on pandas >= 2.0 that form sets the values in
# place and would keep the columns as object dtype.
for numeric_col in ("readUncached", "write"):
    dataset[numeric_col] = dataset[numeric_col].apply(change_to_float)

dataset.head()

In [None]:
# Show the cleaned dataset as this cell's output
dataset

In [None]:
# Pair plot of the cleaned data, coloured by generation code, as a visual
# sanity check that nothing went missing during cleaning.
pair_grid = sb.pairplot(data=dataset, hue="gen")
plt.show()

In [None]:
# y is the generation code; "gen" is the first column of `dataset`
y_data = dataset["gen"]
# x is everything except "gen"
x_data = dataset.drop(columns="gen")

# Shuffle and split into train and test parts (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = tts(x_data, y_data, random_state=204)

# Print the four parts, separated by blank lines
print(x_train, x_test, y_train, y_test, sep="\n\n")

In [None]:
# Build a depth-limited DecisionTreeClassifier (fixed seed) and fit it on the
# training split
dt = dtc(random_state=204, max_depth=4)
dt.fit(X=x_train, y=y_train)

In [None]:
# Predict the generation code for every row of the test split
y_pred = dt.predict(X=x_test)

In [None]:
# Accuracy of the predictions against the true test labels (shown as the cell output)
acc(y_true=y_test, y_pred=y_pred)

In [None]:
# Draw the decision tree inline (out_file=None: no output file is written).
# The class names must line up with dt.classes_, i.e. the labels the tree
# actually saw during fit. They are therefore derived from the fitted model
# instead of being hard-coded: if any generation were missing from y_train, a
# fixed four-name list would shift every label.
# The code -> name mapping mirrors change_gen_data.
gen_labels = {0: "DDR2", 1: "DDR3", 2: "DDR4", 3: "DDR5"}
dot_source = tree.export_graphviz(
    dt,
    out_file=None,
    class_names=[gen_labels.get(code, str(code)) for code in dt.classes_],
    # Pass a plain list rather than a pandas Index (some scikit-learn
    # releases validate this parameter strictly).
    feature_names=list(x_train.columns),
)
src(dot_source)

In [None]:
# Brute-force search over split seeds (0-299) and tree depths (1-29); prints
# the accuracy of every combination and finally the best one found.
# NOTE(review): the "best" combination is picked by its score on the test
# split itself, so the reported accuracy is an optimistic estimate.
#
# The loop uses its own variable names (search_*) so that it does not
# overwrite the x_train/x_test/y_train/y_test split, the fitted `dt`, or
# `y_pred` that other cells of this notebook define and use.
max_accuracy = 0
temp = ()  # summary of the best run so far; stays empty if nothing beats 0

for rand in range(0,300):
    search_x_train, search_x_test, search_y_train, search_y_test = tts(x_data,y_data, random_state=rand)

    for n in range(1,30):
        search_dt = dtc(max_depth = n, random_state = rand)
        search_dt.fit(search_x_train, search_y_train)
        search_y_pred = search_dt.predict(search_x_test)
        accuracy = acc(search_y_test, search_y_pred)
        run_summary = ("When rand: :", rand," n: ",n, " acc: ", accuracy)
        if(max_accuracy < accuracy):
            max_accuracy = accuracy
            temp = run_summary
        print(*run_summary)

print()
print()
# Unpack the summary so the best run is printed in the same format as the
# per-run lines rather than as a tuple repr.
print(*temp)