In [2]:
# Imports & Setup
# ---------------
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

pd.set_option("display.precision", 3)

In [None]:
# Data Analysis
# -------------

t = dict(pd.read_csv("training-data-10-12-2023_0-10-30.csv", header = 0, delimiter = ",").dtypes)
for key in t.keys():
    t[key] = str if type(t[key]) == 'object' else int

df1 = pd.read_csv("training-data-10-12-2023_0-10-30.csv", header = 0, delimiter = ",", dtype=t)
df2 = pd.read_csv("training-data-10-12-2023_0-10-36.csv", header = 0, delimiter = ",", dtype=t)
df3 = pd.read_csv("training-data-10-12-2023_0-10-40.csv", header = 0, delimiter = ",", dtype=t)
df = pd.concat([df1, df2, df3])

nominal_cols = df.select_dtypes(include=['object']).columns
print(nominal_cols)
df[nominal_cols] = df[nominal_cols].astype('string', errors='ignore')
print(df.dtypes)
# df = pd.read_csv("data-cleaned\TRAININGSLICE1.csv", header = 0, delimiter = ",")

# Feature stats
rows, cols = df.shape
print(f'Features: {cols}, Entries: {rows}')
stats = pd.DataFrame([df.dtypes, df.nunique(), df.nunique()/rows, rows-df.count(), (rows-df.count())*100/rows], index=['dtype', '# unique', '% unique', '# NaN', '% NaN'])
stats.T.to_csv('stats.csv', sep=',')
display(stats)

# Remove features that have a lot of missing values
filt = []
s = stats.to_numpy()
for i in range(cols):
    if s[4, i] > 20.0:
        filt.append(stats.columns[i])
    
df = df.drop(filt, axis=1).dropna()

rows, cols = df.shape
print(f'Features: {cols}, Entries: {rows}')



In [1]:
# K-nn Testing
# ------------

import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

pd.set_option("display.precision", 3)

df = pd.read_csv("TRAININGSLICEALL.csv")


# Data split
ds = df.to_numpy()[:100]
print(ds.shape)
X = ds[:, 1:-1] # ignore 3dmark id
# X = pd.get_dummies(X) # one-hot encode (categorical columns only)
y = ds[:, -1]

(100, 3922)


In [5]:
'''
Train clf on training-test split of data X, y. Return the score and MAE.
'''
def train_mae(clf, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    mae = mean_absolute_error(y_pred, y_test)
    return score, mae

In [10]:
# Data split
ds = df.to_numpy()
print(ds.shape)
X = ds[:, :-1] # ignore 3dmark id
y = ds[:, -1]

# default k-nn
knn = KNeighborsRegressor(n_neighbors=5)
score, mae = train_mae(knn, X, y)
print(f'default: score={score}, MAE={mae}')

(123652, 3922)
default: score=0.0, MAE=8.087016295337835e-06


In [None]:
ds_norm = normalize(ds)
X = ds_norm[:, :-1]
y = ds_norm[:, -1]

knn2 = KNeighborsRegressor(n_neighbors=5)
score, mae = train_mae(knn2, X, y)
print(f'default: score={score}, MAE={mae}')