In [1]:
import pandas as pd

df = pd.read_csv("datasets/diamonds.csv", index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [3]:
# Need to convert Cut into numeric values as it's an important paramter
df["cut"].astype("category").cat.codes

1        2
2        3
3        1
4        3
5        1
        ..
53936    2
53937    1
53938    4
53939    3
53940    2
Length: 53940, dtype: int8

In [4]:
# The above method assigns an arbitrary number however, we want the cut to be meaningful e.g. Premium is better than Fair
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1, "I": 2, "H":3, "G": 4, "F": 5, "E":6, "D": 7}

In [5]:
df["cut"] = df["cut"].map(cut_class_dict)
df["clarity"] = df["clarity"].map(clarity_dict)
df["color"] = df["color"].map(color_dict)

In [6]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [15]:
import sklearn
from sklearn import svm, preprocessing
from sklearn.model_selection import train_test_split

df = sklearn.utils.shuffle(df)

X = df.drop("price", axis=1).values
X = preprocessing.scale(X)
y = df["price"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

In [16]:
# Train the model
clf = svm.SVR(kernel="linear")
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
clf.score(X_test, y_test)

0.8581605994813564

In [19]:
for X, y in zip(X_test, y_test):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")

Model: 2488.119640585948, Actual: 2168
Model: 8618.551626213666, Actual: 9813
Model: 4178.688967489258, Actual: 3920
Model: 3591.1440183707064, Actual: 3081
Model: 284.5163361565974, Actual: 666
Model: 2495.4441900151646, Actual: 2039
Model: 2882.479927251996, Actual: 2756
Model: 11514.43248483158, Actual: 15330
Model: 7354.929101500045, Actual: 7780
Model: 11780.639476813496, Actual: 13205
Model: 495.13426255374907, Actual: 767
Model: 1974.071753109597, Actual: 1349
Model: 3422.2323629981374, Actual: 2143
Model: 589.4602200183663, Actual: 942
Model: 6154.055497304741, Actual: 6421
Model: 5929.621133890977, Actual: 9521
Model: 984.8338930989662, Actual: 810
Model: 533.7074468390592, Actual: 789
Model: 368.3088455649422, Actual: 530
Model: 691.2024334879525, Actual: 907
Model: 13616.329428911367, Actual: 16592
Model: 3106.9787801580933, Actual: 2608
Model: 3880.8789259906803, Actual: 3344
Model: 4196.783476132728, Actual: 4531
Model: 1237.8883718439083, Actual: 821
Model: 4253.486550461

In [20]:
# Train the model
clf = svm.SVR(kernel="rbf")
clf.fit(X_train, y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [22]:
clf.score(X_test, y_test)

0.4871574442344966

In [23]:
for X, y in zip(X_test, y_test):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")

Model: 2527.7093688812365, Actual: 2168
Model: 5812.294911932366, Actual: 9813
Model: 3868.8599879690055, Actual: 3920
Model: 3217.1479851774466, Actual: 3081
Model: 1217.4371243771284, Actual: 666
Model: 2012.625610083517, Actual: 2039
Model: 2792.671010683187, Actual: 2756
Model: 3936.012851835964, Actual: 15330
Model: 6333.393588793335, Actual: 7780
Model: 5160.394300670017, Actual: 13205
Model: 1315.5468249807361, Actual: 767
Model: 1613.8416567049035, Actual: 1349
Model: 3280.9689361836868, Actual: 2143
Model: 716.5406494983472, Actual: 942
Model: 5418.556431742692, Actual: 6421
Model: 5208.32222749382, Actual: 9521
Model: 1358.4573558948825, Actual: 810
Model: 668.3419422347861, Actual: 789
Model: 1136.4545702393439, Actual: 530
Model: 945.3034404490313, Actual: 907
Model: 4481.694048406474, Actual: 16592
Model: 2866.1842164524787, Actual: 2608
Model: 3572.7622260409025, Actual: 3344
Model: 4141.754864091485, Actual: 4531
Model: 1280.811367585354, Actual: 821
Model: 3895.09204386