In [1]:
import pandas as pd

df = pd.read_csv("dataset/diamonds.csv", index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [3]:
# df["cut"].astype("category").cat.codes
# can be used to convert cut to numbers but for classification

In [4]:
# using dictionaries as it is a regression problem
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [6]:
import sklearn
from sklearn import svm
from sklearn import preprocessing


# Shuffle the dataset
df = sklearn.utils.shuffle(df)
# Featue set
X = df.drop("price", axis=1).values
X = preprocessing.scale(X)
# price stored
y = df['price'].values


test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR(kernel="linear")
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
clf.score(X_test, y_test)

0.8797599398354586

In [9]:
# Checking how much we are near to the true values

for X,y in zip(X_test, y_test):
    print(f"Model:{clf.predict([X])[0]}, Actual: {y}")

Model:5313.769684095642, Actual: 5766
Model:1261.112211355051, Actual: 970
Model:3440.695933401, Actual: 2780
Model:2868.95743741995, Actual: 2054
Model:2394.0262458427933, Actual: 2164
Model:6299.528207644268, Actual: 6729
Model:-45.80624491539493, Actual: 438
Model:262.82439388149805, Actual: 478
Model:3151.6856180546024, Actual: 2805
Model:-121.42611606795344, Actual: 417
Model:2368.4140883371897, Actual: 1840
Model:8746.509389250621, Actual: 9823
Model:12770.0280281201, Actual: 13812
Model:-179.72793812639475, Actual: 408
Model:1859.2327935915814, Actual: 1367
Model:3544.488027435974, Actual: 3103
Model:1817.4188647384083, Actual: 1223
Model:471.95637777606544, Actual: 702
Model:3603.1207432415995, Actual: 3718
Model:8814.610078410164, Actual: 14294
Model:295.0826619671466, Actual: 756
Model:2600.208912212104, Actual: 2265
Model:1727.793682394034, Actual: 2052
Model:764.8655012139952, Actual: 805
Model:11862.171818589712, Actual: 12541
Model:2420.867321484159, Actual: 2494
Model:58

In [10]:
clf = svm.SVR(kernel="rbf")
clf.fit(X_train, y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
clf.score(X_test, y_test)

for X,y in zip(X_test, y_test):
    print(f"Model:{clf.predict([X])[0]}, Actual: {y}")

Model:4947.177584223717, Actual: 5766
Model:1332.0525106395735, Actual: 970
Model:3347.836205171712, Actual: 2780
Model:2903.6652248999967, Actual: 2054
Model:2964.1969783945246, Actual: 2164
Model:6126.069388765578, Actual: 6729
Model:725.2633116542652, Actual: 438
Model:1469.651028011164, Actual: 478
Model:3015.458428158744, Actual: 2805
Model:491.611570761605, Actual: 417
Model:2434.6082672402827, Actual: 1840
Model:6167.6268575358035, Actual: 9823
Model:5833.742918228785, Actual: 13812
Model:1204.3842861269186, Actual: 408
Model:1409.073047575649, Actual: 1367
Model:3380.7330999667097, Actual: 3103
Model:1374.194136448497, Actual: 1223
Model:656.4810621894012, Actual: 702
Model:3327.3309390691606, Actual: 3718
Model:7790.7920897567, Actual: 14294
Model:885.2379250515669, Actual: 756
Model:2556.643928693622, Actual: 2265
Model:1475.5918078564885, Actual: 2052
Model:1013.4655180221216, Actual: 805
Model:6061.536423824184, Actual: 12541
Model:2520.7724283467132, Actual: 2494
Model:646