In [1]:
import pandas as pd

In [9]:
df = pd.read_csv('../datasets/diamonds.csv', index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [10]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [11]:
# create dictionary
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [16]:
import sklearn
from sklearn import svm, preprocessing

In [19]:
df = sklearn.utils.shuffle(df)

X = df.drop('price', axis=1).values
X = preprocessing.scale(X)
y = df['price'].values

test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR(kernel='linear')
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [20]:
clf.score(X_test, y_test)

0.8672466448115043

In [21]:
for X,y in zip(X_test, y_test):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")

Model: 198.22314466250282, Actual: 776
Model: 4911.555198240467, Actual: 3763
Model: 4701.036421847554, Actual: 4102
Model: 12543.250444533702, Actual: 18760
Model: 4234.152243467452, Actual: 4234
Model: 1162.3988495766243, Actual: 1000
Model: 3238.6732584972337, Actual: 2444
Model: 6719.089973642544, Actual: 8557
Model: 9569.97167095251, Actual: 9165
Model: 5589.91852074186, Actual: 4044
Model: 1324.6978488712066, Actual: 1076
Model: 2198.497667668706, Actual: 2218
Model: 6195.305680314427, Actual: 7273
Model: 448.75392099497503, Actual: 687
Model: 879.3981636291387, Actual: 982
Model: 664.8223386705249, Actual: 926
Model: 4910.414391590085, Actual: 5784
Model: 6904.513263664132, Actual: 7222
Model: 13451.01147447138, Actual: 14220
Model: 5232.34386195492, Actual: 5495
Model: 556.444822640965, Actual: 878
Model: 1597.8168299265226, Actual: 1206
Model: 333.369045192042, Actual: 844
Model: 3457.7056532224774, Actual: 2720
Model: 2646.215775811962, Actual: 1250
Model: 11951.24814501402, 

In [22]:
clf = svm.SVR(kernel='rbf')
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [23]:
print(clf.score(X_test, y_test))

for X,y in zip(X_test, y_test):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")

0.556688327118626
Model: 680.1306334468868, Actual: 776
Model: 4312.5704871039, Actual: 3763
Model: 4802.168738209678, Actual: 4102
Model: 5444.041808839835, Actual: 18760
Model: 4139.569412965818, Actual: 4234
Model: 890.8770212104855, Actual: 1000
Model: 3136.618082933882, Actual: 2444
Model: 5938.3234934546745, Actual: 8557
Model: 6541.835983442123, Actual: 9165
Model: 4041.8696588980642, Actual: 4044
Model: 1039.0261995860524, Actual: 1076
Model: 2176.885568612427, Actual: 2218
Model: 5507.550167372362, Actual: 7273
Model: 749.4001573496662, Actual: 687
Model: 1208.2109239173305, Actual: 982
Model: 1048.6775965677493, Actual: 926
Model: 3853.8035346290385, Actual: 5784
Model: 5951.4362455363225, Actual: 7222
Model: 4806.860852326745, Actual: 14220
Model: 5283.739679906704, Actual: 5495
Model: 594.3301636547221, Actual: 878
Model: 1732.2794365512254, Actual: 1206
Model: 708.7511716383424, Actual: 844
Model: 3435.763155321486, Actual: 2720
Model: 3086.087530960346, Actual: 1250
Model