# Example

Given various attributes of a diamond, use a Support Vector Regressor (SVR) to predict its price.

In [None]:
# Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Section A

Exploratory Data Analysis

In [None]:
# Location of the diamonds CSV (a relative path, resolved against the working directory).
DIAMONDS_CSV = "../input/diamonds/diamonds.csv"
df = pd.read_csv(DIAMONDS_CSV)

In [None]:
# (rows, columns) of the data as loaded from the CSV.
df.shape

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# List the column labels (the next cell drops 'Unnamed: 0').
df.columns

In [None]:
# Remove the 'Unnamed: 0' column (presumably the row index saved into the CSV —
# verify against the file). errors="ignore" keeps this cell re-runnable: `df` is
# overwritten here, so a second run would otherwise raise KeyError because the
# column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Preview again after dropping 'Unnamed: 0'.
df.head()

The columns carat, cut, color, clarity, depth, and table appear to be important for determining the price of a diamond (note that the selection below keeps all of them except table). The values of cut, color, and clarity are categorical (qualitative), so let's convert them into numerical (quantitative) form.

In [None]:
# Keep only the modelling columns plus the target. The explicit .copy() makes
# `df` an independent frame, so the later mutations (in-place drop_duplicates,
# column re-encoding) do not act on a selection of another frame and trigger
# pandas' SettingWithCopyWarning.
df = df[['carat', 'cut', 'color', 'clarity', 'depth', 'price']].copy()

In [None]:
# Preview the reduced frame (five features plus 'price').
df.head()

In [None]:
# (rows, columns) after the column selection.
df.shape

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

In [None]:
# Drop exact duplicate rows (the first occurrence is kept). Reassigning instead
# of using inplace=True avoids mutating a frame that was itself selected from
# another one, and makes the data lineage explicit.
df = df.drop_duplicates()

In [None]:
from collections import Counter

# Frequency of each distinct label in the qualitative 'cut' column.
cut_counts = Counter(df['cut'])
cut_counts

In [None]:
# Bar chart of the number of rows per 'cut' label.
sns.countplot(data=df, x='cut')

In [None]:
# Frequency of each distinct label in the 'color' column.
Counter(df['color'])

In [None]:
# Bar chart of the number of rows per 'color' label.
sns.countplot(data=df, x='color')

In [None]:
# Frequency of each distinct label in the 'clarity' column.
Counter(df['clarity'])

In [None]:
# Bar chart of the number of rows per 'clarity' label.
sns.countplot(data=df, x='clarity')

To transform the qualitative features into quantitative ones, we replace each category label with an integer code, using one dictionary per feature:

cut = {'Ideal': 1,
         'Premium': 2,
         'Good': 3,
         'Very Good': 4,
         'Fair': 5}

color = {'E': 1,
         'I': 2,
         'J': 3,
         'H': 4,
         'F': 5,
         'G': 6,
         'D': 7}

clarity = {'SI2': 1,
         'SI1': 2,
         'VS1': 3,
         'VS2': 4,
         'VVS2': 5,
         'VVS1': 6,
         'I1': 7,
         'IF': 8}

In [None]:
# Label -> integer code lookup tables for the three qualitative columns.
# Codes are assigned 1, 2, 3, ... in the order the labels are listed.
# NOTE(review): the listed order does not look like a grade order (e.g. for cut,
# 'Good' -> 3 while 'Very Good' -> 4 and 'Fair' -> 5) — confirm whether an
# ordinal encoding was intended.
cut = {
    label: code
    for code, label in enumerate(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], start=1)
}

color = {
    label: code
    for code, label in enumerate(['E', 'I', 'J', 'H', 'F', 'G', 'D'], start=1)
}

clarity = {
    label: code
    for code, label in enumerate(
        ['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'], start=1
    )
}

In [None]:
# Replace the text labels of each qualitative column with their integer codes.
for column, codes in (("cut", cut), ("color", color), ("clarity", clarity)):
    encoded = df[column].map(codes)
    # Series.map yields NaN for every value that is not a key of the dict — this
    # also happens when the cell is re-run on already-encoded data — so fail
    # loudly instead of silently filling the column with NaN.
    unmapped_mask = encoded.isna()
    if unmapped_mask.any():
        unmapped = df.loc[unmapped_mask, column].unique()
        raise ValueError(f"Unmapped values in column '{column}': {unmapped!r}")
    df[column] = encoded

In [None]:
# Preview the frame after the label-to-code mapping.
df.head()

In [None]:
from sklearn.utils import shuffle

# shuffle() returns a shuffled copy and leaves its argument untouched, so the
# result must be assigned back; previously it was discarded and `df` was never
# shuffled. A fixed random_state keeps the row order reproducible across runs.
df = shuffle(df, random_state=42)

In [None]:
# Summary of the frame's columns (dtypes and non-null counts).
df.info()

In [None]:
# Descriptive statistics for each column.
df.describe()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of `df` and wrap the result back into a DataFrame
# that carries the original index and column labels.
# NOTE(review): the scaler is fitted on all rows and on the target 'price' too,
# before the later train/test split — test rows therefore influence the scaling
# and downstream error metrics are in standardized units. Confirm this is
# acceptable.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.values)

scaled_features_df = pd.DataFrame(
    scaled_features,
    index=df.index,
    columns=df.columns,
)

In [None]:
# Preview the standardized values.
scaled_features_df.head()

In [None]:
# The unscaled frame, for comparison with the standardized preview.
df.head()

In [None]:
# Review outliers: one box plot per standardized feature, stacked vertically.
import warnings
# NOTE(review): this silences ALL warnings for the rest of the notebook; it is
# kept so that later cells behave as before, but consider narrowing the filter.
warnings.filterwarnings('ignore')

boxplot_columns = ['carat', 'cut', 'color', 'clarity', 'depth']
fig, axs = plt.subplots(len(boxplot_columns), figsize=(5, 7))
for ax, column in zip(axs, boxplot_columns):
    # Pass the series as `x=` explicitly: a bare positional argument is read
    # differently by different seaborn releases (x in older ones, `data` in
    # newer ones), which changes the orientation of the plot.
    sns.boxplot(x=scaled_features_df[column], ax=ax).set_title(column)
plt.tight_layout()

The data looks well distributed.

In [None]:
# Scatter-plot grid: each feature on the x-axis against every feature and the
# target 'price' on the y-axis, to see how price relates to the other variables.
pairplot_features = ['carat', 'cut', 'color', 'clarity', 'depth']
sns.pairplot(
    scaled_features_df,
    x_vars=pairplot_features,
    y_vars=pairplot_features + ['price'],
    kind='scatter',
)
plt.tight_layout()

carat and depth seem to be very important features for price determination.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlation_matrix = scaled_features_df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlation_matrix, annot=True, ax=ax)
plt.tight_layout()

carat is a significant determinant of price; however, it is not correlated with any of the other features. Let's also look at the Variance Inflation Factors (VIFs) to check for multicollinearity.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF measures how well each predictor is explained by the OTHER predictors, so
# the target column 'price' must not be part of the design matrix (it was
# included before, which distorts every reported value).
predictors_df = scaled_features_df.drop(columns='price')
predictor_values = predictors_df.values

# One row per predictor: its name and its variance inflation factor.
vif_data = pd.DataFrame({
    "feature": predictors_df.columns,
    "VIF": [
        variance_inflation_factor(predictor_values, i)
        for i in range(predictor_values.shape[1])
    ],
})

vif_data

# Section B

Building SVR model

In [None]:
# Column labels available for building X and y.
scaled_features_df.columns

In [None]:
# Step 4: ML Model — build the feature matrix / target and hold out 30% for testing.
from sklearn.model_selection import train_test_split

X = scaled_features_df[['carat', 'cut', 'color', 'clarity', 'depth']]
y = scaled_features_df['price']
# A fixed random_state makes the split — and therefore every metric computed
# from it — reproducible across kernel restarts.
# NOTE(review): X and y come from a frame that was standardized before this
# split, so the test rows already influenced the scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.svm import SVR

# Support Vector Regressor with its default hyper-parameters, trained on the
# training portion of the split.
model = SVR()
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted (standardized) prices for the held-out test rows.
model_predictions = model.predict(X_test)

In [None]:
# Model score on the test set (scikit-learn documents `score` of a regressor as R^2).
print(model.score(X_test, y_test))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score

# Error metrics on the held-out test set. Because y is the standardized price,
# MAE / MSE / RMSE are expressed in standardized units, not in currency.
mse = mean_squared_error(y_test, model_predictions)
# RMSE is derived from the MSE instead of passing `squared=False`, which is
# deprecated in recent scikit-learn releases and removed in later ones.
rmse = mse ** 0.5

print('MAE: ', mean_absolute_error(y_test, model_predictions))
print('MSE: ', mse)
print('EVS: ', explained_variance_score(y_test, model_predictions))
print('R2 Score: ', r2_score(y_test, model_predictions))
print('RMSE: ', rmse)

# Conclusion

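The RMSE, MSE, and MAE are very low in this case. Note, however, that the target price was standardized together with the features, so these errors are expressed in standard deviations of price rather than in currency units; the R² score is the scale-independent measure of fit. The feature scaling worked well here, and SVR produced an ML model that is capable of predicting the price accurately.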