In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print one path per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Diamond Price Prediction

**Diamond is the only gem made of a single element: it is typically about 99.95 percent carbon. The other 0.05 percent can include one or more trace elements. Diamonds were formed billions of years ago and are extremely rare because so few are able to survive the difficult journey from the depths of the earth to the earth's surface.**

# Aim of The Kernel
**The aim of this kernel is to perform a proper exploratory data analysis of the given diamond data set and to predict the diamond price (target) from its predictor variables.**

![](https://k9f7a9j9.rocketcdn.me/wp-content/uploads/2020/11/shutterstock_99204875.jpg)

# Exploratory Data Analysis

In [None]:

# Importing the required data manipulation and visualization libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the diamond data from the Kaggle input directory.
DIAMONDS_CSV = "/kaggle/input/diamonds/diamonds.csv"
df = pd.read_csv(DIAMONDS_CSV)

In [None]:
df.info() # dtypes and non-null counts per column; the author's reading: 8 numerical and 3 categorical variables.

In [None]:
df.shape # (rows, columns) of the raw frame; the author reports 53940 rows of observations.

In [None]:
df.columns # column labels of the raw frame, i.e. the names of the features.

In [None]:
df.head(10) # first look at the data. The 'Unnamed: 0' column looks like a redundant index; it is dropped in a later cell.

In [None]:
df.describe().T 
# Descriptive statistics of the numerical features, transposed so that each feature is one row.
# The price feature has a huge gap between its min and max values; there might be outliers.

In [None]:
df.isna().sum() # number of missing values per column; the author reports that there are none.

In [None]:
# Drop the redundant CSV index column. Reassigning (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# no longer raises KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Correlation matrix of the numerical features only. cut, color and clarity
# are still categorical (string) columns at this point, and pandas >= 2.0
# raises when corr() is called on a frame that contains non-numeric columns.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True, annot=True
)
# Correlation table of the numerical features.
# The strong correlation between carat and price stands out.


In [None]:
# Box plot of the carat feature. There are a lot of outliers.
sns.boxplot(data=df, x="carat", color="lime");

In [None]:
# Distribution of the carat feature: histogram with a KDE overlay on a
# density scale. histplot replaces distplot, which is deprecated since
# seaborn 0.11.
# The distribution is right-skewed (positively skewed): the outliers sit on
# the high-carat side, so the long tail runs to the right.
sns.histplot(data=df, x="carat", kde=True, stat="density", color="green")

In [None]:
# Box plot of the price feature. The price column also has many outliers.
sns.boxplot(data=df, x="price", color="cyan")

In [None]:
# Distribution of the price feature: histogram with a KDE overlay on a
# density scale. histplot replaces distplot, which is deprecated since
# seaborn 0.11.
# Like carat, price is right-skewed: its outliers are on the high-price side.
sns.histplot(data=df, x="price", kde=True, stat="density", color="darkblue")

In [None]:
# Distinct values of the cut column. Cut quality is an ordinal variable:
# Fair, Good, Very Good, Premium, Ideal.
df["cut"].unique()

In [None]:
# Number of diamonds per cut type.
df["cut"].value_counts()

In [None]:
# Bar plot of the number of diamonds per cut type. catplot is the current
# name of factorplot, which was renamed in seaborn 0.9 and later removed.
# Most of the diamonds in the data have an Ideal cut.
sns.catplot(x='cut', data=df, kind='count', aspect=2.5)

In [None]:
# Distinct values of the color column; the data contains 7 different diamond colors.
df["color"].unique()

In [None]:
# Number of diamonds per color.
df["color"].value_counts()

In [None]:
# Bar plot of the number of diamonds per color. catplot is the current name
# of factorplot, which was renamed in seaborn 0.9 and later removed.
# Diamond colors range from D (best) to J (worst).
# 'G' is the most common color.
sns.catplot(x='color', data=df, kind='count', aspect=2.5)

In [None]:
# Group the diamonds by color and look at the mean carat of each group.
df.groupby('color')['carat'].mean()

In [None]:
# Mean carat per color group.
# NOTE(review): this repeats the computation of the preceding cell and looks redundant; consider removing it.
df.groupby('color').carat.mean() 

In [None]:
# Bar plot of the mean carat for each color, one bar color per group.
# Diamond colors range from D (best) to J (worst).
carat_mean_per_color = df.groupby('color')['carat'].mean()
bar_colors = ['black', 'red', 'green', 'blue', 'cyan', 'purple', 'yellow']
carat_mean_per_color.plot(kind='bar', color=bar_colors, xlabel='color', ylabel="carat")

In [None]:
# Mean price of each cut type.
CutPrice = df.groupby('cut')['price'].mean()
CutPrice

In [None]:
# Pie plot of the mean prices of the cut types.
# Premium has the highest mean price.
CutPrice.plot.pie(figsize=(7, 7))

# Outlier Elimination
To identify outliers, we use box plots to see the point from which the outlier observations start.

In [None]:
# Box plot of carat: values above 3 appear to be outliers.
sns.boxplot(data=df, x="carat", color="cyan")

In [None]:
# Box plot of price. The column is passed through x=/data= like the other
# box plots: a positional Series is interpreted as `data` by seaborn >= 0.12,
# which changes how the plot is drawn.
# The author reads the outliers of the price column as starting around 13000.
# NOTE(review): the filtering cell further down cuts price at 14500 — confirm
# which threshold is intended.
sns.boxplot(x="price", data=df, color="purple")

In [None]:
# Box plot of the table feature, used to spot its outlier observations.
sns.boxplot(data=df, x="table", color="red")

In [None]:
# Apply outlier elimination: keep only the rows whose price, carat and table
# values all lie inside the ranges chosen for each feature.
inlier_mask = (
    (df["price"] < 14500)
    & (df["carat"] < 3)
    & (df["table"] < 64)
    & (df["table"] > 50)
)
df = df[inlier_mask]

In [None]:
# Box plot of the table column after outlier elimination.
sns.boxplot(data=df, x="table", color="red")

In [None]:
# (rows, columns) of the frame that remains after the outlier filter.
df.shape 

In [None]:
# Work on an independent copy so that the categorical encoding below does
# not modify df.
diamond = df.copy(deep=True)

# Categorical Feature Encoding

In [None]:
# Encode the three ordinal categorical features as integer ranks (worst -> best).
# LabelEncoder is intended for target labels and numbers the classes
# alphabetically, which scrambles the quality order (e.g. Fair < Good < Ideal
# < Premium < Very Good). OrdinalEncoder with explicit category lists keeps
# the real order, which matters for the linear model in particular.
from sklearn import preprocessing

CUT_ORDER = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
COLOR_ORDER = ["J", "I", "H", "G", "F", "E", "D"]  # J (worst) ... D (best)
# NOTE(review): assumes the standard clarity scale of the diamonds data set —
# confirm against diamond['clarity'].unique().
CLARITY_ORDER = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

ordinal_columns = ['cut', 'color', 'clarity']
# With the default handle_unknown='error', a value missing from the lists
# above raises instead of being encoded silently.
ordinal_encoder = preprocessing.OrdinalEncoder(
    categories=[CUT_ORDER, COLOR_ORDER, CLARITY_ORDER]
)
diamond[ordinal_columns] = (
    ordinal_encoder.fit_transform(diamond[ordinal_columns]).astype(int)
)

In [None]:
diamond.head(7) # first 7 rows of the encoded frame: cut, color and clarity now hold integer codes.

# Train - Validation Split

In [None]:
y = diamond['price']  # target variable

# Predictor variables. The target ('price') must not be listed here: feeding
# it to the models is target leakage, which lets them reproduce the price
# directly and makes every error metric meaninglessly good.
features = ['carat', 'cut', 'color', 'clarity',
            'depth', 'table']

X = diamond[features]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state makes the
# split reproducible.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2021,
)

# Building Models

In [None]:
# Importing the standardizer, the regression models and their error metrics.
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [None]:
# Build one pipeline per regressor; each pipeline standardizes the features
# before fitting its model.
# The stochastic models get a fixed seed (the same value used for the
# train/validation split) so that repeated runs give the same scores.
SEED = 2021

pipeLR= Pipeline([("std_scalar1",StandardScaler()),
                     ("lr_regressor",LinearRegression())])

pipeDT= Pipeline([("std_scalar2",StandardScaler()),
                     ("dt_regressor",DecisionTreeRegressor(random_state=SEED))])

pipeRF= Pipeline([("std_scalar3",StandardScaler()),
                     ("rf_regressor",RandomForestRegressor(random_state=SEED))])

# The step name "XGB_regressorr" is kept as-is because step names are part of
# the pipeline's parameter interface.
pipeXGB= Pipeline([("std_scalar4",StandardScaler()),
                     ("XGB_regressorr",XGBRegressor(random_state=SEED))])

In [None]:
# All pipelines, in the order used for reporting.
pipelines = [pipeLR, pipeDT, pipeRF, pipeXGB]

# Position in `pipelines` -> display name of the model.
pipe_dict = dict(enumerate(
    ["LinearRegression", "DecisionTree", "RandomForest", "XGBRegressor"]
))

# Fit every pipeline on the training split.
for pipe in pipelines:
    pipe.fit(train_X, train_y)

In [None]:
# 10-fold cross-validation of every pipeline on the training split.
# The scores are NEGATIVE mean squared errors (not root mean squared errors):
# scikit-learn negates the MSE so that "greater is better", so values closer
# to zero mean a better fit.
cv_results_NMSE = []
for i, model in enumerate(pipelines):
    cv_score = cross_val_score(model, train_X, train_y,
                               scoring="neg_mean_squared_error", cv=10)
    cv_results_NMSE.append(cv_score)
    print("%s: %f " % (pipe_dict[i], cv_score.mean()))

# Reading the results: a low cross-validated error does not by itself point
# to overfitting, because every fold is scored on data the model has not
# seen. An error that is suspiciously close to zero is a warning sign of
# target leakage instead; check that the target is not among the features.
# The XGB regressor is carried forward as the main model for the evaluation
# on the validation data.

# Model Evaluation

In [None]:
# Predictions of the fitted XGB pipeline on the validation features; they are
# compared with val_y in the evaluation cell.
pred = pipeXGB.predict(val_X)

In [None]:
# Evaluate the validation predictions with several error metrics.
# Each metric is computed once and then reported.
r2 = metrics.r2_score(val_y, pred)
n_obs = len(val_y)
n_predictors = val_X.shape[1]
# Adjusted R^2 penalizes R^2 for the number of predictors used.
adjusted_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_predictors - 1)
mae = metrics.mean_absolute_error(val_y, pred)
mse = metrics.mean_squared_error(val_y, pred)

print("R^2:", r2)
print("Adjusted R^2:", adjusted_r2)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

# Conclusion

**In this kernel, the features of the data are examined in order to predict the diamond price. Before predicting the price, exploratory data analysis was performed, outliers were eliminated, categorical features were encoded, and numerical features were standardized. To predict the price, the Linear Regression, Decision Tree Regressor, Random Forest Regressor, and XGB Regressor models are compared. Among them, the XGB Regressor has been the most successful at predicting the diamond price.**

**Ask me anything if you have questions. If you like the notebook, please vote up :=)**