In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


##visual imports
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

##Missing data
from sklearn.impute import SimpleImputer

##Categorical Encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

##Feature Scaling
from sklearn.preprocessing import StandardScaler

##Splitting data
from sklearn.model_selection import train_test_split

#Splitting Data
from sklearn.model_selection import train_test_split

# Feature Scaling
from sklearn.preprocessing import StandardScaler


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Diamonds are forever!

Hello and welcome to my kaggle workbook!

In this sheet I will look to analyse the data of diamonds in America and then produce a model that can predict the price of a diamond. 

To start I have detailed the features below

**Feature details**

* Price: price in US dollars (within range of 326 - 18,823)

* Carat: weight of the diamond (0.2-5.01)

* Cut: quality of the cut (in ascending order from Fair, Good, Very Good, Premium, Ideal)

* Color: diamond colour, from J (worst) to D (best)

* Clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

* X: length in mm (0--10.74)

* Y: width in mm (0--58.9)

* Z: depth in mm (0--31.8)

* Depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

* Table: width of top of diamond relative to widest point (43--95)

In [None]:
# Load the diamonds CSV from the Kaggle input directory into a DataFrame.
diamond_data = pd.read_csv("/kaggle/input/diamonds/diamonds.csv")

In [None]:
# Preview the first rows to check the column layout.
diamond_data.head()

In [None]:
# Column dtypes and non-null counts: used to check for missing values.
diamond_data.info()

# NO NULL VALUES!

In [None]:
# Summary statistics of the raw data, before any cleaning.
diamond_data.describe()

# Findings

X, Y and Z all contain 0 values, a 0 value suggests that the diamond is dimensionless - which cannot be correct. 

This will require some later investigation, if there are few 0 values in the data set then they can be dropped.  

Large positive outliers in both the carat and price sections, when compared with the mean. 

In [None]:
# Frequency table for each ordinal feature, with a rule between sections.
breakdown_sections = [
    ("Cut Breakdown\n", "cut"),
    ("Color Breakdown\n", "color"),
    ("Clarity Breakdown\n", "clarity"),
]
for position, (heading, column) in enumerate(breakdown_sections):
    if position > 0:
        print("_"*20)
    print(heading)
    print(diamond_data[column].value_counts())

# Analysis of Ordinal Data

**Cut Breakdown**
* Skew towards the higher quality cut of diamond, not evenly distributed within the category

**Color Breakdown**
* The counts are concentrated around the mid-range of the Color category, with the highest and lowest quality colours seeming to be outliers compared to the rest of the data

**Clarity Breakdown**
* Top 4 are split relatively evenly between the higher and lower quality clarity. The highest and lowest are outliers within the data. Looks relatively evenly distributed between the clarity categories

# Quick Look

No null values, however there seem to be 0 values within the dimension categories that will need to be looked over.

"Unnamed" represents the index for the sheet, this can be dropped

Cut, color and clarity are all ordinal features, as per the detail above. These can be converted into numerical values so that they can be used in the model 


In [None]:
# "Unnamed: 0" is only the exported row index, so remove it from the frame.
diamond_data = diamond_data.drop(columns=["Unnamed: 0"])

# Exploratory Data Analysis

Now that I have taken a quick overview of what is in the data, I will look at graphs of the data to see what trends can be established.


In [None]:
# Pairwise scatter plots of the raw data, before any outlier handling.
sns.pairplot(diamond_data)

In [None]:
# Correlation heatmap of the numeric features.
# cut, color and clarity are still text at this point, so the frame is
# restricted to numeric columns explicitly: pandas >= 2.0 raises on
# non-numeric columns in corr() instead of silently skipping them.
plt.figure(figsize=(12, 7))
correl = diamond_data.select_dtypes(include="number").corr()
sns.heatmap(correl, annot=True)

# Correlation Findings

+ The data is clearly affected by outliers within all categories. These can be removed to give a clearer picture of the correlation

* x,y and z are heavily correlated, this is no surprise as the dimensions of the diamond should be correlated. 
  These can be consolidated into a single variable, volume, to avoid multicollinearity 

+ Price is not normally distributed and does not hold a linear relationship with all variables, this needs to be log transformed. 

+ Carat is highly correlated to the price of the diamond

+ depth and table seem to bear little relationship to other features, these could both be dropped

# Dimension Cleaning

Here, we can begin to change the x,y and z data to make the categories useable in our models. 

To start off with, we can look to drop the rows that contain x, y and z data that has 0 values in, followed by removing large outliers.

Following this we can build a new feature, volume, which will be a combination of mentioned features. 



In [None]:
# Count how many rows report a physically impossible 0 for each dimension.
zero_reports = (
    ("0 value x: {}", "x"),
    ("0 value y: {}", "y"),
    ("0 value z: {}", "z"),
)
for template, column in zero_reports:
    print(template.format((diamond_data[column] == 0).sum()))

In [None]:
# A 0 in x, y or z is a missing measurement rather than a real dimension, so
# recode it as NaN and let it show up in the per-column null counts.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
dimension_cols = ["x", "y", "z"]
diamond_data[dimension_cols] = diamond_data[dimension_cols].replace(0, np.nan)
diamond_data.isnull().sum()

In [None]:
# Discard every row that holds a missing value, then report the new size.
diamond_data = diamond_data.dropna()
diamond_data.shape

Nulls have been removed, resulting in a drop of 20 entries from the dataset.

In [None]:
# Summary statistics again, now that rows with missing dimensions are gone.
diamond_data.describe()

# Dealing with Outliers

Here we will look at the outliers within the dimension data to see if anything can be removed. After reviewing the below distribution plots and the below data we can see that there are a few outliers within each dimension:

1. X - Mean = 5.73, STD = 1.12, Min = 3.73, Max = 10.74
1. Y - Mean = 5.53, STD = 1.14, Min = 3.68, Max = 58.90
1. Z - Mean = 3.53, STD = 0.07, Min = 1.07, Max = 31.80

In [None]:
# Distribution of x in 50 bins, used to eyeball large outliers.
plt.title('X Distribution Plot')
sns.distplot(diamond_data["x"], bins = 50)

In [None]:
# Distribution of y in 50 bins, used to eyeball large outliers.
plt.title('Y Distribution Plot')
sns.distplot(diamond_data["y"], bins = 50)

In [None]:
# Distribution of z in 50 bins, used to eyeball large outliers.
plt.title('Z Distribution Plot')
sns.distplot(diamond_data["z"], bins = 50)

Whilst these outliers may be correct data points, they don't align with the rest of the dataset, so including them could affect our end results.

The x, y and z features are not normally distributed; however, it is clear from the above that the features centre around the mean with a small number of larger outliers.

To remove the outliers I will exclude values of 20 or more in Y and Z, and of 9.5 or more in X

In [None]:
# Blank out implausibly large dimensions (x >= 9.5, y >= 20, z >= 20) so that
# a subsequent dropna() removes those rows.
# The masked column is assigned back rather than calling
# diamond_data['col'].where(..., inplace=True): that form mutates a temporary
# column selection, which leaves the frame unchanged under pandas
# Copy-on-Write. Series.where() fills non-matching rows with NaN by default.
x_rep = diamond_data['x'] < 9.5
y_rep = diamond_data['y'] < 20
z_rep = diamond_data['z'] < 20
diamond_data['x'] = diamond_data['x'].where(x_rep)
diamond_data['y'] = diamond_data['y'].where(y_rep)
diamond_data['z'] = diamond_data['z'].where(z_rep)
diamond_data.isnull().sum()

In [None]:
# Remove the rows whose dimensions were blanked out, then report the new size.
diamond_data = diamond_data.dropna()
diamond_data.shape

In [None]:
# Pairwise scatter plots again, after the dimension outliers were dropped.
sns.pairplot(diamond_data)

15 rows and their large outliers have been removed. Following the removal of the outliers we can see a clearer correlation between the dimensions in the pairplot above. Now it is time to combine the features!!

In [None]:
# Collapse the three dimensions into one "vol" feature (x * y * z).
length, width, height = (diamond_data[axis] for axis in ('x', 'y', 'z'))
diamond_data['vol'] = length * width * height
diamond_data.head()

In [None]:
# The raw dimensions are now summarised by "vol", so remove them.
diamond_data = diamond_data.drop(columns=['x', 'y', 'z'])

In [None]:
# Pairwise scatter plots with x, y and z replaced by the volume feature.
sns.pairplot(diamond_data)

In [None]:
# Report the mean and the maximum of the engineered volume feature.
volume = diamond_data['vol']
print('The mean volume in the set is: {:.2f}'.format(volume.mean()))
print('The maximum volume in the set is: {:.2f}'.format(volume.max()))

In [None]:
# Correlation heatmap after x, y and z were merged into "vol".
# cut, color and clarity are still text here, so the frame is restricted to
# numeric columns explicitly: pandas >= 2.0 raises on non-numeric columns in
# corr() instead of silently skipping them.
plt.figure(figsize=(12, 7))
correl = diamond_data.select_dtypes(include="number").corr()
sns.heatmap(correl, annot=True)

In [None]:
from scipy.stats import kurtosis
from scipy.stats import skew

# Shape of the price distribution. Each label names the statistic printed
# next to it: skew() gives the skewness, and kurtosis() returns excess
# (Fisher) kurtosis by default. Both are 0 for a normal distribution.
print('skewness (0 for a normal distribution): {}'.format(skew(diamond_data['price'])))
print('excess kurtosis (0 for a normal distribution): {}'.format(kurtosis(diamond_data['price'])))

# DATA EXPLORATION

In [None]:
# Price spread within each cut grade.
sns.boxplot(x = "price", y = "cut", data = diamond_data)

In [None]:
# Price spread within each colour grade.
sns.boxplot(x = "price", y = "color", data = diamond_data)

In [None]:
# Price spread within each clarity grade.
sns.boxplot(x = "price", y = "clarity", data = diamond_data)

In [None]:
# Joint view of price against carat, both still on their original scale.
sns.jointplot(x = "price", y = "carat", data = diamond_data)

# Findings


When comparing the VVS2, VVS1, IF (top level clarity) and ideal, premium and very good (top level cut), there was very little deviation from the main distribution of price 

This can also be inferred by observing the box plot for each ordinal variable. It can be seen that there is not a high variance of the mean in each variable, with all the means being affected by large outliers

# Non Normality and Homoscedasticity

As we can see from the plots below, the price is skewed and is not normally distributed. When compared with some other features we can also see that the data is not fully homoscedastic. To fix this we will log transform the price feature.

In [None]:
# Distribution of price on its original scale, in 50 bins.
plt.figure(figsize=(12, 7))
sns.distplot(diamond_data["price"], bins = 50)

In [None]:
# Replace price, carat and vol with their natural logarithms.
for column in ("price", "carat", "vol"):
    diamond_data[column] = np.log(diamond_data[column])

In [None]:
# Distribution of the price column plus its shape statistics.
# Each label names the statistic printed next to it: skew() gives the
# skewness, and kurtosis() returns excess (Fisher) kurtosis by default.
plt.figure(figsize=(12, 7))
plt.title('Price Distribution')
sns.distplot(diamond_data["price"])
print('skewness (0 for a normal distribution): {}'.format(skew(diamond_data['price'])))
print('excess kurtosis (0 for a normal distribution): {}'.format(kurtosis(diamond_data['price'])))

In [None]:
# Distribution of the vol column plus its shape statistics.
# Each label names the statistic printed next to it: skew() gives the
# skewness, and kurtosis() returns excess (Fisher) kurtosis by default.
plt.figure(figsize=(12, 7))
plt.title('Volume Distribution')
sns.distplot(diamond_data["vol"])
print('skewness (0 for a normal distribution): {}'.format(skew(diamond_data['vol'])))
print('excess kurtosis (0 for a normal distribution): {}'.format(kurtosis(diamond_data['vol'])))

In [None]:
# Distribution of the carat column plus its shape statistics.
# Each label names the statistic printed next to it: skew() gives the
# skewness, and kurtosis() returns excess (Fisher) kurtosis by default.
plt.figure(figsize=(12, 7))
plt.title('Carat Distribution')
sns.distplot(diamond_data["carat"])
print('skewness (0 for a normal distribution): {}'.format(skew(diamond_data['carat'])))
print('excess kurtosis (0 for a normal distribution): {}'.format(kurtosis(diamond_data['carat'])))

In [None]:
# Pairwise scatter plots after the log transform of price, carat and vol.
sns.pairplot(diamond_data)

# Converting Ordinal Features!

Here we can convert the features for clarity, cut and color into numerical features so they can be used to train our models
*     Cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)   
*     Color: diamond colour, from J, I, H, G ,F , E, D (best)
*     Clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

In [None]:
# Ordinal encodings: grades are listed worst to best and numbered from 1.
cut_mapping = {
    grade: rank
    for rank, grade in enumerate(["Fair", "Good", "Very Good", "Premium", "Ideal"], start=1)
}
color_mapping = {
    grade: rank
    for rank, grade in enumerate(["J", "I", "H", "G", "F", "E", "D"], start=1)
}
clarity_mapping = {
    grade: rank
    for rank, grade in enumerate(["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"], start=1)
}

# Swap each text grade for its rank, one column at a time.
for column, mapping in (
    ("cut", cut_mapping),
    ("color", color_mapping),
    ("clarity", clarity_mapping),
):
    diamond_data[column] = diamond_data[column].map(mapping)

diamond_data.head()

# Building the Models

Now that we have converted all the data into numeric values and also removed any outliers we can start to build the models

In [None]:
# Simple & Multi Linear Regression
from sklearn.linear_model import LinearRegression, Lasso, Ridge, SGDRegressor

# Polynomial Regression 
from sklearn.preprocessing import PolynomialFeatures

#Support Vector Regression
from sklearn.svm import SVR

#CART Regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

#K-Nearest Neighbours
from sklearn.neighbors import KNeighborsRegressor

#XG Boost
from xgboost import XGBRegressor


In [None]:
# Hold out 25% of the rows for testing; every column except price is a feature.
features = diamond_data.drop(columns='price')
target = diamond_data['price']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=101
)

In [None]:
from sklearn import metrics
from sklearn.metrics import r2_score

# Baseline: ordinary least squares fitted on the training split and scored on
# the held-out test split. All error metrics are in the units of y_test.
LR = LinearRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)

R2 = r2_score(y_test, y_pred)

# Adjusted R2 must use the size of the sample R2 was computed on (the test
# split), not the row count of the whole dataset, otherwise the penalty for
# the number of predictors is understated.
n = X_test.shape[0]  # observations scored
p = X_test.shape[1]  # number of predictors

adj_rsquared = 1 - (1 - R2) * ((n - 1) / (n - p - 1))

# Compute MSE once and derive RMSE from it.
mse = metrics.mean_squared_error(y_test, y_pred)

print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print("r2: ", R2)
print("Adjusted r2:", adj_rsquared)



# Linear Regression

Using a simple Linear Regression model we get an R2 value of around 98% with an adjusted R2 of around 98%. Which is not bad!

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear model on the training split.
N_FOLDS = 10

R2accuracies = cross_val_score(estimator=LR, X=X_train, y=y_train, cv=N_FOLDS, scoring='r2')
MSEaccuracies = cross_val_score(estimator=LR, X=X_train, y=y_train, cv=N_FOLDS,
                                scoring='neg_mean_squared_error')
# With no explicit scoring a regressor is scored with R2, so the default run
# would only repeat R2accuracies; reuse it instead of fitting ten more models.
accuracies = R2accuracies

# Each fold is scored on roughly 1/N_FOLDS of the training rows, so that is
# the sample size the adjusted-R2 correction has to use.
n = X_train.shape[0] // N_FOLDS
p = X_train.shape[1]

# scikit-learn reports MSE negated so that "higher is better"; flip the sign.
MSE = -MSEaccuracies.mean()

# The adjusted-R2 formula expects R2 as a fraction. Apply it first and only
# then scale to a percentage for display; feeding in the percentage gives a
# meaningless value.
r2_fraction = R2accuracies.mean()
adj_rsquared = (1 - (1 - r2_fraction) * ((n - 1) / (n - p - 1))) * 100
R2 = r2_fraction * 100

print("MSE: {:.2f}".format(MSE))
print("RMSE: {:.2f}".format((MSE**0.5)))
print("R2: {:.2f}".format((R2)))
print("Adjusted R2: {:.2f}".format(adj_rsquared))

# Cross Validation

Using 10-fold cross-validation to fit and score the model 10 times yields a similar result to the initial model that was run.



In [None]:
from sklearn.model_selection import ShuffleSplit
# Ten random shuffle splits with a fixed seed. train_size (0.6) and
# test_size (0.3) sum to 0.9, so 10% of the rows are left out of every split.
cv_split = ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 )

In [None]:
from sklearn.model_selection import cross_validate

# Candidate regressors, compared with identical cross-validation splits.
MLA = [LinearRegression(), DecisionTreeRegressor(), KNeighborsRegressor(), XGBRegressor(), RandomForestRegressor()]
MLA_columns = ["MLA Name", "Mean Price", "MAE", "MSE", "RMSE", "R2", "Adjusted R2"]

# Adjusted R2 needs the size of the sample each score is computed on, i.e.
# the validation part of one shuffle split, and the number of predictors.
n = len(next(cv_split.split(X_train))[1])
p = X_train.shape[1]

# One cross_validate call scores all three metrics from a single set of fits,
# instead of refitting every model once per metric.
cv_metrics = ["r2", "neg_mean_squared_error", "neg_mean_absolute_error"]

MLA_rows = []
for alg in MLA:
    MLA_name = alg.__class__.__name__

    # Fit on the training split; the test-split predictions feed "Mean Price".
    alg.fit(X_train, y_train)
    pred = alg.predict(X_test)

    scores = cross_validate(alg, X_train, y_train, cv=cv_split, scoring=cv_metrics)

    # Error scorers are negated by scikit-learn; flip the sign back.
    MSE = -scores["test_neg_mean_squared_error"].mean()
    MAE = -scores["test_neg_mean_absolute_error"].mean()

    # Apply the adjusted-R2 formula to the fractional R2 and only then convert
    # to a percentage; applying it to the percentage gives a meaningless value.
    r2_fraction = scores["test_r2"].mean()
    R2 = r2_fraction * 100
    adj_rsquared = (1 - (1 - r2_fraction) * ((n - 1) / (n - p - 1))) * 100

    MLA_rows.append({
        "MLA Name": MLA_name,
        # Mean of the predictions, in the same units as y_train.
        "Mean Price": pred.mean(),
        "MAE": MAE,
        # Kept as a float: int() would truncate values below 1 to 0.
        "MSE": MSE,
        "RMSE": MSE**0.5,
        "R2": R2,
        "Adjusted R2": adj_rsquared,
    })

# Build the frame in one go so the metric columns get numeric dtypes.
MLA_compare = pd.DataFrame(MLA_rows, columns=MLA_columns)

MLA_compare.sort_values(by=["R2"], ascending=False, inplace=True)
MLA_compare

In [None]:
# Horizontal bar chart of the R2 column for each model in MLA_compare.
plt.title("MLA Accuracy Rank")
sns.barplot(x = "R2", y = "MLA Name", data = MLA_compare)

# Findings 

From this we can see that the XGBRegressor and RandomForestRegressor yield the highest R2 values, which suggests that they are the most accurate.

If you liked this sheet then please leave a comment. 

I am new to the world of data science, with all my knowledge being self taught, so I might have misunderstood some concepts or incorrectly applied some of the logic. If you notice anything that is incorrect or something that looks a little wrong then please leave a comment and let me know.

Thanks!
