# *Diamonds Price Prediction*

![](https://www.anglodiamond.com/media/wysiwyg/Anglo-DiamondAnatomy_03.jpg)

### Attribute information:
*price* price in US dollars (\$326--\$18,823)

*carat* weight of the diamond (0.2--5.01)

*cut* quality of the cut (Fair, Good, Very Good, Premium, Ideal)

*color* diamond colour, from J (worst) to D (best)

*clarity* a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

*x* length in mm (0--10.74)

*y* width in mm (0--58.9)

*z* depth in mm (0--31.8)

*depth* total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

*table* width of top of diamond relative to widest point (43--95)

In [None]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [None]:
# Load the diamonds dataset, using the first CSV column as the row index.
data = pd.read_csv(
    '../input/diamonds/diamonds.csv',
    index_col=0,
)

In [None]:
# Preview the first rows to check that the file parsed as expected.
data.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

In [None]:
# Summary (count, unique, top, freq) restricted to the object-dtype columns.
data.describe(include='O')

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# The frame also holds categorical text columns (cut, color, clarity).
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises instead, so select the numeric dtypes explicitly.
# This form also works on older pandas versions.
data.select_dtypes(include='number').corr()

In [None]:
# Number of diamonds in each cut grade, most frequent first.
data['cut'].value_counts()

In [None]:
# Number of diamonds in each color grade, most frequent first.
data['color'].value_counts()

In [None]:
# Number of diamonds in each clarity grade, most frequent first.
data['clarity'].value_counts()

## *visualizing trends between attributes*

In [None]:
# Price distribution within each clarity grade.
sb.violinplot(data=data, x='clarity', y='price')

In [None]:
# Price distribution within each cut grade.
sb.violinplot(data=data, x='cut', y='price')

In [None]:
# Conditional density of price per cut grade. multiple="fill" normalises the
# stacked densities so each band shows the share of a cut at a given price;
# clip=(0, None) stops the KDE from being evaluated below zero.
sb.displot(
    data=data,
    x='price',
    hue='cut',
    kind='kde',
    height=6,
    multiple="fill",
    clip=(0, None),
    palette="ch:rot=-.25,hue=1,light=.75",
)

In [None]:
# Wide-form line plot: each selected column is drawn as its own line
# against the DataFrame index.
features = ['table', 'depth']
sb.lineplot(
    data=data[features],
    palette="tab10",
)

In [None]:
# Conditional density of carat weight per cut grade, normalised to fill the
# vertical axis so each band shows the share of a cut at a given carat.
sb.displot(
    data=data,
    x="carat",
    hue="cut",
    kind="kde",
    height=6,
    multiple="fill",
    clip=(0, None),
    palette="ch:rot=-.25,hue=1,light=.75",
)

In [None]:
# Pairwise relationships between the numeric columns, colored by cut grade.
sb.pairplot(data=data,hue='cut')
# Render the figure explicitly.
plt.show()

In [None]:
# One-hot encode the categorical columns (cut, color, clarity) into dummy
# indicator columns; the numeric columns pass through unchanged.
# NOTE(review): every category level gets its own column here; consider
# drop_first=True if collinear dummies matter for the linear model — confirm.
data1 = pd.get_dummies(data)

In [None]:
# Check the dtypes and column list of the encoded frame.
data1.info()

In [None]:
# Preview the first rows of the encoded frame.
data1.head()

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [None]:
# Feature matrix: every encoded column except the target.
x = data1.drop(columns=['price'])

In [None]:
# Target vector: the diamond price.
y=data1['price']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=4,
)

In [None]:
# Linear regression model with scikit-learn's default settings.
lin = LinearRegression()

In [None]:
# Fit the model on the training split.
lin.fit(x_train,y_train)

In [None]:
# Predicted prices for the held-out test rows.
lin_pred = lin.predict(x_test)

### plotting predicted vs actual price

In [None]:
# Scatter of actual vs predicted price with a fitted regression line (red)
# and the marginal distribution of each on the sides.
grid = sb.jointplot(x=y_test, y=lin_pred, kind='reg', line_kws={"color": "red"})
# lin_pred is a bare array with no name, so label both axes explicitly.
grid.set_axis_labels('actual price', 'predicted price')
# plt.title() would title the joint axes, where the text collides with the
# top marginal plot. A figure-level title nudged above the grid avoids that.
grid.fig.suptitle('actual vs predicted output', y=1.02)
plt.show()

In [None]:
# Evaluate the fitted linear model.
print('Linear regression accuracy: ')
# 4-fold cross-validation on the training split. With no `scoring` argument,
# cross_val_score uses the estimator's own score method (R^2 for
# LinearRegression) and returns one value per fold.
crossl = cross_val_score(estimator=lin, X=x_train, y=y_train, cv=4, verbose=0)
# Hold-out metrics on the test split.
msel = mean_squared_error(y_test, lin_pred)
rmsel = msel ** 0.5  # reuse the MSE instead of computing it a second time
mael = mean_absolute_error(y_test, lin_pred)
r2l = r2_score(y_test, lin_pred)

print('Cross val     : ',crossl)
print('MAE           : ', mael)
print('MSE (RMSE)    : ', msel,' (',rmsel,')')
print('R2 score      : ', r2l)

### This model explains about 92% of the variance in diamond prices (R² score ≈ 0.92)