# Diamonds Dataset Analysis

### Importing the modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### loading Data into DataFrame

In [None]:
# Read the diamonds CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../input/diamonds/diamonds.csv")
df.head()

In [None]:
# Drop the 'Unnamed: 0' column in place, then preview two rows.
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head(n=2)

## Analysing Data

### Data type of each attribute

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

### Counting the number of NaN values in each attribute

In [None]:
# Per-column count of missing values (isnull is an alias of isna).
df.isnull().sum()

In [None]:
# (rows, columns) of the frame at this point.
df.shape

In [None]:
# Summary statistics of the frame; used below to spot zero-valued dimensions.
df.describe()

#### We can see that some values of x, y and z (the dimensions) are zero, which is not possible, so we will drop the rows in which x, y or z is equal to zero

In [None]:
# Remove, in place, every row in which at least one of x, y or z equals
# zero, then report the remaining (rows, columns).
df.drop(
    index=df.index[(df[['x', 'y', 'z']] == 0).any(axis=1)],
    inplace=True,
)
df.shape

In [None]:
# List the column labels that remain after the cleaning steps.
df.columns

# Visualization of features against price

## 1. Carat

In [None]:
# Distribution of carat values.
# NOTE: seaborn has deprecated distplot since v0.11 (histplot/displot replace it).
sns.distplot(df['carat'])

In [None]:
# Scatter of carat against price with a fitted regression line.
sns.regplot(data=df, x="carat", y="price")
plt.title(label="Carat V/s Price")

The graph shows that for diamonds with a carat value between 1 and 3 the price lies between 10000 and 20000, and that a carat value greater than 3 doesn't show much effect on price.

## 2. Cut

In [None]:
# Bar chart of how many diamonds fall into each cut category.
df['cut'].value_counts().plot.bar()
plt.xlabel(xlabel='Cut', labelpad=20)
plt.ylabel(ylabel='Value Counts', labelpad=20)
plt.title(label="Counts of categories of Cuts", pad=20)

The above bar graph shows that the Ideal cut is the most frequent among the diamonds.

In [None]:
# Price distribution for each cut category.
sns.boxplot(data=df, x="cut", y="price")
plt.title(label="Cut V/s Price")

The above box plot shows that cut doesn't have an effect on the price of the diamond.

## 3. Color

In [None]:
# Bar chart of how many diamonds fall into each color category.
df['color'].value_counts().plot.bar()
plt.xlabel(xlabel='Color', labelpad=20)
plt.ylabel(ylabel='Value Counts', labelpad=20)
plt.title(label="Counts of categories of Color", pad=20)

The bar plot shows that color category G is the most frequent among the diamonds.

In [None]:
# Price distribution for each color category.
sns.boxplot(data=df, x="color", y="price")
plt.title(label="Color V/s Price")

The box plot shows that color doesn't affect the price of a diamond much.

## 4. Clarity

In [None]:
# Bar chart of how many diamonds fall into each clarity category.
df['clarity'].value_counts().plot.bar()
plt.xlabel(xlabel='Clarity', labelpad=20)
plt.ylabel(ylabel='Value Counts', labelpad=20)
plt.title(label="Counts of categories of Clarity", pad=20)

The above bar plot shows that SI1 and VS2 are the most common clarity categories among the diamonds.

In [None]:
# Price distribution for each clarity category.
sns.boxplot(data=df, x="clarity", y="price")
plt.title(label="Clarity V/s Price")

The clarity attribute also doesn't have much effect on the price of a diamond.

## 5. Depth

In [None]:
# Distribution of depth values.
# NOTE: seaborn has deprecated distplot since v0.11 (histplot/displot replace it).
sns.distplot(df['depth'])

Almost every diamond's depth lies between 60 and 65.

In [None]:
# Scatter of depth against price.
sns.relplot(data=df, x="depth", y="price")

The above graph clearly shows that the depth range from 55 to 70 covers the whole price range.

## 6. Table

In [None]:
# Distribution of table values using 20 histogram bins.
# NOTE: seaborn has deprecated distplot since v0.11 (histplot/displot replace it).
sns.distplot(df['table'], bins=20)

In [None]:
# Scatter of table against price.
sns.relplot(data=df, x="table", y="price")
plt.title(label='Table V/s Price')

The graph shows that table values between 50 and 70 cover the whole price range.

## 7. x,y,z

In [None]:
# Distribution of the x dimension.
# NOTE: seaborn has deprecated distplot since v0.11 (histplot/displot replace it).
sns.distplot(df['x'])

In [None]:
# Scatter of the x dimension against price with a fitted regression line.
sns.regplot(data=df, x="x", y="price")

In [None]:
# Distribution of the y dimension.
# NOTE: seaborn has deprecated distplot since v0.11 (histplot/displot replace it).
sns.distplot(df['y'])

In [None]:
# Scatter of the y dimension against price with a fitted regression line.
sns.regplot(data=df, x="y", y="price")

In [None]:
# Distribution of the z dimension.
# NOTE: seaborn has deprecated distplot since v0.11 (histplot/displot replace it).
sns.distplot(df['z'])

In [None]:
# Scatter of the z dimension against price with a fitted regression line.
sns.regplot(data=df, x="z", y="price")

# Feature Engineering

In [None]:
# Work on an independent (deep) copy so the cleaned frame `df` stays untouched.
df_new = df.copy(deep=True)
df_new.head(n=2)

In [None]:
# (rows, columns) of the working copy before duplicates are removed.
df_new.shape

In [None]:
# Remove fully duplicated rows in place, keeping the first occurrence of
# each, and renumber the index 0..n-1; then report the new shape.
df_new.drop_duplicates(
    keep='first',
    ignore_index=True,
    inplace=True,
)
df_new.shape

In [None]:
# Preview the last rows of the de-duplicated frame.
df_new.tail()

## Applying Dummy variable

In [None]:
# One-hot encode the three categorical columns and append the resulting
# indicator columns to the right of the existing ones.
df_new = df_new.join(pd.get_dummies(df_new[['cut', 'color', 'clarity']]))
df_new.head(n=2)

In [None]:
# Drop the original categorical columns in place and preview two rows.
df_new.drop(columns=['cut', 'color', 'clarity'], inplace=True)
df_new.head(n=2)

## Applying MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with its default target range spelled out explicitly.
scale = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Rescale the numeric columns with the min-max scaler and wrap the result
# back into a DataFrame.
# NOTE(review): 'price' is the regression target and is scaled here too, and
# the scaler is fitted on all rows before any train/test split.
numeric_cols = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
scaled_df = pd.DataFrame(
    scale.fit_transform(df_new[numeric_cols]),
    columns=numeric_cols,
    # Reuse df_new's row labels so a later column assignment back into df_new
    # lines up row-for-row even if df_new's index is not a fresh 0..n-1 range.
    index=df_new.index,
)
scaled_df.tail()

In [None]:
# Remove the unscaled numeric columns in place and report the shape.
df_new.drop(
    columns=['carat', 'depth', 'table', 'price', 'x', 'y', 'z'],
    inplace=True,
)
df_new.shape

In [None]:
# Write the scaled values from scaled_df back into df_new under the same
# seven column names, then report the shape.
df_new[
    ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
] = scaled_df
df_new.shape

## Multiple Regression

In [None]:
# Absolute correlation of every column with price, paired with the column
# name and ordered from strongest to weakest.
corr = df_new.corr().loc['price']
corr = [abs(corr[name]) for name in df_new.columns]
corr_list = sorted(
    zip(corr, df_new.columns),
    key=lambda pair: pair[0],
    reverse=True,
)
corr_list

In [None]:
# Bar chart of the ranked absolute correlations, one bar per attribute.
corrs, labels = zip(*corr_list)
positions = np.arange(len(corr_list))
plt.figure(figsize=(15, 5))
plt.bar(positions, corrs)
plt.xlabel(xlabel="Attributes", labelpad=20)
plt.ylabel(ylabel="Correlation with Price", labelpad=20)
# Attribute names are long, so rotate the tick labels to keep them legible.
plt.xticks(positions, labels, rotation=90)
plt.title(label="Correlation of Attributes and Price", pad=20)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Ordinary least-squares model; the intercept is fitted (the default).
lm = LinearRegression(fit_intercept=True)

In [None]:
# Predictors: carat plus the three dimensions. Target: price.
x = df_new.loc[:, ['carat', 'x', 'y', 'z']]
y = df_new['price']

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the
# shuffle, and therefore every metric computed from this split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Fit the linear model on the training split.
lm.fit(X=x_train, y=y_train)

In [None]:
# Predicted prices for the held-out test rows.
yhat = lm.predict(X=x_test)

## Mean Squared Error

In [None]:
# Mean squared error of the predictions on the held-out test split.
# scikit-learn's signature is (y_true, y_pred); pass both by keyword so the
# ground truth and the predictions are in their documented roles.
mean_error = mean_squared_error(y_true=y_test, y_pred=yhat)
mean_error

## R2 score

In [None]:
# Coefficient of determination (R^2) of the fitted model on the test split.
r2 = lm.score(X=x_test, y=y_test)
r2

## Cross validation score

In [None]:
# 5-fold cross-validated score of the model, computed on the training split.
# Cross-validating on the held-out test split would both shrink the data to
# 20% of the rows and reuse the test rows for fitting; the training split
# keeps the test set untouched for the final evaluation.
cross_val = cross_val_score(lm, x_train, y_train, cv=5)
cross_val.mean()

In [None]:
# Overlay the kernel density estimates of the actual and the predicted test
# prices so the two distributions can be compared.
plt.figure(figsize=(10, 8))
# kdeplot draws the same density curve as the deprecated distplot(hist=False).
sns.kdeplot(y_test, label="Original Price")
sns.kdeplot(yhat, label="Predicted price")
# The curve labels only become visible once a legend is drawn.
plt.legend()
plt.title("Original vs Predicted Price (Multiple Linear Regression)")
plt.xlabel("Price")