In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import warnings

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Diamond Price Prediction
![](https://news.mit.edu/sites/default/files/styles/news_article__image_gallery/public/images/202010/MIT-Metallic-Diamond-01-Press_0.jpg?itok=386hZmMI)

# Read Dataset

In [None]:
# Read the diamonds CSV from the Kaggle input directory into a DataFrame.
diamonds_csv = '../input/diamonds/diamonds.csv'
df = pd.read_csv(diamonds_csv)

In [None]:
# Show the first five rows of the raw frame.
df.head(n=5)

# Feature Details
1. price price in US dollars (\$326--\$18,823)

2. carat weight of the diamond (0.2--5.01)

3. cut quality of the cut (Fair, Good, Very Good, Premium, Ideal)

4. color diamond colour, from J (worst) to D (best)

5. clarity a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

6. x length in mm (0--10.74)

7. y width in mm (0--58.9)

8. z depth in mm (0--31.8)

9. depth total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

10. table width of top of diamond relative to widest point (43--95)

# Data Types

In [None]:
# 'Unnamed: 0' is not a real feature, so drop it.
# errors='ignore' keeps this cell safe to re-run once the column is already gone
# (an in-place drop would raise KeyError on the second run).
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Keep only rows whose x, y and z dimensions are all non-zero.
df = df[(df[['x', 'y', 'z']] != 0).all(axis=1)]

In [None]:
# Print a per-column summary of df (dtype and non-null count).
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

# Exploratory Data Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

**The target variable is "price"**, so let us check the relationship of price with the other variables.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Shared colour palette used by the charts in this notebook.
colors = ['#003f5c', '#2f4b7c', '#665191', '#a05195', '#d45087', '#f95d6a', '#ff7c43', '#ffa600']

# Use matplotlib's generic family names ('sans-serif', 'serif') so the font
# lookup always resolves instead of falling back with "findfont" messages.
sns.set(
    palette=colors,
    font='sans-serif',
    style='white',
    rc={'axes.facecolor': 'whitesmoke', 'figure.facecolor': 'whitesmoke'},
)

# Preview the palette. despine() acts on the current figure, so it is called
# after palplot() has created one; calling it first would open an empty figure.
sns.palplot(colors)
sns.despine(left=False, right=False)
plt.title("Theme for EDA", family='serif', size=15, weight=50)

In [None]:
# Numeric feature columns (every non-object column) without the target.
int_cols = df.select_dtypes(exclude='object').columns.to_list()
int_cols.remove('price')

fig = plt.figure(figsize=(15, 10), constrained_layout=True)
fig.suptitle("Regression of the Numeric variables", family='serif', size=20, weight='bold')

# One panel per numeric feature on a 3x3 grid: scatter of the feature against
# price with a fitted regression line drawn in a contrasting colour.
for j, i in enumerate(int_cols):
    ax = fig.add_subplot(3, 3, j + 1)
    sns.regplot(data=df, x=i, y='price', color=colors[1], line_kws={'color': '#ffa600'}, ax=ax)
    ax.set_title("Price and {} comparison analysis".format(i), family='serif')
    # Hide the frame around each panel.
    for s in ['left', 'right', 'top', 'bottom']:
        ax.spines[s].set_visible(False)

    

**The charts above show the linear relationship of each numeric feature with the target variable; however, there are outliers.**

In [None]:
# Distribution (kernel density estimate) of every numeric variable, target included.
int_cols = df.select_dtypes(exclude='object').columns.to_list()

fig = plt.figure(figsize=(15, 10), constrained_layout=True)
fig.suptitle("Distribution of the Numeric variables", family='serif', size=20, weight='bold')

# One filled density curve per numeric column on a 3x3 grid.
for j, i in enumerate(int_cols):
    ax = fig.add_subplot(3, 3, j + 1)
    sns.kdeplot(data=df, x=i, color=colors[0], fill=True, edgecolor=colors[-1], alpha=1, ax=ax)
    ax.set_title("Distribution of Numeric variables - {}".format(i), family='serif')
    # Hide the frame around each panel.
    for s in ['left', 'right', 'top', 'bottom']:
        ax.spines[s].set_visible(False)


In [None]:
# Recompute the numeric columns here so this cell does not depend on a
# variable left behind by an earlier cell.
int_cols = df.select_dtypes(exclude='object').columns.to_list()

fig = plt.figure(figsize=(15, 10))
fig.suptitle("Box plot for Numeric variables", family='serif', size=20, weight='bold')

# One horizontal box plot per numeric column on a 3x3 grid.
for j, i in enumerate(int_cols):
    ax = fig.add_subplot(3, 3, j + 1)
    sns.boxplot(data=df, x=i, color=colors[0], ax=ax)
    ax.set_title("Box plot for {}".format(i))
    for s in ['left', 'right', 'top', 'bottom']:
        ax.spines[s].set_visible(False)

# Use the next free grid slot for a short written takeaway.
ax = fig.add_subplot(3, 3, len(int_cols) + 1)
ax.text(x=0, y=0.5, s='Obviously there are outliers in the data, we need to examine the outliers to verify if it is extreme value or data error')
for s in ['left', 'right', 'top', 'bottom']:
    ax.spines[s].set_visible(False)

In [None]:
# Pairwise correlation between the numeric columns (price included).
# The numeric columns are selected explicitly: on pandas >= 2.0, DataFrame.corr()
# raises on string columns instead of silently dropping them.
numeric_df = df.select_dtypes(exclude='object')

fig = plt.figure(figsize=(15, 8))
sns.heatmap(numeric_df.corr(), linewidths=3, annot=True)
plt.title("Correlation matrix", family='serif', size=20, weight='bold')

There are clearly outliers in the data; we need to examine them to verify whether they are genuine extreme values or data errors.

In [None]:
# Compare the target (price) with each categorical feature.
cat_cols = df.select_dtypes(include='object').columns.to_list()

fig = plt.figure(figsize=(15, 10), constrained_layout=True)
fig.suptitle("Categorical feature comparison with Price", family='serif', size=20, weight='bold')

# Layout: a large panel 'A' on the left for the price density, and three
# stacked panels 'B', 'C', 'D' on the right, one per categorical feature.
axes = fig.subplot_mosaic("""
    AAB
    AAC
    AAD
    """)

sns.kdeplot(df['price'], fill=True, edgecolor=colors[-1], linewidth=2, color=colors[0], ax=axes['A'], alpha=0.8)
axes['A'].text(x=2000, y=0.00025, s="Target Feature Price is not normally distributed", family='sans-serif', fontweight='bold')
axes['A'].text(x=2000, y=0.00023, s="Comparing Price with Categorical feature we can see the Median is more or less same", family='sans-serif', fontweight='bold')

# Price box plot per level of each of the first three categorical columns.
for panel, column in zip('BCD', cat_cols):
    sns.boxplot(data=df, x=column, y='price', ax=axes[panel])

# Hide the frame around every panel.
for panel in 'ABCD':
    for s in ['left', 'right', 'top', 'bottom']:
        axes[panel].spines[s].set_visible(False)

The price feature is not normally distributed.

In [None]:
# Frequency of each level for the first three categorical columns.
cat_cols = df.select_dtypes(include='object').columns.to_list()

fig = plt.figure(figsize=(15, 5))
fig.suptitle("Distribution of Categorical variable", family='serif', size=20, weight='bold')

# Three side-by-side count plots; the second and third share the y-axis of
# the first so the bar heights are directly comparable across panels.
first_ax = None
for k, column in enumerate(cat_cols[:3]):
    ax = fig.add_subplot(1, 3, k + 1, sharey=first_ax)
    sns.countplot(data=df, x=column, ax=ax, linewidth=2, edgecolor=colors[-1])
    for s in ['left', 'right', 'top', 'bottom']:
        ax.spines[s].set_visible(False)
    if first_ax is None:
        first_ax = ax

# Statistical Analysis

In [None]:
import statsmodels.api as stats
from statsmodels.stats.anova import anova_lm
from   statsmodels.formula.api import ols

# Hypothesis Testing
**Compare the price across the levels of each categorical feature and check whether the mean price differs significantly**
1. H0 = there is no significant difference in mean price between the levels
2. H1 = there is a significant difference in mean price between at least two levels

In [None]:
# ANOVA of price on clarity: fit an OLS model with clarity as a categorical
# term, then print the type-II ANOVA table and the full model summary.
formula = 'price ~ C(clarity)'
model = ols(formula, df).fit()
print(np.round(anova_lm(model, typ=2), 3))
print(model.summary())

# Compare the unrounded p-value with the 5% level. Rounding to two decimals
# first can turn a p-value just under 0.05 (e.g. 0.049) into 0.05 and wrongly
# fail to reject.
if model.f_pvalue < 0.05:
    print("Reject Null Hypothesis and accept the alternate hypothesis")
else:
    print("Accept the Null Hypothesis")

In [None]:
# ANOVA of price on color: fit an OLS model with color as a categorical
# term, then print the type-II ANOVA table and the full model summary.
formula = 'price ~ C(color)'
model = ols(formula, df).fit()
print(np.round(anova_lm(model, typ=2), 3))
print(model.summary())

# Compare the unrounded p-value with the 5% level. Rounding to two decimals
# first can turn a p-value just under 0.05 (e.g. 0.049) into 0.05 and wrongly
# fail to reject.
if model.f_pvalue < 0.05:
    print("Reject Null Hypothesis and accept the alternate hypothesis")
else:
    print("Accept the Null Hypothesis")

In [None]:
# ANOVA of price on cut: fit an OLS model with cut as a categorical
# term, then print the type-II ANOVA table and the full model summary.
formula = 'price ~ C(cut)'
model = ols(formula, df).fit()
print(np.round(anova_lm(model, typ=2), 3))
print(model.summary())

# Compare the unrounded p-value with the 5% level. Rounding to two decimals
# first can turn a p-value just under 0.05 (e.g. 0.049) into 0.05 and wrongly
# fail to reject.
if model.f_pvalue < 0.05:
    print("Reject Null Hypothesis and accept the alternate hypothesis")
else:
    print("Accept the Null Hypothesis")

In [None]:
# ANOVA of price on cut, color and clarity together: fit an OLS model with all
# three as categorical terms, then print the type-II ANOVA table and the summary.
formula = 'price ~ C(cut)+C(color)+C(clarity)'
model = ols(formula, df).fit()
print(np.round(anova_lm(model, typ=2), 3))
print(model.summary())

# Compare the unrounded overall F-test p-value with the 5% level. Rounding to
# two decimals first can turn a p-value just under 0.05 (e.g. 0.049) into 0.05
# and wrongly fail to reject.
if model.f_pvalue < 0.05:
    print("Reject Null Hypothesis and accept the alternate hypothesis")
else:
    print("Accept the Null Hypothesis")

# Conclusion on Hypothesis testing
The cut, clarity and color of a diamond each have a statistically significant effect on its price ***(95% confidence level)***.

# Outlier handling

In [None]:
import scipy.stats as st

# The IQR rule only makes sense for numeric columns. Select them explicitly:
# on pandas >= 2.0, quantile() and the </> comparisons raise on string columns.
numeric_cols = df.select_dtypes(exclude='object').columns
numeric_df = df[numeric_cols]

# Per-column quartiles and inter-quartile range.
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier if ANY numeric value lies more than 1.5 * IQR outside
# the [Q1, Q3] band of its column; keep the remaining rows.
is_outlier = ((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).any(axis=1)
df_clean = df[~is_outlier]

In [None]:
# Box plots of the numeric columns again, now on the outlier-filtered frame.
int_cols = df_clean.select_dtypes(exclude='object').columns.to_list()

fig = plt.figure(figsize=(15, 10))
fig.suptitle("Box plot for Numeric variables after Outlier removal", family='serif', size=20, weight='bold')

# One horizontal box plot per numeric column on a 3x3 grid.
for j, i in enumerate(int_cols):
    ax = fig.add_subplot(3, 3, j + 1)
    sns.boxplot(data=df_clean, x=i, color=colors[0], ax=ax)
    ax.set_title("Box plot for {}".format(i))
    for s in ['left', 'right', 'top', 'bottom']:
        ax.spines[s].set_visible(False)

# Use the next free grid slot for a short written takeaway.
ax = fig.add_subplot(3, 3, len(int_cols) + 1)
ax.text(x=0, y=0.5, s='Outliers are handled with IQR method')
for s in ['left', 'right', 'top', 'bottom']:
    ax.spines[s].set_visible(False)

# One hot encoding for Categorical variables

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each feature from the dummy columns.
df1 = pd.get_dummies(
    data=df_clean,
    columns=cat_cols,
    drop_first=True,
)

In [None]:
# Show the first five rows of the encoded frame.
df1.head(n=5)

# Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Target is price; features are every other column of the encoded frame.
y = df1['price']
X = df1.drop(columns='price')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Polynomial (interaction) feature expansion

In [None]:
from sklearn.preprocessing import StandardScaler, Normalizer, PolynomialFeatures

# Add degree-2 interaction terms, since the relationship with price is not
# purely linear. Despite its name, `scaler` expands features; it does not scale.
# NOTE: X_train / X_test are overwritten, so re-running this cell without
# re-running the split would expand the already-expanded features again.
scaler = PolynomialFeatures(interaction_only=True, degree=2)
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

# Model Creation

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the expanded training features and predict the
# price for the held-out test features.
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)

# Emit two blank lines of output.
print(end="\n\n")


In [None]:
from sklearn.metrics import mean_squared_error

# Residuals and headline metrics, computed once up front.
residual = y_test - pred
# Take the square root explicitly: the `squared=False` shortcut of
# mean_squared_error is deprecated and removed in recent scikit-learn.
rmse = np.sqrt(mean_squared_error(y_test, pred))
# LinearRegression.score returns the coefficient of determination (R^2).
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)

fig = plt.figure(figsize=(15, 8))
fig.suptitle("Comparing y_test and Predicted value", family='serif', size=20, weight='bold')

# Three stacked full-width panels: A = residuals vs actual price,
# B = residual density, C = text summary of the metrics.
axes = fig.subplot_mosaic("""
    AA
    BB
    CC
    """)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=y_test, y=residual, ax=axes['A'])
axes['A'].axhline(y=0, ls='--', c=colors[-1], linewidth=3)
sns.kdeplot(x=residual, ax=axes['B'], fill=True, color=colors[0], edgecolor=colors[-1], linewidth=2)

# Text summary, written from the top of the panel downwards.
summary_lines = [
    (0.8, "Result:"),
    (0.6, "R-squared of model with Test data: {}".format(np.round(test_r2, 2))),
    (0.4, "R-squared of model with Train data: {}".format(np.round(train_r2, 2))),
    (0.2, "Root mean squared error: {}".format(np.round(rmse, 2))),
]
for y_pos, line in summary_lines:
    axes['C'].text(x=0.2, y=y_pos, s=line, ha='left', family='cursive', weight='bold', size=15, style='italic')
axes['C'].axis('off')

# Hide the frame around every panel.
for panel in 'ABC':
    for s in ['left', 'right', 'top', 'bottom']:
        axes[panel].spines[s].set_visible(False)

# Conclusion
The model predicts the price of a diamond with an R² score of about 0.98 and a root mean squared error of 462.62.

**Please review and provide your inputs <br>
Best wishes**