In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the cubic zirconia dataset and display its (rows, columns) shape.
df = pd.read_csv(
    '../input/gemstone-price-prediction/cubic_zirconia.csv'
)
df.shape

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Discard the first column by position and keep every remaining column.
# NOTE(review): presumably the first column is a leftover row index from the
# CSV export -- confirm against the preview above.
# Caution: this cell is not idempotent. Each re-run without reloading `df`
# removes one more column from the left.
df=df.iloc[:, 1:]

In [None]:
# Build a per-column overview of the dataset: percentage of missing values,
# dtype, and the descriptive statistics from `describe`, one row per column,
# displayed with the most incomplete columns first.
df_missing = df.isna().mean().round(4) * 100
df_dtype = df.dtypes
df_desc = df.describe(include='all')

dtls = pd.concat(
    [
        df_missing.to_frame('missing'),
        df_dtype.to_frame('dtype'),
        df_desc.T,  # transpose so that each dataset column becomes a row
    ],
    axis=1,
)
dtls.sort_values(by='missing', ascending=False)

### Missing value Treatment

In [None]:
# Impute missing 'depth' values with the column mean.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df['depth']`: that form mutates an
# intermediate Series, which pandas deprecates and which leaves `df`
# unchanged once copy-on-write is active. Because warnings are silenced at
# the top of the notebook, that failure would go unnoticed.
df['depth'] = df['depth'].fillna(df['depth'].mean())

### Let's drop duplicates

In [None]:
# Report the frame's shape on either side of removing fully duplicated rows.
shape_before = df.shape
print(f'before removing duplicates{shape_before}')
df = df.drop_duplicates()
print(f'after removing duplicates{df.shape}')

### Univariate Analysis

In [None]:
# Distribution of the three categorical features, one panel each:
# pie chart for 'cut', count plot for 'color', bar chart for 'clarity'.
plt.rcParams['figure.figsize'] = 23, 5
fig, (ax_cut, ax_color, ax_clarity) = plt.subplots(1, 3)

# Compute each frequency table once and reuse it for values, labels and order.
cut_counts = df['cut'].value_counts()
color_counts = df['color'].value_counts()
clarity_counts = df['clarity'].value_counts()

# `explode` must have one entry per wedge, so size it from the data instead of
# hard-coding five entries; the last (least frequent) wedge is pulled out further.
explode = [0.05] * len(cut_counts)
if explode:
    explode[-1] = 0.2

ax_cut.pie(cut_counts.values,
           labels=cut_counts.index,
           startangle=90,
           colors=['gold', 'lightgreen', 'red', 'lightblue', 'pink'],
           explode=explode,
           shadow=True, autopct='%1.2f%%')

# Pass the variable as `x=`: seaborn >= 0.12 only accepts `data` positionally,
# so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=df['color'], palette='ocean', order=color_counts.index, ax=ax_color)

clarity_counts.plot.bar(ax=ax_clarity)
ax_clarity.set_xlabel('clarity types')
ax_clarity.set_ylabel('count')

fig.suptitle('Distribution of Cut, Color and Clarity of Gems')
plt.show()

In [None]:
# Histogram + KDE of depth, carat and price, one panel each.
# `sns.distplot` is deprecated (since seaborn 0.11) and slated for removal;
# `histplot(kde=True, stat='density')` is its documented replacement and keeps
# the density-scaled histogram with an overlaid KDE curve.
plt.rcParams['figure.figsize'] = 20, 5
fig, axes = plt.subplots(1, 3)

panels = (('depth', 'orange'), ('carat', 'red'), ('price', 'black'))
for ax, (column, colour) in zip(axes, panels):
    sns.histplot(x=df[column], color=colour, kde=True, stat='density', ax=ax)

fig.suptitle('Distribution of depth, carat, and Price')
plt.show()

In [None]:
# Horizontal box plots of the three gem dimensions (x, y, z) to expose outliers.
# The Series is passed as `x=`: seaborn >= 0.12 treats a positional argument as
# `data`, which would change how the plot is drawn.
plt.rcParams['figure.figsize'] = 20, 5
fig, axes = plt.subplots(1, 3)

panels = (('x', 'orange'), ('y', 'red'), ('z', 'black'))
for ax, (column, colour) in zip(axes, panels):
    sns.boxplot(x=df[column], color=colour, ax=ax)

fig.suptitle('Distribution of Length of the cubic zirconia (x,y,z)')
plt.show()

In [None]:
import plotly.express as px

### Bivariate Analysis

In [None]:
# Scatter of carat against price.
# x and y are passed by keyword: from seaborn 0.12 onward `scatterplot` accepts
# only `data` positionally, so two positional Series raise a TypeError.
sns.scatterplot(x=df['carat'], y=df['price'])
plt.title('carat vs price')
plt.show()

As carat increases, price also increases.

In [None]:
# Scatter of each gem dimension (x, y, z) against price, one panel per dimension.
# x and y are passed by keyword: from seaborn 0.12 onward `scatterplot` accepts
# only `data` positionally, so two positional Series raise a TypeError.
plt.rcParams['figure.figsize'] = 20, 5
fig, axes = plt.subplots(1, 3)

for ax, column in zip(axes, ('x', 'y', 'z')):
    sns.scatterplot(x=df[column], y=df['price'], color='orange', ax=ax)

fig.suptitle('price and length of gem')
plt.show()

In [None]:
# A few rows record a zero for one of the gem dimensions. They could be dropped
# or replaced with a sensible value; they are dropped here because (per the
# original note) losing about 9 of roughly 27000 rows barely affects the data.
has_zero_dim = df[['x', 'y', 'z']].eq(0).any(axis=1)
df = df[~has_zero_dim]

In [None]:
# 'y' and 'z' contain a few extreme outliers: replace any y above 50 and any
# z above 30 with that column's most frequent value (its mode).
#
# The replacement is done with `Series.mask` and assigned back through
# `DataFrame.assign` instead of writing into `df[col].values`: mutating the
# array returned by `.values` only works while it happens to be a writable view
# of the frame's data, and it is read-only under pandas copy-on-write.
y_mode = df['y'].mode()[0]
z_mode = df['z'].mode()[0]
df = df.assign(
    y=df['y'].mask(df['y'] > 50, y_mode),
    z=df['z'].mask(df['z'] > 30, z_mode),
)

In [None]:
# Price distribution for each level of cut, color and clarity, one panel each.
# x and y are passed by keyword: from seaborn 0.12 onward `boxplot` accepts only
# `data` positionally, so two positional Series raise a TypeError.
fig, axes = plt.subplots(1, 3)

for ax, column in zip(axes, ('cut', 'color', 'clarity')):
    sns.boxplot(x=df[column], y=df['price'], ax=ax)

fig.suptitle('price by cut, color and clarity')
# Render the figure explicitly so the cell does not echo the last Axes repr.
plt.show()

In [None]:
# Minimum, mean and maximum price for each cut, shaded with a colour gradient.
(
    df.groupby('cut')[['price']]
    .agg(['min', 'mean', 'max'])
    .style.background_gradient(cmap='Wistia')
)

In [None]:
# Minimum, mean and maximum price for each color, shaded with a colour gradient.
(
    df.groupby('color')[['price']]
    .agg(['min', 'mean', 'max'])
    .style.background_gradient(cmap='Wistia')
)

In [None]:
# Minimum, mean and maximum price for each clarity, shaded with a colour gradient.
(
    df.groupby('clarity')[['price']]
    .agg(['min', 'mean', 'max'])
    .style.background_gradient(cmap='Wistia')
)

### Data Preprocessing
* As per the dataset description, let's encode `cut` as ordered integers (Fair = 1 … Ideal = 5).

In [None]:
# Encode the cut labels as the integers 1-5 (Fair -> 1 ... Ideal -> 5).
# Values that are not one of the five labels are left unchanged by `replace`.
cut_codes = {
    'Fair': 1,
    'Good': 2,
    'Very Good': 3,
    'Premium': 4,
    'Ideal': 5,
}
df['cut'] = df['cut'].replace(cut_codes)


In [None]:
# One-hot encode 'color' and 'clarity'. The first level of each is dropped so
# the remaining indicator columns are not perfectly collinear.
colors_dummies, clarity_dummies = (
    pd.get_dummies(df[column], drop_first=True)
    for column in ('color', 'clarity')
)

In [None]:
# Place the color and clarity indicator columns side by side in one frame.
dummies = pd.concat(
    objs=(colors_dummies, clarity_dummies),
    axis='columns',
)

In [None]:
# Append the indicator columns to the right of the main frame.
df = pd.concat(
    objs=(df, dummies),
    axis='columns',
)

In [None]:
# The original categorical columns are now redundant with their indicators.
df = df.drop(columns=['color', 'clarity'])

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
sns.heatmap(
    data=df.corr(),
    annot=True,
)

In [None]:
# Remove the three dimension columns before modelling.
df = df.drop(columns=['x', 'y', 'z'])

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target ('price') from the feature matrix (every other column).
y = df['price']
x = df.drop(columns='price')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=10,
    test_size=0.2,
)

In [None]:
# Random forest regressor with default hyper-parameters.
# A fixed `random_state` (the same seed as the train/test split) makes the
# bootstrap sampling and feature selection deterministic, so the score
# reported below is reproducible across Restart & Run All.
model = RandomForestRegressor(random_state=10)

In [None]:
# Fit the forest on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the held-out split; for a regressor `score` returns R^2.
model.score(X=X_test, y=y_test)