In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the diamonds CSV into a DataFrame
csv_path = '../input/diamonds/diamonds.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows to see what the data looks like
df.head()

In [None]:
# (rows, columns) of the frame as loaded
df.shape

In [None]:
# Column dtypes and non-null counts
df.info()

Some columns (`cut`, `color`, `clarity`) contain strings, which we will need to convert to numeric values before modelling.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of each numerical attribute
df.describe()

In [None]:
# Number of missing values in each column
df.isna().sum()

The `Unnamed: 0` column is unnecessary and needs to be dropped.

In [None]:
# Drop the leftover index column. Unlike `del df[...]`, this cell can be
# re-run safely: errors='ignore' makes it a no-op once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Correlation analysis
# Correlation is only defined for numeric columns, so select them explicitly:
# pandas >= 2.0 raises on the string columns instead of silently dropping them.
plt.figure(figsize=(15,10))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')

In [None]:
# Pairwise relationships between the columns of the frame
sns.pairplot(data=df)

In [None]:
# The diamond cut categories and how many rows fall into each
df['cut'].value_counts()

In [None]:
# Bar chart of the number of diamonds per cut category
sns.countplot(data=df, x='cut')

Cut vs Price

In [None]:
# Price distribution per cut category.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form fails there.
sns.boxplot(x='cut', y='price', data=df)

In [None]:
# The diamond color categories and how many rows fall into each
df['color'].value_counts()

In [None]:
# Bar chart of the number of diamonds per color category
sns.countplot(data=df, x='color')

Color vs Price

In [None]:
# Price distribution per color category.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form fails there.
sns.boxplot(x='color', y='price', data=df)

In [None]:
# The diamond clarity categories and how many rows fall into each
df['clarity'].value_counts()

In [None]:
# Bar chart of the number of diamonds per clarity category
sns.countplot(data=df, x='clarity')

Clarity vs Price

In [None]:
# Price distribution per clarity category.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form fails there.
sns.violinplot(x='clarity', y='price', data=df)

In [None]:
# One 50-bin histogram per numeric column
df.hist(figsize=(15, 10), bins=50)
plt.show()

The minimum values of `x`, `y` and `z` are 0, which is not possible: according to the data description, these columns are the length, width and depth of the diamond.

In [None]:
# Count the zero entries of all three dimension columns in one pass
zero_counts = (df[['x', 'y', 'z']] == 0).sum()
print("x == 0 : {}".format(zero_counts['x']))
print("y == 0 : {}".format(zero_counts['y']))
print("z == 0 : {}".format(zero_counts['z']))

In [None]:
# Rows where at least one of x, y, z is exactly zero
df.loc[(df[['x', 'y', 'z']] == 0).any(axis=1)]

A zero in any of these columns means the measurement is missing, so we replace the zeros with NaN.

In [None]:
# Mark zero dimensions as missing so they can be handled like any other NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df[['x','y','z']] = df[['x','y','z']].replace(0, np.nan)

In [None]:
# Missing values per column now that the zero dimensions are NaN
df.isna().sum()

In [None]:
# Discard every row that still contains at least one missing value
df = df.dropna(how='any')

In [None]:
# (rows, columns) after dropping the rows with missing values
df.shape

A quick visual check that no missing values remain:

In [None]:
# missingno is imported here, at its point of use
import missingno as msno
# Draw the nullity matrix to visually confirm that no missing values remain
msno.matrix(df)

In [None]:
# One-hot encode the string-valued columns; numeric columns pass through unchanged.
# Both names end up bound to the same encoded frame.
df = one_hot_encoder = pd.get_dummies(df)

In [None]:
# Column dtypes after one-hot encoding
df.dtypes

In [None]:
# Preview the encoded frame
df.head()

In [None]:
# Target is the price column; every remaining column is a feature
y = df['price']
X = df.drop(columns=['price'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Shapes of the four splits, in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted on the training split
regressor = LinearRegression()
# fit() returns the estimator itself, so training and prediction can be chained
y_pred = regressor.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the linear-regression predictions on the held-out test split
lr_mae, lr_mse, lr_r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print('Linear Regression')
for label, score in (
    ('Mean Absolute Error:', lr_mae),
    ('Mean Squared Error:', lr_mse),
    ('R Squared :', lr_r2),
):
    print(label, score)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 20 trees; the fixed seed keeps the fit reproducible.
# This rebinds `regressor` and `y_pred` from the linear-regression cell.
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
# fit() returns the estimator itself, so training and prediction can be chained
y_pred = regressor.fit(X_train, y_train).predict(X_test)

In [None]:
# Score the random-forest predictions on the held-out test split
rf_mae, rf_mse, rf_r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print('Random Forest Regressor')
for label, score in (
    ('Mean Absolute Error:', rf_mae),
    ('Mean Squared Error:', rf_mse),
    ('R Squared :', rf_r2),
):
    print(label, score)

If you find this notebook useful, **PLEASE UPVOTE!!**