Context
This classic dataset contains the prices and other attributes of almost 54,000 diamonds. It's a great dataset for beginners learning to work with data analysis and visualization.

Content
- **price**: price in US dollars (\$326--\$18,823)

- **carat**: weight of the diamond (0.2--5.01)

- **cut**: quality of the cut (Fair, Good, Very Good, Premium, Ideal)

- **color**: diamond colour, from J (worst) to D (best)

- **clarity**: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

- **x**: length in mm (0--10.74)

- **y**: width in mm (0--58.9)

- **z**: depth in mm (0--31.8)

- **depth**: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

- **table**: width of the top of the diamond relative to its widest point (43--95)

In [3]:
%pwd

'c:\\Users\\hp\\Desktop\\Python Programs\\New folder\\Python Object and Data Structure Basics\\ML Projects with Live class\\Diamond_Price_Prediction\\Diamond_Prediction\\notebook'

In [6]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whichever `pip` is first on the shell PATH.
%pip install pandas



In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

AttributeError: partially initialized module 'pandas' has no attribute 'core' (most likely due to a circular import)

In [None]:
# Load the gemstone records from the CSV located in the current working directory.
df = pd.read_csv("gemstone.csv")
# Preview the first five rows to sanity-check the parse.
df.head(n=5)

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Separating numerical and categorical data.
# A column is treated as numerical when its dtype is anything other than 'object'.
is_numeric_dtype = df.dtypes != 'object'
numerical = df.columns[is_numeric_dtype]

In [None]:
numerical

In [None]:
# Categorical columns are the ones stored with the 'object' dtype.
is_object_dtype = df.dtypes == 'object'
category = df.columns[is_object_dtype]
category
                      

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
df['cut'].value_counts()

In [None]:
df['color'].value_counts()

In [None]:
df['clarity'].value_counts()

In [None]:
# Plot the distribution (histogram + KDE) of every numerical column, one figure per column.
for column in numerical:
    # Create the figure inside the loop: with the notebook inline backend plt.show()
    # finishes the current figure, so a single plt.figure() call placed before the loop
    # would apply the requested size to the first plot only.
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df, x=column, kde=True)
    plt.show()

In [None]:
# Plot the frequency of each level for every categorical column, one figure per column.
for column in category:
    # Create the figure inside the loop: with the notebook inline backend plt.show()
    # finishes the current figure, so a single plt.figure() call placed before the loop
    # would apply the requested size to the first plot only.
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df, x=column)
    plt.show()


In [None]:
# Pairwise correlations between the numerical columns, drawn as an annotated heatmap.
plt.figure(figsize=(12, 8))
corr_matrix = df[numerical].corr()
sns.heatmap(corr_matrix, annot=True, cmap='rainbow', fmt='0.2', cbar=True)

In [None]:
# Price frequency: distribution of the target column with a KDE overlay.
plt.figure(figsize=(12, 5))
price_ax = sns.histplot(data=df, x="price", kde=True)
price_ax.set_title("Target Distribution")
plt.show()

In [None]:
# Ordinal codes for cut quality, numbered from 1 in the order the grades are
# listed in the dataset description (Fair ... Ideal).
cut_grades = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
cut_map = {grade: rank for rank, grade in enumerate(cut_grades, start=1)}

In [None]:
# Ordinal codes for clarity, numbered from 1 (I1, worst) up to 8 (IF, best).
clarity_grades = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]
clarity_map = {grade: rank for rank, grade in enumerate(clarity_grades, start=1)}

In [None]:
# Ordinal codes for colour: D (best) -> 1 ... J (worst) -> 7.
# Note the direction: here the best grade gets the smallest code, whereas the
# clarity codes give the best grade the largest code.
color_grades = ["D", "E", "F", "G", "H", "I", "J"]
color_map = {grade: rank for rank, grade in enumerate(color_grades, start=1)}

In [None]:
# Replace each categorical label with its ordinal code, column by column.
# Labels missing from a mapping become NaN, so running this cell a second time
# on already-encoded columns would blank them out.
for column, mapping in (("cut", cut_map), ("clarity", clarity_map), ("color", color_map)):
    df[column] = df[column].map(mapping)

In [None]:
df.shape

In [None]:
df.head(5)

In [None]:
df.columns

In [None]:
# Remove the clarity / color / cut columns from the frame.
# Reassign instead of mutating in place and ignore labels that are already gone,
# so re-running this cell does not raise KeyError.
# NOTE(review): these three columns were ordinal-encoded in the preceding cells and
# are discarded here, so the encoded values never reach the model — confirm this is intended.
df = df.drop(columns=['clarity', 'color', 'cut'], errors='ignore')

In [None]:
df.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Features are every column except the target; the target is kept as a
# one-column DataFrame (double brackets) rather than a Series.
target_column = 'price'
X = df.drop(columns=[target_column])
Y = df[[target_column]]

In [None]:
# Partition the feature columns by dtype: 'object' columns versus everything else.
object_dtype = ['object']
categorical_cols = X.select_dtypes(include=object_dtype).columns
numerical_cols = X.select_dtypes(exclude=object_dtype).columns

In [None]:
categorical_cols

In [None]:
numerical_cols

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Numerical preprocessing: fill missing values with the column median,
# then standardise with StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [None]:
# Route the numerical feature columns through the numerical pipeline.
column_transformers = [('num_pipeline', num_pipeline, numerical_cols)]
preprocessor = ColumnTransformer(column_transformers)
preprocessor

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.30, random_state=30)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit the preprocessor on the training features only, then apply the fitted
# transformation to both splits and wrap the results back into DataFrames.
train_transformed = preprocessor.fit_transform(X_train)
train_feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_transformed, columns=train_feature_names)

test_transformed = preprocessor.transform(X_test)
test_feature_names = preprocessor.get_feature_names_out()
X_test = pd.DataFrame(test_transformed, columns=test_feature_names)

In [None]:
X_train.head()

In [None]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [None]:
# Baseline model: a LinearRegression fitted on the preprocessed training features.
regression=LinearRegression()
# The fit call is the cell's last expression, so its return value is what the cell displays.
regression.fit(X_train,y_train)

In [None]:
regression.coef_

In [None]:
regression.intercept_

In [None]:
import numpy as np
def evaluate_model(true, predicted):
    """Score regression predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: the mean absolute error, the square root of
        the mean squared error, and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it instead of calling
    # mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
# Candidate regressors to compare, keyed by the display name printed during evaluation.
# All four are constructed with their default hyper-parameters.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
# Accumulators filled by the evaluation loop in the next cell.
# NOTE(review): trained_model_list is never appended to in the visible notebook — confirm whether it is still needed.
trained_model_list=[]
model_list=[]
r2_list=[]

In [None]:
# Fit every candidate regressor on the training split and score it on the held-out
# test split. Iterating over models.items() avoids rebuilding the key and value
# lists on every pass just to index into them.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)
    print((y_pred.mean()))

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # The metrics are computed from X_test / y_test, so they are labelled as
    # test performance rather than training performance.
    print('Model Test Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square * 100)

    r2_list.append(r2_square)

    print('=' * 35)
    print('\n')

In [None]:
y_pred

In [None]:
model_list