In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("/opt/conda/envs/notebooks/data/gemstone.csv")

In [2]:
data.drop(labels="id",axis=1)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [3]:
X = data.drop(labels="price", axis=1)
Y = data["price"]

In [5]:
categorical_columns = X.select_dtypes(include="object").columns
numerical_columns = X.select_dtypes(exclude="object").columns

In [8]:
print(X)
print(Y)
print(categorical_columns)
print(numerical_columns)

            id  carat        cut color clarity  depth  table     x     y     z
0            0   1.52    Premium     F     VS2   62.2   58.0  7.27  7.33  4.55
1            1   2.03  Very Good     J     SI2   62.0   58.0  8.06  8.12  5.05
2            2   0.70      Ideal     G     VS1   61.2   57.0  5.69  5.73  3.50
3            3   0.32      Ideal     G     VS1   61.6   56.0  4.38  4.41  2.71
4            4   1.70    Premium     G     VS2   62.6   59.0  7.65  7.61  4.77
...        ...    ...        ...   ...     ...    ...    ...   ...   ...   ...
193568  193568   0.31      Ideal     D    VVS2   61.1   56.0  4.35  4.39  2.67
193569  193569   0.70    Premium     G    VVS2   60.3   58.0  5.75  5.77  3.47
193570  193570   0.73  Very Good     F     SI1   63.1   57.0  5.72  5.75  3.62
193571  193571   0.34  Very Good     D     SI1   62.9   55.0  4.45  4.49  2.81
193572  193572   0.71       Good     E     SI2   60.8   64.0  5.73  5.71  3.48

[193573 rows x 10 columns]
0         13619
1       

In [6]:
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [7]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
num_pipeline=Pipeline(
    
    
    steps=[
        
        ('imputer',SimpleImputer()),
        ('scaler', StandardScaler())
    ]
    
    
)
    

In [10]:
cat_pipeline=Pipeline(
    
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))
    ]
    
)

In [11]:
preprocessor=ColumnTransformer(
    [
        
        ('num_pipeline',num_pipeline,numerical_columns),
        ('cat_pipeline',cat_pipeline,categorical_columns)
    ]
)

In [20]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [13]:
preprocessor.fit_transform(X_train)

array([[ 1.27491908, -0.97543926, -0.84960654, ...,  4.        ,
         5.        ,  5.        ],
       [-1.10290693,  0.2351953 ,  1.83363716, ...,  1.        ,
         1.        ,  2.        ],
       [-0.99761317,  0.49461699,  0.81585507, ...,  3.        ,
         3.        ,  4.        ],
       ...,
       [ 0.03428   ,  0.45138004,  1.55606023, ...,  1.        ,
         3.        ,  2.        ],
       [-0.87327749,  0.66756478, -1.77486298, ...,  4.        ,
         3.        ,  4.        ],
       [ 1.3030439 ,  0.25681377,  0.81585507, ...,  4.        ,
         3.        ,  2.        ]])

In [14]:
preprocessor.transform(X_test)

array([[-0.47300385, -0.56468825, -0.94213218, ...,  3.        ,
         1.        ,  3.        ],
       [-0.57300521, -0.17555571,  1.00090636, ...,  2.        ,
         4.        ,  2.        ],
       [ 1.58109167, -1.06191316,  0.2607012 , ...,  4.        ,
         4.        ,  7.        ],
       ...,
       [-0.93195878, -0.19717419, -3.34779894, ...,  3.        ,
         6.        ,  3.        ],
       [-1.198492  , -0.82410994, -0.20192702, ...,  4.        ,
         3.        ,  2.        ],
       [-0.61614902,  2.61322747, -0.75708089, ...,  3.        ,
         6.        ,  3.        ]])

fit() => fit the data ----  Learn and estimate the parameters of the transformation
transform() => transform the data ---- Apply the learned transformation to new data. This is useful when we want to transform new data using the same scaling or encoding applied to the training data. for example test data.
fit_transform() => do both at a time.


In [None]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [15]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [16]:
trained_model_list=[]
model_list=[]
r2_list=[]
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}


In [17]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    print(model)

LinearRegression()
Lasso()
Ridge()
ElasticNet()


In [18]:
print(models.keys())
print(models.values())

dict_keys(['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet'])
dict_values([LinearRegression(), Lasso(), Ridge(), ElasticNet()])


In [None]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [21]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    #this is a validation(test) score
    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

ValueError: could not convert string to float: 'Ideal'