In [1]:
import pandas as pd
df = pd.read_csv('./data/gemstone.csv')


In [2]:
df =df.drop(labels = ['id'], axis = 1)

In [3]:
#pip install scikit-learn

In [4]:

X = df.drop(labels = ['price'], axis = 1)
y = df[['price']]

In [5]:
categorical_cols = X.select_dtypes(include = 'object').columns
numerical_cols = X.select_dtypes(exclude = 'object').columns

In [6]:
cut_categories = ['Premium', 'Very Good', 'Ideal', 'Good', 'Fair']
color_categories = ['F', 'J', 'G', 'E', 'D', 'H', 'I']
clarity_categories = ['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1']

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

In [8]:
##pipelines

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
#Numerical pipeline
num_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())
    ]
)

#Categorical pipeline

cat_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('encoder', OrdinalEncoder(categories = [cut_categories,color_categories,clarity_categories])),
        ('scaler', StandardScaler())
    ]
)
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols)
])


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

In [11]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns = preprocessor.get_feature_names_out())

In [12]:
X_test = pd.DataFrame(preprocessor.transform(X_test), columns = preprocessor.get_feature_names_out())

In [13]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [14]:
regression = LinearRegression()
regression.fit(X_train, y_train)

In [15]:
regression.coef_

array([[ 6538.82211476,  -232.4061404 ,  -177.05602534, -2090.53215704,
         -556.05883268,   -44.19321465,   -10.72128479,  -181.811724  ,
          121.17907723]])

In [16]:
regression.intercept_

array([3976.8787389])

In [17]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    R2_score = r2_score(true, predicted)
    return mae, rmse, R2_score

In [20]:
#train multiple models
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elasticnet': ElasticNet()
}
model_list = []
r2_list = []


for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    ## prediction
    y_pred = model.predict(X_test)

    mae, rmse, R2_score = evaluate_model(y_test, y_pred)
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])


    print('Model training performance')
    print('RMSE: ', rmse)
    print('MAE": ', mae)
    print('R2_Score: ', R2_score)

    r2_list.append(R2_score)

    print('='*35)
    print('\n')

LinearRegression


AttributeError: 'LinearRegression' object has no attribute 'values'

In [19]:
model_lists

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']