In [4]:
import pandas as pd

In [5]:
#Data Ingestion
df = pd.read_csv('data/gemstone.csv')
df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.30,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.70,984
2,3,0.90,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.80,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779
...,...,...,...,...,...,...,...,...,...,...,...
26962,26963,1.11,Premium,G,SI1,62.3,58.0,6.61,6.52,4.09,5408
26963,26964,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74,1114
26964,26965,0.51,Premium,E,VS2,61.7,58.0,5.12,5.15,3.17,1656
26965,26966,0.27,Very Good,F,VVS2,61.8,56.0,4.19,4.20,2.60,682


In [6]:
df=df.drop(labels=['id'],axis=1)

In [7]:
X=df.drop(labels=['price'],axis=1)
Y=df[['price']]

In [8]:
Y
 

Unnamed: 0,price
0,499
1,984
2,6289
3,1082
4,779
...,...
26962,5408
26963,1114
26964,1656
26965,682


In [9]:
# Define which column should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [10]:
# Define the custom ranking for each ordinal variable
cut_categories =['Fair','Good','Very Good','Premium','Ideal']
color_categories =['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [11]:
# Handling missing values
from sklearn.impute import SimpleImputer

#Handling feature Scaling
from sklearn.preprocessing import StandardScaler
# handling ordinal encoding
from sklearn.preprocessing import OrdinalEncoder


from sklearn.pipeline import Pipeline
# to combine two different pipeline
from sklearn.compose import ColumnTransformer

In [12]:
## Numerical pipeline
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler()),
    ]
)

## Categorical pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('Ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('scaler',StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [13]:
## Train tets split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [14]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [15]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.561674,0.318936,-0.651005,0.704838,0.692859,0.723432,0.983583,0.226283,0.576148
1,-0.604087,-0.397691,1.589878,-0.533052,-0.486169,-0.533329,-0.812066,-0.945547,-0.639724
2,-0.520818,-0.254365,-1.099182,-0.409263,-0.376688,-0.410385,0.983583,0.812198,-1.247659
3,-0.999613,0.247274,-1.099182,-1.178523,-1.134635,-1.107067,0.983583,0.812198,0.576148
4,0.228599,0.963901,-0.651005,0.413049,0.330729,0.477544,-0.812066,0.812198,-0.031788


In [16]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [17]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [18]:
regression.coef_

array([[4989.90118006, -117.54738915,  -71.5693016 , -811.65093156,
          23.86496768,  -22.98019608,  128.63802707, -551.41625636,
         835.50098553]])

In [19]:
regression.intercept_

array([3944.254662])

In [20]:
    import numpy as np
    def evaluate_model(true,predicted):
        mae = mean_absolute_error(true,predicted)
        mse=mean_squared_error(true,predicted)
        rmse=np.sqrt(mean_squared_error(true,predicted))
        r2_square=r2_score(true,predicted)
        return  mae,mse,rmse,r2_square


In [21]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Your models dictionary
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
}

trained_model_list = []
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)
    
    # Make prediction
    y_pred = model.predict(X_test)
    mae, mse, rmse, r2_square = evaluate_model(y_test, y_pred)
    
    print(model_name)
    model_list.append(model_name)
    print("Model training performance")
    print("MAE:", mae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R² score:", r2_square * 100)
    
    print('=' * 35)
    print('\n')


LinearRegression
Model training performance
MAE: 813.809226685926
MSE: 1391561.362208422
RMSE: 1179.644591480172
R² score: 91.4417386050815


Lasso
Model training performance
MAE: 815.4198090362225
MSE: 1394371.672450772
RMSE: 1180.8351588815317
R² score: 91.42445487594959


Ridge
Model training performance
MAE: 814.0750740594788
MSE: 1391982.7923832168
RMSE: 1179.823203867095
R² score: 91.43914676134858


ElasticNet
Model training performance
MAE: 1079.23871584633
MSE: 2654517.353624659
RMSE: 1629.2689629476954
R² score: 83.6744149366052




In [25]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Function to evaluate model performance
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    r2_square = r2_score(y_true, y_pred)
    return mae, mse, rmse, r2_square

# Your models dictionary including XGBoost and DecisionTree
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'DecisionTree': DecisionTreeRegressor(),
    'XGBoost': XGBRegressor()
}

model_list = []
train_r2_list = []
test_r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)
    
    # Predictions on training set
    y_train_pred = model.predict(X_train)
    train_mae, train_mse, train_rmse, train_r2_square = evaluate_model(y_train, y_train_pred)
    
    # Predictions on test set
    y_test_pred = model.predict(X_test)
    test_mae, test_mse, test_rmse, test_r2_square = evaluate_model(y_test, y_test_pred)
    
    # Append model names and R² scores for comparison
    model_list.append(model_name)
    train_r2_list.append(train_r2_square * 100)
    test_r2_list.append(test_r2_square * 100)
    
    # Print model performance on training data
    print(f'{model_name} - Training Performance')
    print("MAE:", train_mae)
    print("MSE:", train_mse)
    print("RMSE:", train_rmse)
    print("R² score:", train_r2_square * 100)
    
    print("\n")

    # Print model performance on test data
    print(f'{model_name} - Test Performance')
    print("MAE:", test_mae)
    print("MSE:", test_mse)
    print("RMSE:", test_rmse)
    print("R² score:", test_r2_square * 100)
    
    print('=' * 50)
    print('\n')

# Check overfitting by comparing train and test R² scores
for i in range(len(model_list)):
    print(f"{model_list[i]}: Training R²: {train_r2_list[i]:.2f}% | Test R²: {test_r2_list[i]:.2f}%")
    if train_r2_list[i] > test_r2_list[i] + 5:
        print(f"{model_list[i]} is likely overfitting (R² difference: {train_r2_list[i] - test_r2_list[i]:.2f}%).")
    elif test_r2_list[i] > train_r2_list[i]:
        print(f"{model_list[i]} may be underfitting (Test R² > Training R²).")
    else:
        print(f"{model_list[i]} generalizes well.")
    print('-' * 50)


LinearRegression - Training Performance
MAE: 818.6417834221716
MSE: 1531699.4614603932
RMSE: 1237.6184636067746
R² score: 90.52913116690597


LinearRegression - Test Performance
MAE: 813.809226685926
MSE: 1391561.362208422
RMSE: 1179.644591480172
R² score: 91.4417386050815


Lasso - Training Performance
MAE: 819.9850354005648
MSE: 1531837.5771855072
RMSE: 1237.674261340805
R² score: 90.52827716391828


Lasso - Test Performance
MAE: 815.4198090362225
MSE: 1394371.672450772
RMSE: 1180.8351588815317
R² score: 91.42445487594959


Ridge - Training Performance
MAE: 818.8645794271271
MSE: 1531701.6184378164
RMSE: 1237.6193350290778
R² score: 90.52911782979203


Ridge - Test Performance
MAE: 814.0750740594788
MSE: 1391982.7923832168
RMSE: 1179.823203867095
R² score: 91.43914676134858


ElasticNet - Training Performance
MAE: 1075.7395377578027
MSE: 2695927.097455478
RMSE: 1641.9278600034406
R² score: 83.33042965279847


ElasticNet - Test Performance
MAE: 1079.23871584633
MSE: 2654517.353624659


In [27]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Function to evaluate model performance
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    r2_square = r2_score(y_true, y_pred)
    return mae, mse, rmse, r2_square

# Your models dictionary including XGBoost and DecisionTree
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'DecisionTree': DecisionTreeRegressor(),
    'XGBoost': XGBRegressor()
} 

model_list = []
train_r2_list = []
test_r2_list = []
residuals = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    
    # Predictions on training set
    y_train_pred = model.predict(X_train)
    train_mae, train_mse, train_rmse, train_r2_square = evaluate_model(y_train, y_train_pred)
    
    # Predictions on test set
    y_test_pred = model.predict(X_test)
    test_mae, test_mse, test_rmse, test_r2_square = evaluate_model(y_test, y_test_pred)
    
    # Append model names and R² scores for comparison
    model_list.append(model_name)
    train_r2_list.append(train_r2_square * 100)
    test_r2_list.append(test_r2_square * 100)
    
    # Store residuals for plotting
    residuals[model_name] = y_test - y_test_pred
    
# 1. R² Score Comparison (Training vs Test)
plt.figure(figsize=(10, 6))
bar_width = 0.35
index = range(len(model_list))

plt.bar(index, train_r2_list, bar_width, label='Training R²')
plt.bar([i + bar_width for i in index], test_r2_list, bar_width, label='Test R²')

plt.xlabel('Models')
plt.ylabel('R² Score (%)')
plt.title('R² Score Comparison (Training vs Test)')
plt.xticks([i + bar_width / 2 for i in index], model_list, rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

# 2. Residual Plot (Test Residuals for Each Model)
plt.figure(figsize=(12, 8))

# Iterate through residuals dictionary, plot one by one
for model_name, y_test_pred in residuals.items():
    # Compute residuals for the current model
    residuals_for_model = y_test - y_test_pred
    
    # Plot residuals for the model
    sns.residplot(x=y_test, y=residuals_for_model, lowess=True, label=model_name)

# Add horizontal line at y = 0 for reference
plt.axhline(0, color='black', linestyle='--', linewidth=2)
plt.title('Residual Plot for Test Data')
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.legend(loc='upper right')
plt.tight_layout()
plt.show()


# 3. Learning Curves for XGBoost (as an example)
from sklearn.model_selection import learning_curve
import numpy as np

train_sizes, train_scores, test_scores = learning_curve(XGBRegressor(), X_train, y_train, cv=5, scoring='r2', n_jobs=-1,
                                                        train_sizes=np.linspace(0.1, 1.0, 10), random_state=42)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_mean, 'o-', color="g", label="Cross-validation score")

plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color="g")

plt.title("Learning Curve (XGBoost)")
plt.xlabel("Training Size")
plt.ylabel("R² Score")
plt.legend(loc="best")
plt.tight_layout()
plt.show()


ValueError: Unable to coerce to Series, length must be 1: given 8091

In [2]:
import pickle

with open('C:/Users/ashwi/OneDrive/Desktop/diamondPricePredictor/artifacts/model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Now you can use the loaded model for predictions
predictions = loaded_model.predict(X_test)

NameError: name 'X_test' is not defined

In [3]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler

# Sample data
data = {
    'id': [9923],
    'carat': [1.1],
    'cut': ['Very Good'],
    'color': ['F'],
    'clarity': ['SI1'],
    'depth': [60.5],
    'table': [60.0],
    'x': [6.65],
    'y': [6.67],
    'z': [4.03],
    'price': [5667]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Preprocess the data
def preprocess_data(df):
    # Encode categorical variables
    df['cut'] = pd.Categorical(df['cut'], categories=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], ordered=True)
    df['color'] = pd.Categorical(df['color'], categories=['J', 'I', 'H', 'G', 'F', 'E', 'D'], ordered=True)
    df['clarity'] = pd.Categorical(df['clarity'], categories=['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], ordered=True)
    
    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=['cut', 'color', 'clarity'])
    
    # Select features (excluding 'id' and 'price')
    features = df.drop(['id', 'price'], axis=1)
    
    # Scale numerical features
    scaler = StandardScaler()
    numerical_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
    features[numerical_features] = scaler.fit_transform(features[numerical_features])
    
    return features

# Preprocess the sample data
X = preprocess_data(df)

# Load the trained model
try:
    with open('C:/Users/ashwi/OneDrive/Desktop/diamondPricePredictor/artifacts/model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
except FileNotFoundError:
    print("Model file not found. Please ensure that you have trained and saved the model.")
    exit()

# Make prediction
prediction = model.predict(X)

print(f"Predicted price: ${prediction[0]:.2f}")
print(f"Actual price: ${df['price'].values[0]}")
print(f"Difference: ${abs(prediction[0] - df['price'].values[0]):.2f}")

NotFittedError: need to call fit or load_model beforehand