<a href="https://colab.research.google.com/github/strathmore-uni/DiamondsPriceLinear/blob/main/Diamonds_Price_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import PCA

# --- Q1: Download and Upload the Diamonds Dataset ---
# Assuming 'diamonds.csv' is already uploaded to your Colab environment.
df = pd.read_csv('diamonds.csv')

# --- Q2: Data Cleaning ---
# 1. Drop the first unnamed index column (if it exists)
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)

# 2. Drop rows where 'x', 'y', 'z' (dimensions), 'carat', or 'price' are zero/invalid.
df_cleaned = df[~((df['x']==0) | (df['y']==0) | (df['z']==0) | (df['carat']==0) | (df['price']==0))].copy()

# --- Q2: Exploratory Data Analysis (3 Insights) ---

# Insight 1: Correlation of numerical features with 'price'
numeric_df = df_cleaned.select_dtypes(include=np.number)
correlation_matrix = numeric_df.corr()
price_correlations = correlation_matrix['price'].sort_values(ascending=False)
print("Insight 1: Correlation of numerical features with 'price':\n", price_correlations)

# Insight 2: Distribution of 'price' (Histogram)
plt.figure(figsize=(10, 6))
sns.histplot(df_cleaned['price'], bins=50, kde=True)
plt.title('Distribution of Diamond Prices')
plt.xlabel('Price (USD)')
plt.ylabel('Frequency')
plt.savefig('price_distribution.png')
plt.close()
print("\nInsight 2: Price distribution histogram saved as 'price_distribution.png' (for visualization in your report).")

# Insight 3: Median Price by 'cut' quality
cut_median_price = df_cleaned.groupby('cut')['price'].median().sort_values(ascending=False)
print("\nInsight 3: Median Price by 'cut' (Order of median prices):\n", cut_median_price)


# --- Q2: Create Sample for Modelling ---
sample_size = 12500
diamonds_model = df_cleaned.sample(n=sample_size, random_state=42).copy()

# Save the sampled data (optional, but good practice for consistency)
diamonds_model.to_csv('diamonds_model.csv', index=False)
print(f"\nDataFrame 'diamonds_model' created with {len(diamonds_model)} records and saved as 'diamonds_model.csv'.")

Insight 1: Correlation of numerical features with 'price':
 price    1.000000
carat    0.921592
x        0.887231
z        0.868206
y        0.867864
table    0.127245
depth   -0.010729
Name: price, dtype: float64

Insight 2: Price distribution histogram saved as 'price_distribution.png' (for visualization in your report).

Insight 3: Median Price by 'cut' (Order of median prices):
 cut
Fair         3282.0
Premium      3182.0
Good         3050.5
Very Good    2647.0
Ideal        1809.5
Name: price, dtype: float64

DataFrame 'diamonds_model' created with 12500 records and saved as 'diamonds_model.csv'.


In [6]:
# Load the sampled data
diamonds_model = pd.read_csv('diamonds_model.csv')
model_results = {}

# --- Feature Engineering and Data Preparation for Q3, Q5 ---

# 1. One-Hot Encode Categorical Variables (dropping one level per feature to avoid multicollinearity)
diamonds_model_encoded = pd.get_dummies(diamonds_model, columns=['cut', 'color', 'clarity'], drop_first=True)

# Define features (X) and target (y)
X = diamonds_model_encoded.drop('price', axis=1)
y = diamonds_model_encoded['price']

# 2. Split the data (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 3. Standardization (Scaling the features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# --- Q3: Train and Verify Linear Regression Model ---
lr_model_q3 = LinearRegression()
lr_model_q3.fit(X_train_scaled, y_train)
y_pred_q3 = lr_model_q3.predict(X_test_scaled)

# Verification
r2_q3 = r2_score(y_test, y_pred_q3)
mae_q3 = mean_absolute_error(y_test, y_pred_q3)
mse_q3 = mean_squared_error(y_test, y_pred_q3)
model_results['Q3_LR_All_Features'] = {'R2': r2_q3, 'MAE': mae_q3, 'MSE': mse_q3}

print("--- Question 3: Linear Regression on All Features Results ---")
print(f"R-squared (R2): {r2_q3:.4f}")
print(f"Mean Absolute Error (MAE): ${mae_q3:.2f}")

--- Question 3: Linear Regression on All Features Results ---
R-squared (R2): 0.9281
Mean Absolute Error (MAE): $710.66


In [7]:
# --- Q4: PCA and Linear Regression on 2 Features ---

# 1. Select Continuous Features
continuous_features = ['carat', 'x', 'y', 'z', 'table', 'depth']
X_cont = diamonds_model[continuous_features]

# 2. Standardize Continuous Features
scaler_pca = StandardScaler()
X_cont_scaled = scaler_pca.fit_transform(X_cont)

# 3. Apply PCA to select 2 components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cont_scaled)
X_pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
y_pca = diamonds_model['price']

# 4. Split PCA data
X_pca_train, X_pca_test, y_pca_train, y_pca_test = train_test_split(X_pca_df, y_pca, test_size=0.3, random_state=42)

# 5. Train Linear Regression Model on PCA components
lr_model_q4 = LinearRegression()
lr_model_q4.fit(X_pca_train, y_pca_train)
y_pred_q4 = lr_model_q4.predict(X_pca_test)

# Verification
r2_q4 = r2_score(y_pca_test, y_pred_q4)
mae_q4 = mean_absolute_error(y_pca_test, y_pred_q4)
mse_q4 = mean_squared_error(y_pca_test, y_pred_q4)
model_results['Q4_LR_PCA_2_Features'] = {'R2': r2_q4, 'MAE': mae_q4, 'MSE': mse_q4}

print("\n--- Question 4: PCA and Linear Regression on 2 Features Results ---")
print(f"R-squared (R2): {r2_q4:.4f}")
print(f"Mean Absolute Error (MAE): ${mae_q4:.2f}")
print(f"Variance explained by PC1 and PC2: {pca.explained_variance_ratio_.sum():.4f}")


--- Question 4: PCA and Linear Regression on 2 Features Results ---
R-squared (R2): 0.8077
Mean Absolute Error (MAE): $1276.92
Variance explained by PC1 and PC2: 0.8638


In [8]:
# Note: X_train_scaled, X_test_scaled, y_train, y_test from Step 2 are used here.
alpha = 1.0 # Common alpha value for demonstration

# --- Q5: Lasso Regression ---
lasso_model = Lasso(alpha=alpha, random_state=42, max_iter=10000)
lasso_model.fit(X_train_scaled, y_train)
y_pred_lasso = lasso_model.predict(X_test_scaled)

# Verification
r2_lasso = r2_score(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
model_results['Q5_Lasso_All_Features'] = {'R2': r2_lasso, 'MAE': mae_lasso, 'MSE': mse_lasso}

print("\n--- Question 5: Lasso Regression Results (alpha=1.0) ---")
print(f"R-squared (R2): {r2_lasso:.4f}")
print(f"Mean Absolute Error (MAE): ${mae_lasso:.2f}")

# --- Q5: Ridge Regression ---
ridge_model = Ridge(alpha=alpha, random_state=42)
ridge_model.fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)

# Verification
r2_ridge = r2_score(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
model_results['Q5_Ridge_All_Features'] = {'R2': r2_ridge, 'MAE': mae_ridge, 'MSE': mse_ridge}

print("\n--- Question 5: Ridge Regression Results (alpha=1.0) ---")
print(f"R-squared (R2): {r2_ridge:.4f}")
print(f"Mean Absolute Error (MAE): ${mae_ridge:.2f}")


# --- Q6: Model Comparison ---
df_results = pd.DataFrame.from_dict(model_results, orient='index')
df_results.to_csv('model_comparison_results.csv', index_label='Model')

print("\n--- Question 6: All Model Comparison Summary ---")
print(df_results)
print("\nFull comparison table saved to 'model_comparison_results.csv'.")


--- Question 5: Lasso Regression Results (alpha=1.0) ---
R-squared (R2): 0.9279
Mean Absolute Error (MAE): $710.37

--- Question 5: Ridge Regression Results (alpha=1.0) ---
R-squared (R2): 0.9281
Mean Absolute Error (MAE): $710.96

--- Question 6: All Model Comparison Summary ---
                             R2          MAE           MSE
Q3_LR_All_Features     0.928104   710.655207  1.170727e+06
Q4_LR_PCA_2_Features   0.807690  1276.923613  3.131498e+06
Q5_Lasso_All_Features  0.927910   710.373715  1.173885e+06
Q5_Ridge_All_Features  0.928075   710.955195  1.171198e+06

Full comparison table saved to 'model_comparison_results.csv'.
