In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


: 

In [None]:
# Load the cleaned laptop-price table (point the path at your own file if it differs).
df = pd.read_csv(filepath_or_buffer='Datasets/cleaned_laptop_price_data.csv')
# Show the first rows as the cell output for a quick visual check.
df.head()


In [None]:
# Display the dtype of every column as the cell output.
df.dtypes


In [None]:
# The regression target is the 'price_euros' column; every other column is a predictor.
y = df['price_euros']
X = df.drop(columns='price_euros')



In [None]:
# Re-bind X to the result of pandas' dummy/indicator encoding of the feature frame.
X = pd.get_dummies(data=X)



In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
)


In [None]:
# Fit a linear-regression baseline on the training split.
model = LinearRegression().fit(X_train, y_train)
# fit() hands back the estimator itself, so echo it to keep the fitted-model repr as cell output.
model




In [None]:
# Score the linear baseline on the held-out rows.
y_pred = model.predict(X_test)

# RMSE is reported as the square root of the mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)



In [None]:
# Scatter of held-out actual prices against the predictions currently in y_pred.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Actual vs Predicted Prices')
plt.show()



In [None]:
from sklearn.metrics import r2_score

# R² of the predictions currently in y_pred against the held-out targets.
print("R² Score:", r2_score(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.ensemble import RandomForestRegressor



In [None]:
# Train a random forest on the same split; the fixed seed makes the fit repeatable.
rf_model = RandomForestRegressor(random_state=30).fit(X_train, y_train)
# Echo the fitted estimator so the cell output matches that of a bare fit() call.
rf_model



In [None]:
# Overwrite y_pred with the random forest's predictions on the test split.
y_pred = rf_model.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Metrics for the predictions currently held in y_pred.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error

print("RMSE:", rmse)
print("R² Score:", r2)



In [None]:
import sys
!{sys.executable} -m pip install xgboost
!pip install xgboost


In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score



In [None]:
# Default-parameter XGBoost baseline. The seed matches the one used for the
# other models in this notebook (random_state=30) so runs stay comparable.
xgb_model = XGBRegressor(random_state=30)
# Last expression: fit on the training split and show the fitted estimator.
xgb_model.fit(X_train, y_train)




In [None]:
# Overwrite y_pred with the default XGBoost model's predictions on the test split.
y_pred = xgb_model.predict(X_test)



In [None]:
# Metrics for the predictions currently held in y_pred.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = mse ** 0.5  # square root of the MSE

print("RMSE:", rmse)
print("R² Score:", r2)



In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Candidate hyperparameter values; the grid holds 2 x 3 x 3 = 18 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.05, 0.1, 0.2],
)

# Base estimator with a fixed seed.
xgb = XGBRegressor(random_state=30)

# 3-fold cross-validated grid search scored by negated RMSE.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and report its settings.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

In [None]:
# Predict with the estimator selected by the grid search.
y_pred = best_model.predict(X_test)

# Held-out metrics for the tuned model.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

print("Tuned RMSE:", rmse)
print("Tuned R² Score:", r2)



In [None]:
# Actual vs predicted prices for the values in y_pred, on an 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='teal', alpha=0.6)

# Red y = x line: a point on it would be a perfect prediction.
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, color='red')

ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted Laptop Prices (XGBoost)")
ax.grid(True)
fig.tight_layout()
plt.show()



In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Same comparison, with a dashed red diagonal marking predicted == actual.
lo, hi = y_test.min(), y_test.max()
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([lo, hi], [lo, hi], 'r--', linewidth=2)
plt.title("Actual vs Predicted Prices with Reference Line")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Importance scores exposed by best_model.
importances = best_model.feature_importances_

# Label each score with its training column and rank from highest to lowest.
feature_importance_series = (
    pd.Series(importances, index=X_train.columns)
    .sort_values(ascending=False)
)

# Bar chart of the ranked scores.
plt.figure(figsize=(10,6))
feature_importance_series.plot.bar()
plt.title('Feature Importance')
plt.ylabel('Importance Score')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Refresh the predictions from best_model and form the errors (actual - predicted).
y_pred = best_model.predict(X_test)
residuals = y_test - y_pred

# Figure 1: histogram of the residuals with a KDE overlay.
fig_hist, ax_hist = plt.subplots(figsize=(10,5))
sns.histplot(residuals, bins=30, kde=True, ax=ax_hist)
ax_hist.set_title('Residuals Distribution')
ax_hist.set_xlabel('Error (Actual - Predicted)')
plt.show()

# Figure 2: residuals against predicted price, with a dashed zero-error line.
fig_scatter, ax_scatter = plt.subplots(figsize=(10,5))
ax_scatter.scatter(y_pred, residuals, alpha=0.5)
ax_scatter.axhline(y=0, color='r', linestyle='--')
ax_scatter.set_title('Residuals vs Predicted Values')
ax_scatter.set_xlabel('Predicted Price')
ax_scatter.set_ylabel('Residuals (Error)')
plt.show()

In [None]:
import joblib

# Serialize best_model to 'xgb_model.pkl' (a relative path); the call's return value is the cell output.
joblib.dump(value=best_model, filename='xgb_model.pkl')



In [None]:
import matplotlib.pyplot as plt

# Actual vs predicted scatter for the values currently in y_pred.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Actual vs Predicted Laptop Prices')

# Write the image to disk before showing the figure.
fig.savefig('actual_vs_predicted.png')

plt.show()

In [None]:
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import files
# Hand the previously saved plot file to Colab's download helper.
files.download('actual_vs_predicted.png')



In [None]:
import matplotlib.pyplot as plt
from google.colab import files
import numpy as np

# Actual vs predicted scatter on an 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(y_test, y_pred, alpha=0.6)

# Dashed red diagonal spanning the observed range of actual prices.
diag_ends = [min(y_test), max(y_test)]
ax.plot(diag_ends, diag_ends, color='red', linestyle='--')

ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Actual vs Predicted Laptop Prices')

# Save to disk, display, then pass the saved file to Colab's download helper.
fig.savefig('actual_vs_predicted.png')

plt.show()

files.download('actual_vs_predicted.png')



In [None]:
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
# Mount Google Drive at /content/drive; later cells build their save paths under this mount point.
drive.mount('/content/drive')



In [None]:
import os

# Define the folder path
# (a location under the Drive mount point; the same value is re-assigned in the later save cells)
folder_path = '/content/drive/MyDrive/Data Model Building'

In [None]:
import os
import joblib

# Destination folder under the mounted Google Drive.
folder_path = '/content/drive/MyDrive/Data Model Building'

# Create the directory (and any missing parents); no error if it already exists.
os.makedirs(folder_path, exist_ok=True)

# Save the tuned model selected by the grid search (best_model).
# This previously dumped xgb_model, the untuned default-parameter baseline,
# even though the notebook's summary names the tuned model as the saved artifact.
joblib.dump(best_model, os.path.join(folder_path, 'xgb_model.pkl'))

In [None]:
import matplotlib.pyplot as plt
import os

# Destination folder under the mounted Google Drive.
folder_path = '/content/drive/MyDrive/Data Model Building'

# Create the directory (and any missing parents); no error if it already exists.
os.makedirs(folder_path, exist_ok=True)

# Actual vs predicted scatter for the values currently in y_pred.
plt.figure(figsize=(8,6))
plt.scatter(y_test, y_pred, alpha=0.6)

# Dashed red diagonal spanning the observed range of actual prices.
min_val = min(y_test)
max_val = max(y_test)
plt.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Prices')

# Save the plot as a PNG inside the Drive folder before showing it.
plot_path = os.path.join(folder_path, 'actual_vs_predicted.png')
plt.savefig(plot_path)

plt.show()

Model Building Summary (Colab Session)

Steps Taken:

- Mounted Google Drive to access and save files.
- Loaded Cleaned Data: cleaned_laptop_price_data.csv
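- Encoded Categorical Variables using Label Encoding (no binary encoding used); the notebook additionally passes the features through `pd.get_dummies`, which one-hot encodes any columns that are still non-numeric.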
- Explored and Visualized Data using:
  - Correlation heatmap
  - Diagonal plot (Actual vs Predicted)
  - Residual error plots
  - Feature importance plot from XGBoost
- Tried Multiple Regression Models:
  - Linear Regression
  - Random Forest Regressor
  - XGBoost Regressor
- Tuned Hyperparameters for XGBoost to improve performance.
- Evaluated Models using RMSE, R² score, residual plots, and prediction scatter plots.
- Identified Influential Features using feature importance plot.
- Saved the Best Model (xgb_model.pkl) to:
  MyDrive/Data Model Building/
- Saved Visualizations (e.g., diagonal plot) as .png files to the same Drive folder.

Final Model:
- Model: XGBoost Regressor (tuned)
- Filename: xgb_model.pkl
- Stored In: Google Drive → Data Model Building


