# Imports and Instantiation

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.pipeline import make_pipeline
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge

import matplotlib.pyplot as plt
import re
from rich import print
from rich.table import Table
from rich.console import Console
import seaborn as sns
from tabulate import tabulate

console = Console()

# Loading the Data



In [None]:
# Locations of the train and test CSV files
train_set = "data/final_train_data.csv"
test_set = "data/final_test_data.csv"

# Load each file (first row is the header) and discard the surplus
# 'Number' column straight away
train_data = pd.read_csv(train_set, header=0).drop(columns='Number')
test_data = pd.read_csv(test_set, header=0).drop(columns='Number')

# The training column order is the reference layout for both frames
attributes = list(train_data.columns)

# Bring the test columns into the same order as the training columns
test_data = test_data.loc[:, attributes]

# Data Standardization

In [None]:
def standardize_features(data, features_to_standardize):
    """Return a copy of ``data`` with the selected columns standardized.

    A fresh ``StandardScaler`` is fitted on ``data[features_to_standardize]``
    and its output replaces those columns in the returned copy. All other
    columns, and ``data`` itself, are left untouched.

    Args:
        data: DataFrame holding the columns to scale.
        features_to_standardize: Names of the columns to standardize.

    Returns:
        A new DataFrame with the same columns as ``data``.
    """
    # Work on a copy so the caller's frame and the untouched columns survive
    scaled_data = data.copy()

    # Nothing to scale: return the copy instead of asking the scaler to fit
    # on a frame with zero columns
    if len(features_to_standardize) == 0:
        return scaled_data

    scaled_features = StandardScaler().fit_transform(data[features_to_standardize])

    # Assign whole columns rather than writing through .loc[:, cols]: column
    # assignment replaces the columns outright, so integer columns can take
    # the float scaled values without an in-place dtype change (which recent
    # pandas versions warn about or reject)
    scaled_data[features_to_standardize] = scaled_features

    return scaled_data

# Standardize every column except the one-hot columns and the 'Price' target
features_to_standardize = [
    column_name
    for column_name in train_data.columns
    if 'one_hot' not in column_name and column_name != 'Price'
]

standardized_train_data = standardize_features(train_data, features_to_standardize)

# The test set has to be scaled with the statistics learned from the training
# set. Fitting a second scaler on the test data would put the two sets on
# different scales and would use test-set statistics during evaluation.
_train_scaler = StandardScaler().fit(train_data[features_to_standardize])
standardized_test_data = test_data.copy()
standardized_test_data[features_to_standardize] = _train_scaler.transform(
    test_data[features_to_standardize]
)

# Feature Selection Method(s)

In [None]:
def ufs_feature_selection(train_data, n_features):
    """Pick the ``n_features`` best columns by univariate feature selection.

    The last column of ``train_data`` is used as the target and every other
    column as a candidate feature, scored with ``f_regression``.

    Returns:
        A list with the names of the selected columns followed by the name of
        the last (target) column.
    """
    candidates = train_data.iloc[:, :-1]
    target = train_data.iloc[:, -1]

    # Fit the selector on the candidate features against the target
    ufs = SelectKBest(score_func=f_regression, k=n_features).fit(candidates, target)

    # Positions refer to the candidate columns, which sit at the same
    # positions in train_data because only the last column was left out
    kept_positions = ufs.get_support(indices=True)

    return [*train_data.columns[kept_positions], train_data.columns[-1]]

# Error Computation

In [None]:
def RMSE(labels, pred):
    """Return the root-mean-squared error of ``pred`` against ``labels``.

    The value is rounded to 4 decimal places.
    """
    # Square root of the mean squared error, then round
    return np.sqrt(mean_squared_error(labels, pred)).round(4)

def MAE(labels, pred):
    """Return the mean absolute error of ``pred`` against ``labels``.

    The value is rounded to 4 decimal places.
    """
    # np.round accepts NumPy scalars and built-in floats alike, whereas the
    # .round() method only exists on NumPy scalars; this keeps the helper
    # independent of the scalar type the metric returns
    return np.round(mean_absolute_error(labels, pred), 4)

def R2E(labels, pred):
    """Return the R-squared score of ``pred`` against ``labels``.

    The value is rounded to 4 decimal places.
    """
    # np.round accepts NumPy scalars and built-in floats alike, whereas the
    # .round() method only exists on NumPy scalars; this keeps the helper
    # independent of the scalar type the metric returns
    return np.round(r2_score(labels, pred), 4)

# Model Performance on Test Data

## Trivial Model 

In [None]:
class TrivialModel:
    """Baseline regressor that always predicts the mean of the training labels.

    The last column of the frame passed to the constructor is treated as the
    label column. The remaining columns are stored as ``feature_data`` but
    are not used for prediction.
    """

    def __init__(self, data):
        # Every column except the last is a feature; the last is the label
        self.feature_data = data.iloc[:, :-1].copy()
        # Labels are kept rounded to 2 decimal places
        self.labels = data.iloc[:, -1].copy().round(2)

    def fit(self):
        """Store the mean training label, rounded to 2 decimal places."""
        # np.round works whether the mean is a NumPy scalar or a built-in float
        self.output = np.round(self.labels.mean(), 2)

    def predict(self, user_input):
        """Return the stored mean label; ``user_input`` is ignored."""
        return self.output

    def RMSE(self, labels, pred):
        """Return the (unrounded) root-mean-squared error."""
        return np.sqrt(mean_squared_error(labels, pred))

    def MAE(self, labels, pred):
        """Return the (unrounded) mean absolute error."""
        return mean_absolute_error(labels, pred)

    def R2E(self, labels, pred):
        """Return the R-squared score rounded to 2 decimal places."""
        # np.round instead of the .round() method: the method only exists on
        # NumPy scalars, so calling it on a built-in float would fail
        return np.round(r2_score(labels, pred), 2)


def trivial_system(train_data, test_data):
    """Fit the trivial mean predictor on ``train_data`` and report test metrics.

    The last column of each frame is treated as the label column. The constant
    prediction and a table with RMSE, MAE and R-squared on ``test_data`` are
    printed through the module-level rich console; nothing is returned.
    """
    test_labels = test_data.iloc[:, -1].copy().round(2)

    # No shuffling of the training rows: the model only takes the mean of the
    # labels, which does not depend on row order
    model = TrivialModel(train_data)
    model.fit()

    # The model ignores its input, so a single call gives the prediction for
    # every test row and no per-row loop is needed
    output = model.predict(None)
    predictions = [output] * len(test_labels)

    console.print(f'Trivial Model\'s Laptop Price Prediction: {output}\n\n')

    console.print('Performance on Testing Set: \n')

    rmse = model.RMSE(test_labels, predictions)
    mae = model.MAE(test_labels, predictions)
    r2e = model.R2E(test_labels, predictions)

    att_table = Table()
    att_table.add_column('RMSE', style='blue')
    att_table.add_column('MAE', style='green')
    att_table.add_column('R-squared Error', style='red')

    # np.round handles NumPy scalars and built-in floats alike.
    # R-squared is shown with its sign: a negative score means the model does
    # worse than predicting the test mean, and taking abs() would hide that.
    # Adding 0.0 only turns a rounded "-0.0" into "0.0".
    att_table.add_row(
        str(np.round(rmse, 4)),
        str(np.round(mae, 4)),
        str(r2e + 0.0),
    )

    console.print(att_table)


# Evaluate the mean-predicting baseline on the unstandardized train/test frames
trivial_system(train_data=train_data, test_data=test_data)

## 1-Nearest Neighbor (Baseline)

In [None]:
def nearest_neighbour_system(train_data, test_data):
    """Fit a 1-nearest-neighbour regressor and print its test-set metrics.

    In both frames the last column is the label and all other columns are
    the inputs. Test labels are rounded to 2 decimal places before scoring.
    Nothing is returned; results go to the rich console.
    """
    # Inputs / labels for fitting
    x_train = train_data.iloc[:, :-1].copy()
    y_train = train_data.iloc[:, -1].copy()

    # Inputs / labels for evaluation
    x_test = test_data.iloc[:, :-1].copy()
    y_test = test_data.iloc[:, -1].copy().round(2)

    # Single-neighbour regressor, fitted on the training split
    knn = KNeighborsRegressor(n_neighbors=1)
    knn.fit(x_train, y_train)

    y_pred = knn.predict(x_test)

    # Compute all three metrics before anything is printed
    metrics = (
        np.sqrt(mean_squared_error(y_test, y_pred)),
        mean_absolute_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )

    console.print("\n[blue]Error Metrics for Test Set:[/blue]\n")
    print_error_metrics(*metrics)

def print_error_metrics(rmse, mae, r2e):
    """Print RMSE, MAE and R-squared as a two-column rich table.

    Each value is formatted with 4 decimal places.
    """
    att_table = Table(title="Error Metrics", show_header=True, header_style="bold magenta")

    # Both columns share the same styling
    for heading in ("Metric", "Value"):
        att_table.add_column(heading, style="cyan", justify="center")

    # One row per metric, in the order RMSE, MAE, R-squared
    for label, value in (("RMSE", rmse), ("MAE", mae), ("R-squared Error", r2e)):
        att_table.add_row(label, f"{value:.4f}")

    console.print(att_table)

# Keep the 22 selected features (plus the target column) in both
# standardized frames, then evaluate the 1-nearest-neighbour baseline
selected_features = ufs_feature_selection(standardized_train_data, 22)
train_data_selected, test_data_selected = (
    frame[selected_features].copy()
    for frame in (standardized_train_data, standardized_test_data)
)
nearest_neighbour_system(train_data_selected, test_data_selected)

## Linear Regression Model (Baseline)

In [None]:
def linear_regression_system(train_data, test_data, tolerance=5000):
    """Fit ordinary linear regression and print its test-set metrics.

    In both frames the last column is the label and all other columns are
    the inputs. Test labels are rounded to 2 decimal places before scoring.
    Predictions that fall outside the range of the training labels, widened
    by ``tolerance`` on each side, are replaced by the mean training label.

    Args:
        train_data: DataFrame used to fit the model.
        test_data: DataFrame used for evaluation.
        tolerance: Margin added below the minimum and above the maximum
            training label when deciding whether a prediction is plausible.

    Nothing is returned; results go to the rich console.
    """
    # Train the model on the entire training set
    train_inputs = train_data.iloc[:, :-1].copy()
    train_labels = train_data.iloc[:, -1].copy()

    model = LinearRegression()
    model.fit(train_inputs, train_labels)

    # Predictions on the test set
    test_inputs = test_data.iloc[:, :-1].copy()
    test_labels = test_data.iloc[:, -1].copy().round(2)
    predictions = model.predict(test_inputs)

    # Accepted prediction range: span of the training labels plus the tolerance
    min_value = train_labels.min() - tolerance
    max_value = train_labels.max() + tolerance

    # Fallback value for predictions outside the accepted range
    mean_value = train_labels.mean()

    out_of_range = (predictions < min_value) | (predictions > max_value)
    predictions = np.where(out_of_range, mean_value, predictions)

    # Error metrics for the test set
    rmse = np.sqrt(mean_squared_error(test_labels, predictions))
    mae = mean_absolute_error(test_labels, predictions)
    r2e = r2_score(test_labels, predictions)

    console.print("\n[blue]Performance on Testing Set:[/blue]")

    # Reuse the file's table helper instead of rebuilding the same table here
    print_error_metrics(rmse, mae, r2e)

# Keep the 22 selected features (plus the target column) in both
# standardized frames, then evaluate the linear regression baseline
selected_features = ufs_feature_selection(standardized_train_data, 22)
train_data_selected, test_data_selected = (
    frame[selected_features].copy()
    for frame in (standardized_train_data, standardized_test_data)
)
linear_regression_system(train_data_selected, test_data_selected)

## Ridge Regression: Highest Performing Model on the Test Dataset

In [None]:
def ridge_regression_system(train_data, test_data, alpha=0.75):
    """Fit a ridge regression model and print its test-set metrics.

    In both frames the last column is the label and all other columns are
    the inputs. Test labels are rounded to 2 decimal places before scoring.

    Args:
        train_data: DataFrame used to fit the model.
        test_data: DataFrame used for evaluation.
        alpha: Regularization strength passed to ``Ridge``.

    Nothing is returned; results go to the rich console.
    """
    # Split both frames into inputs and labels
    train_inputs = train_data.iloc[:, :-1].copy()
    train_labels = train_data.iloc[:, -1].copy()

    test_inputs = test_data.iloc[:, :-1].copy()
    test_labels = test_data.iloc[:, -1].copy().round(2)

    # Initialize the ridge regression model
    model = Ridge(alpha=alpha)

    # Train the model
    model.fit(train_inputs, train_labels)

    # Predict on the test set
    test_predictions = model.predict(test_inputs)

    # Calculate error metrics for the test set
    test_rmse = np.sqrt(mean_squared_error(test_labels, test_predictions))
    test_mae = mean_absolute_error(test_labels, test_predictions)
    test_r2e = r2_score(test_labels, test_predictions)

    console.print("\n[blue]Error Metrics for Test Set:[/blue]\n")
    print_error_metrics(test_rmse, test_mae, test_r2e)

def print_error_metrics(rmse, mae, r2e):
    """Print RMSE, MAE and R-squared as a two-column rich table.

    Each value is formatted with 4 decimal places. This is the same helper
    that is defined earlier in the file, redefined here.
    """
    att_table = Table(title="Error Metrics", show_header=True, header_style="bold magenta")

    # Both columns share the same styling
    for heading in ("Metric", "Value"):
        att_table.add_column(heading, style="cyan", justify="center")

    # One row per metric, in the order RMSE, MAE, R-squared
    for label, value in (("RMSE", rmse), ("MAE", mae), ("R-squared Error", r2e)):
        att_table.add_row(label, f"{value:.4f}")

    console.print(att_table)

# Keep the 22 selected features (plus the target column) in both
# standardized frames, then evaluate the ridge regression model
selected_features = ufs_feature_selection(standardized_train_data, 22)
train_data_selected, test_data_selected = (
    frame[selected_features].copy()
    for frame in (standardized_train_data, standardized_test_data)
)
ridge_regression_system(train_data_selected, test_data_selected)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9992d859-d1f8-4007-afdd-e66541c42881' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>