# Imports and Instantiation

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

import matplotlib.pyplot as plt
import re
from rich import print
from rich.table import Table
from rich.console import Console
import seaborn as sns

console = Console()

# Loading the Data



In [None]:
# Locations of the pre-processed (EDA) train and test CSV files
train_set = "data/eda_train_data.csv"
test_set = "data/eda_test_data.csv"

# Load each CSV (first row is the header) and discard the 'Number' column,
# which is surplus to requirements
train_data = pd.read_csv(train_set, header=0).drop(columns=['Number'])
test_data = pd.read_csv(test_set, header=0).drop(columns=['Number'])

# Keep the training column names around as a plain Python list
attributes = list(train_data.columns)

## Comparing test set features to training for encoding

In [None]:
def compare_unique_vals(train_data, test_data, col):
    """Report how the distinct values of one column differ between train and test.

    Prints three summaries for ``col``: values that occur only in the training
    frame, values that occur only in the test frame, and values common to
    both, each followed by its count.

    Args:
        train_data: Training DataFrame containing ``col``.
        test_data: Test DataFrame containing ``col``.
        col: Label of the column to compare.

    Returns:
        The set of values present in ``test_data[col]`` but absent from
        ``train_data[col]``.
    """
    unique_values_train = set(train_data[col])
    unique_values_test = set(test_data[col])

    # Each difference/intersection is computed exactly once
    unique_in_train_not_in_test = unique_values_train - unique_values_test
    unique_in_test_not_in_train = unique_values_test - unique_values_train
    common_unique_values = unique_values_train & unique_values_test

    print(f"For {col}: Unique values in train but not in test: {unique_in_train_not_in_test}\nTotal = {len(unique_in_train_not_in_test)}")
    print(f"For {col} — Unique values in test but not in train: {unique_in_test_not_in_train}\nTotal = {len(unique_in_test_not_in_train)}")
    print(f"For {col}: Common unique values: {common_unique_values}\nTotal = {len(common_unique_values)}")

    return unique_in_test_not_in_train

# Values that occur only in the test set, per rank-encoded column; the
# encoding step later maps these to a sentinel
(
    unseen_val_display,
    unseen_val_mc1,
    unseen_val_mc2,
    unseen_val_cpu,
    unseen_val_gpu,
) = (
    compare_unique_vals(train_data, test_data, column)
    for column in ('Display', 'MemoryComponent1', 'MemoryComponent2', 'CpuModel', 'Gpu')
)

# print(train_data['Display'].unique())
# print(test_data['Display'].unique())

## Categorical Feature Encoding
Applying necessary encoding to all categorical data

In [None]:
def one_hot_test(test_data, train_data, col, col_idx):
    """One-hot encode ``col`` of the test frame using the training categories.

    One indicator column named ``one_hot_<col>_<value>`` is produced for every
    category found in ``train_data[col]``. The categories are taken in the
    same order ``pd.get_dummies`` uses for the training frame (sorted, missing
    values excluded), so train and test end up with identically named and
    identically ordered dummy columns. Test values that never occur in
    training get zeros in every indicator column.

    Args:
        test_data: Test DataFrame containing ``col``. It is not modified.
        train_data: Training DataFrame that still contains ``col``.
        col: Label of the categorical column to encode.
        col_idx: Positional index at which the indicator columns are inserted
            once ``col`` has been removed.

    Returns:
        A new DataFrame with ``col`` replaced by its indicator columns.
    """
    # Same category discovery that get_dummies performs on the training column
    categories = pd.Categorical(train_data[col]).categories

    one_hot = pd.DataFrame({
        f"one_hot_{col}_{val}": (test_data[col] == val).astype(int)
        for val in categories
    })

    # Drop on a copy so the caller's frame is left untouched
    remaining = test_data.drop(columns=[col])
    return pd.concat(
        [remaining.iloc[:, :col_idx], one_hot, remaining.iloc[:, col_idx:]],
        axis=1,
    )

def _one_hot_train_and_test(train_data, test_data, col):
    """One-hot encode ``col`` in both frames, keeping the dummies at its position."""
    dummies = pd.get_dummies(train_data[col], prefix=f'one_hot_{col}').astype(int)
    col_idx = train_data.columns.get_loc(col)

    # The test frame is encoded against the training categories and must use
    # the very same insertion index as the training frame.
    test_data = one_hot_test(test_data, train_data, col, col_idx)

    # Non in-place drop: the caller's frame is not modified
    train_data = train_data.drop(columns=[col])
    train_data = pd.concat(
        [train_data.iloc[:, :col_idx], dummies, train_data.iloc[:, col_idx:]],
        axis=1,
    )
    return train_data, test_data


def _price_rank_mapping(train_data, col, unseen_values, initial=None):
    """Map each value of ``col`` to its rank (1 = cheapest) by mean training 'Price'."""
    mapping = dict(initial) if initial else {}
    # Values that only exist in the test set get the sentinel -1
    mapping.update((value, -1) for value in unseen_values)
    ranked = train_data.groupby(col)['Price'].mean().sort_values().index.tolist()
    mapping.update((value, rank) for rank, value in enumerate(ranked, start=1))
    return mapping


def encoding(train_data, test_data):
    """Encode every categorical feature of the train and test frames.

    * 'Company', 'TypeName' and 'OpSys' are one-hot encoded; the indicator
      columns replace the original column at its position.
    * 'Display', 'MemoryComponent1', 'MemoryComponent2', 'CpuModel' and 'Gpu'
      are replaced by an integer rank obtained by ordering their values by
      mean training 'Price' (1 = lowest). Values listed in the module-level
      sets ``unseen_val_display``, ``unseen_val_mc1``, ``unseen_val_mc2``,
      ``unseen_val_cpu`` and ``unseen_val_gpu`` are mapped to -1.
    * Missing 'Display' values are treated as the category 'None'; values of
      'MemoryComponent2' that are still missing after mapping are filled with
      the code stored for 'None'.

    Args:
        train_data: Training DataFrame, including the 'Price' column.
        test_data: Test DataFrame; assumed to have the same column layout as
            ``train_data`` (the same positional indices are used for both).

    Returns:
        Tuple ``(train_data, test_data)`` of encoded DataFrames. When both
        hold the same set of columns, the test frame is returned in the
        training column order.
    """
    train_data, test_data = _one_hot_train_and_test(train_data, test_data, 'Company')
    train_data, test_data = _one_hot_train_and_test(train_data, test_data, 'TypeName')

    # Display: missing values become their own 'None' category before ranking
    train_data['Display'] = train_data['Display'].fillna('None')
    test_data['Display'] = test_data['Display'].fillna('None')
    display_mapping = _price_rank_mapping(train_data, 'Display', unseen_val_display)
    train_data['Display'] = train_data['Display'].map(display_mapping)
    test_data['Display'] = test_data['Display'].map(display_mapping)

    # Memory components: build both mappings from the raw training values
    # before either column is overwritten
    mc1_mapping = _price_rank_mapping(train_data, 'MemoryComponent1', unseen_val_mc1)
    # 'None' -> 0 covers laptops that have no secondary memory
    mc2_mapping = _price_rank_mapping(
        train_data, 'MemoryComponent2', unseen_val_mc2, initial={'None': 0}
    )
    for frame in (train_data, test_data):
        frame['MemoryComponent1'] = frame['MemoryComponent1'].map(mc1_mapping)
        frame['MemoryComponent2'] = (
            frame['MemoryComponent2']
            .map(mc2_mapping)
            .fillna(mc2_mapping['None'])
            .astype(int)
        )

    # OpSys: the helper inserts the test dummies at the OpSys position
    # (previously the TypeName index was passed for the test frame)
    train_data, test_data = _one_hot_train_and_test(train_data, test_data, 'OpSys')

    for col, unseen_values in (('CpuModel', unseen_val_cpu), ('Gpu', unseen_val_gpu)):
        mapping = _price_rank_mapping(train_data, col, unseen_values)
        train_data[col] = train_data[col].map(mapping)
        test_data[col] = test_data[col].map(mapping)

    # NOTE: standardising 'Ram', 'Weight' and the memory components with
    # StandardScaler was sketched here at one point but is not applied.

    # Estimators fitted on a DataFrame require identical feature order at
    # predict time, so hand back the test frame in the training column order.
    if set(test_data.columns) == set(train_data.columns):
        test_data = test_data[train_data.columns]

    return train_data, test_data

# # Get unique features for each column
# unique_features = {}
# for col in train_data.columns:
#     unique_features[col] = train_data[col].unique()

# print(unique_features)



### Encoding Training and Test Data

In [None]:
# Encode the training and test data together; encoding() returns both frames
train_data, test_data = encoding(train_data, test_data)

In [None]:
# # # Print unique training attributes
# attributes = train_data.columns.tolist()

# for i, attribute in enumerate(attributes):
#     if attribute != 'Number' and attribute != 'Weight' and attribute != 'Price':
#         print(f'Feature {i}, x{i}: {attribute} — *(Total: {len(set(train_data[attribute]))})*\n {set(train_data[attribute])}\n\n')

# Show the distinct encoded values of the rank-encoded columns in the test set
attributes = test_data.columns.tolist()

for i, attribute in enumerate(attributes):
    if attribute not in ('Display', 'MemoryComponent1', 'MemoryComponent2', 'CpuModel', 'Gpu'):
        continue
    print(f'Feature {i}, x{i}: {attribute} — *(Total: {len(set(test_data[attribute]))})*\n {set(test_data[attribute])}\n\n')

In [None]:
# Build a Styler for the encoded test set
# (start from train_data.style instead to inspect the training set)
styled_df = (
    test_data.style
    .set_properties(**{'text-align': 'center'})  # centre the cell text
    .set_table_styles([{'selector': 'th', 'props': [('font-size', '11pt')]}])  # header font size
)

# Last expression of the cell, so the notebook renders the styled table
styled_df

Unnamed: 0,one_hot_Company_Apple,one_hot_Company_HP,one_hot_Company_Acer,one_hot_Company_Asus,one_hot_Company_Dell,one_hot_Company_Lenovo,one_hot_Company_Chuwi,one_hot_Company_MSI,one_hot_Company_Microsoft,one_hot_Company_Toshiba,one_hot_Company_Huawei,one_hot_Company_Xiaomi,one_hot_Company_Vero,one_hot_Company_Razer,one_hot_Company_Mediacom,one_hot_Company_Samsung,one_hot_Company_Google,one_hot_Company_Fujitsu,one_hot_Company_LG,one_hot_OpSys_macOS,one_hot_OpSys_No OS,one_hot_OpSys_Windows 10,one_hot_OpSys_Mac OS X,one_hot_OpSys_Linux,one_hot_OpSys_Android,one_hot_OpSys_Windows 10 S,one_hot_OpSys_Chrome OS,one_hot_OpSys_Windows 7,one_hot_TypeName_Ultrabook,one_hot_TypeName_Notebook,one_hot_TypeName_Netbook,one_hot_TypeName_Gaming,one_hot_TypeName_2 in 1 Convertible,one_hot_TypeName_Workstation,Inches,Display,Touchscreen,ScreenArea,CpuModel,ClockSpeed,Ram,MemoryComponent1,MemoryComponent2,Gpu,Weight,Price
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,17.3,5,0,2073600,76,2.7,32,17,2,75,3.58,68145.12
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,13.3,3,1,2073600,72,1.3,16,17,0,63,1.22,87858.72
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,14.0,5,0,2073600,55,2.7,8,18,0,45,1.13,109170.72
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,15.6,5,0,2073600,43,2.5,8,19,0,45,1.95,104588.1072
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,15.6,5,1,2073600,55,2.7,16,19,0,45,1.08,111834.72
5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,12.5,3,0,2073600,55,2.7,8,19,0,45,1.26,79014.24
6,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,15.6,5,0,2073600,1,1.5,4,6,0,7,2.2,19127.52
7,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,15.6,5,0,2073600,42,2.3,8,14,5,50,2.6,42037.92
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,14.0,5,1,2073600,55,2.7,8,19,0,45,0.98,101178.72
9,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,14.0,3,0,2073600,57,2.8,8,17,0,45,1.6,64202.4


# Export the Datasets

In [None]:
# Output locations for the fully encoded datasets
final_train_set = 'data/final_train_data.csv'
final_test_set = 'data/final_test_data.csv'

# Write both frames; the row index is stored under the column name 'Number'
for frame, destination in ((train_data, final_train_set), (test_data, final_test_set)):
    frame.to_csv(destination, index_label='Number')

In [None]:
def compare_features(data_set, eda_data, eda2_data):
    """Print a side-by-side table of the columns at three pipeline stages.

    Each table cell shows ``<column name>: <number of distinct values>``.
    The three frames may have different numbers of columns; shorter lists are
    padded with empty cells so every column of every frame is shown.

    Args:
        data_set: Frame shown in the "Features Before" column.
        eda_data: Frame shown in the "Features After Preprocessing" column.
        eda2_data: Frame shown in the "Features After Encoding" column.
    """

    def unique_count_cells(frame):
        # One "<name>: <distinct count>" label per column, in column order
        return [f'{name}: {len(set(frame[name]))}' for name in frame.columns.tolist()]

    before_cells = unique_count_cells(data_set)
    preprocessed_cells = unique_count_cells(eda_data)
    encoded_cells = unique_count_cells(eda2_data)

    print('Comparison of Before and After Pre-Processing')

    att_table = Table()
    att_table.add_column(f'Features Before: (Total = {len(before_cells)})')
    att_table.add_column(f'Features After Preprocessing: (Total = {len(preprocessed_cells)})')
    att_table.add_column(f'Features After Encoding: (Total = {len(encoded_cells)})')

    # Walk down to the longest list, leaving a blank cell wherever a shorter
    # list has run out. This never indexes past the end of any list.
    table_columns = (before_cells, preprocessed_cells, encoded_cells)
    for row in range(max(len(cells) for cells in table_columns)):
        att_table.add_row(
            *(cells[row] if row < len(cells) else '' for cells in table_columns)
        )

    console.print(att_table)

# Reload the 'laptop_data_train.csv' frame (presumably the raw dataset — TODO
# confirm) and the EDA training frame, both without the 'Number' column
eda2_data = pd.read_csv('data/laptop_data_train.csv', header=0).drop(columns=['Number'])
eda_data = pd.read_csv(train_set, header=0).drop(columns=['Number'])

# Table columns: before -> after preprocessing -> after encoding
compare_features(eda2_data, eda_data, train_data)
# compare_features(test_data, eda_test_data)

## Run the baseline models to get initial accuracy and loss values
This provides a baseline for the model's performance and allows us to verify whether the feature engineering that follows actually improves it.

### Trivial Model
• A system that always outputs the mean target value $\bar{y}$ of the training set

In [None]:
class TrivialModel:
    """Baseline regressor that always predicts the mean training label.

    The last column of the frame passed to the constructor is taken as the
    label column (rounded to 2 decimals); all other columns are kept as
    ``feature_data``.
    """

    def __init__(self, data):
        """Split ``data`` into features (all but last column) and labels (last column)."""
        self.feature_data = data.iloc[:, :-1].copy()
        self.labels = data.iloc[:, -1].copy().round(2)

    def fit(self):
        """Store the mean of the training labels, rounded to 2 decimals."""
        self.output = np.round(self.labels.mean(), 2)

    def predict(self, user_input):
        """Return the stored mean; ``user_input`` is ignored. Call ``fit`` first."""
        return self.output

    def RMSE(self, labels, pred):
        """Return the root mean squared error between ``labels`` and ``pred``."""
        return np.sqrt(mean_squared_error(labels, pred))

    def MAE(self, labels, pred):
        """Return the mean absolute error between ``labels`` and ``pred``."""
        return mean_absolute_error(labels, pred)

    def R2E(self, labels, pred):
        """Return the R-squared score between ``labels`` and ``pred``, rounded to 2 decimals."""
        # np.round works whether r2_score hands back a NumPy scalar or a plain
        # Python float (which has no .round() method).
        return np.round(r2_score(labels, pred), 2)


def trivial_system(train_data, test_data, split, random_state=None):
    """Fit the trivial (mean) baseline and print its train/validation/test metrics.

    The training frame is shuffled, the first ``split`` fraction of rows is
    used to fit a ``TrivialModel`` and the remaining rows form the validation
    set. RMSE, MAE and R-squared are printed as one table per data set.

    Args:
        train_data: Encoded training DataFrame; the last column is the label.
        test_data: Encoded test DataFrame; the last column is the label.
        split: Fraction (0-1) of the shuffled training rows used for fitting.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default keeps it random.
    """

    def predict_rows(frame):
        # The model ignores its input, but go through predict() row by row so
        # the evaluation stays valid for any model with the same interface.
        return [model.predict(row) for _, row in frame.iterrows()]

    def report(title, labels, predictions):
        # Print the title followed by a one-row table of the three metrics
        console.print(title)
        rmse = model.RMSE(labels, predictions)
        mae = model.MAE(labels, predictions)
        r2e = model.R2E(labels, predictions)

        att_table = Table()
        att_table.add_column('RMSE', style='blue')
        att_table.add_column('MAE', style='green')
        att_table.add_column('R-squared Error', style='red')
        att_table.add_row(str(rmse), str(mae), str(r2e))
        console.print(att_table)

    # Shuffle, then cut into train / validation parts. reset_index returns new
    # frames, so nothing is modified in place on a slice.
    shuffled = train_data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    cut = int(len(shuffled) * split)
    train_df = shuffled.iloc[:cut].reset_index(drop=True)
    val_df = shuffled.iloc[cut:].reset_index(drop=True)

    val_inputs = val_df.iloc[:, :-1]
    val_labels = val_df.iloc[:, -1].round(2)

    test_inputs = test_data.iloc[:, :-1]
    test_labels = test_data.iloc[:, -1].round(2)

    model = TrivialModel(train_df)
    model.fit()

    train_predictions = predict_rows(model.feature_data)

    # Read the constant from the model itself, so this line also works when
    # the training split is empty
    console.print(f'Trivial Model\'s Laptop Price Prediction: {model.output}\n\n')

    report('Performance on Training Set: \n', model.labels, train_predictions)
    report('Performance on Validation Set: \n', val_labels, predict_rows(val_inputs))
    report('Performance on Testing Set: \n', test_labels, predict_rows(test_inputs))


# Fractions of the shuffled training data used for fitting; the remaining
# rows of each shuffle form the validation set
training_splits = [0.85, 0.8, 0.7, 0.6, 0.5]

# Evaluate the trivial baseline once per training fraction
for split in training_splits:
    print(f'Training-Validation Split: {split * 100}%')
    trivial_system(train_data, test_data, split)

### 1-Nearest Neighbour Model

In [None]:
def RMSE(labels, pred):
    """Return the root mean squared error between ``labels`` and ``pred``."""
    return np.sqrt(mean_squared_error(labels, pred))

def MAE(labels, pred):
    """Return the mean absolute error between ``labels`` and ``pred``."""
    return mean_absolute_error(labels, pred)

def R2E(labels, pred):
    """Return the R-squared score between ``labels`` and ``pred``, rounded to 2 decimals."""
    # np.round works whether r2_score hands back a NumPy scalar or a plain
    # Python float (which has no .round() method).
    return np.round(r2_score(labels, pred), 2)

def nearest_neighbour_system(train_data, test_data, split, random_state=None):
    """Fit a 1-nearest-neighbour regressor and print train/validation/test metrics.

    The training frame is shuffled, the first ``split`` fraction of rows is
    used to fit a ``KNeighborsRegressor(n_neighbors=1)`` and the remaining
    rows form the validation set. RMSE, MAE and R-squared are printed as one
    table per data set.

    Args:
        train_data: Encoded training DataFrame; the last column is the label.
        test_data: Encoded test DataFrame; the last column is the label.
        split: Fraction (0-1) of the shuffled training rows used for fitting.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default keeps it random.
    """

    def report(title, labels, predictions):
        # Print the title followed by a one-row table of the three metrics
        console.print(title)
        rmse = RMSE(labels, predictions)
        mae = MAE(labels, predictions)
        r2e = R2E(labels, predictions)

        att_table = Table()
        att_table.add_column('RMSE', style='blue')
        att_table.add_column('MAE', style='green')
        att_table.add_column('R-squared Error', style='red')
        att_table.add_row(str(rmse), str(mae), str(r2e))
        console.print(att_table)

    # Shuffle, then cut into train / validation parts. reset_index returns new
    # frames, so nothing is modified in place on a slice.
    shuffled = train_data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    cut = int(len(shuffled) * split)
    train_df = shuffled.iloc[:cut].reset_index(drop=True)
    val_df = shuffled.iloc[cut:].reset_index(drop=True)

    train_inputs = train_df.iloc[:, :-1]
    train_labels = train_df.iloc[:, -1]

    val_inputs = val_df.iloc[:, :-1]
    val_labels = val_df.iloc[:, -1].round(2)

    test_inputs = test_data.iloc[:, :-1]
    test_labels = test_data.iloc[:, -1].round(2)

    # scikit-learn rejects a frame whose feature names are ordered differently
    # from the one used in fit, so line the test features up with the training
    # features when both hold the same columns.
    if set(test_inputs.columns) == set(train_inputs.columns):
        test_inputs = test_inputs[train_inputs.columns]

    # Initialize and train the 1-nearest-neighbour regressor
    model = KNeighborsRegressor(n_neighbors=1)
    model.fit(train_inputs, train_labels)

    train_predictions = model.predict(train_inputs)
    val_predictions = model.predict(val_inputs)
    predictions = model.predict(test_inputs)

    console.print(f'1-Nearest Neighbour Model\'s Laptop Price Prediction Performance:\n\n')

    report('Performance on Training Set: \n', train_labels, train_predictions)
    report('Performance on Validation Set: \n', val_labels, val_predictions)
    report('Performance on Testing Set: \n', test_labels, predictions)


# Fractions of the shuffled training data used for fitting; the remaining
# rows of each shuffle form the validation set
training_splits = [0.85, 0.8, 0.7, 0.6, 0.5]

# Evaluate the 1-nearest-neighbour baseline once per training fraction
for split in training_splits:
    print(f'Training-Validation Split: {split * 100}%')
    nearest_neighbour_system(train_data, test_data, split)

Feature names must be in the same order as they were in fit.



Feature names must be in the same order as they were in fit.



Feature names must be in the same order as they were in fit.



Feature names must be in the same order as they were in fit.



Feature names must be in the same order as they were in fit.



### Linear Regression Model

In [None]:
def linear_regression_system(train_data, test_data):
    """Cross-validate a linear regression and report its test-set performance.

    Runs 5-fold cross-validation (shuffled, ``random_state=42``) on the
    training frame, printing RMSE, MAE and R-squared for the training and
    validation part of every fold, followed by the mean validation score
    (``LinearRegression.score``) over the folds. A final model is then fitted
    on the complete training frame and evaluated on the test frame, instead of
    reusing whichever model the last fold happened to train.

    Args:
        train_data: Encoded training DataFrame; the last column is the label.
        test_data: Encoded test DataFrame; the last column is the label.
    """

    def report(title, labels, predictions):
        # Print the title followed by a one-row table of the three metrics
        console.print(title)
        rmse = RMSE(labels, predictions)
        mae = MAE(labels, predictions)
        r2e = R2E(labels, predictions)

        att_table = Table()
        att_table.add_column('RMSE', style='blue')
        att_table.add_column('MAE', style='green')
        att_table.add_column('R-squared Error', style='red')
        att_table.add_row(str(rmse), str(mae), str(r2e))
        console.print(att_table)

    # Number of folds for cross-validation
    k = 5
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Validation score of each fold
    scores = []

    for train_index, val_index in kf.split(train_data):
        # reset_index returns new frames; nothing is modified in place on a slice
        train_df = train_data.iloc[train_index].reset_index(drop=True)
        val_df = train_data.iloc[val_index].reset_index(drop=True)

        train_inputs = train_df.iloc[:, :-1]
        train_labels = train_df.iloc[:, -1]

        val_inputs = val_df.iloc[:, :-1]
        val_labels = val_df.iloc[:, -1].round(2)

        model = LinearRegression()
        model.fit(train_inputs, train_labels)

        train_predictions = model.predict(train_inputs)
        val_predictions = model.predict(val_inputs)

        console.print(f'Linear Regression Model\'s Laptop Price Prediction Performance:\n\n')
        report('Performance on Training Set: \n', train_labels, train_predictions)
        report('Performance on Validation Set: \n', val_labels, val_predictions)

        scores.append(model.score(val_inputs, val_labels))

    average_score = np.mean(scores)
    print("Average score:", average_score)

    # Cross-validation only estimates generalisation; the model that is scored
    # on the held-out test set is trained on all of the training data.
    full_inputs = train_data.iloc[:, :-1]
    full_labels = train_data.iloc[:, -1]
    final_model = LinearRegression()
    final_model.fit(full_inputs, full_labels)

    test_inputs = test_data.iloc[:, :-1]
    test_labels = test_data.iloc[:, -1].round(2)

    # scikit-learn rejects a frame whose feature names are ordered differently
    # from the one used in fit, so line the test features up with the training
    # features when both hold the same columns.
    if set(test_inputs.columns) == set(full_inputs.columns):
        test_inputs = test_inputs[full_inputs.columns]

    predictions = final_model.predict(test_inputs)

    report('Performance on Testing Set: \n', test_labels, predictions)


# training_splits = [0.85, 0.8, 0.7, 0.6, 0.5]

# for split in training_splits:
#     print(f'Training-Validation Split: {split * 100}%')
# Cross-validate the linear regression baseline on the encoded training data
# and report its metrics on the test set
linear_regression_system(train_data, test_data)

Feature names must be in the same order as they were in fit.



### Baseline Models
• 1NN 
• Linear Regression (no regularization)

### Loss Functions
• Root Mean Squared Error (RMSE) 
• Mean Absolute Error (MAE) 
• R-squared (R2) 

## Normalization of features
Normalize the data (especially the numerical features) if doing so improves performance.

## Feature Importance and Selection
List of techniques to test: `Pearson Correlation, Sequential Feature Selection, PCA, ICA, UFS`

Test the processed dataset to compare the performance of the new model with the initial values. Iterate over different Normalization and feature selection choices until a final model is selected.

## Model 1: Ridge Regression

In [None]:
# Import necessary libraries
# NOTE: the unused `load_boston` import was dropped; that loader has been
# removed from scikit-learn and importing it raises ImportError.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Features are every column but the last; the last column is the target.
# Test columns are selected by name so they match the training feature order.
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data[X_train.columns]
y_test = test_data[y_train.name]

# Instantiate the Ridge regressor
alpha = 1.0  # Regularization strength, you can adjust this value
ridge_model = Ridge(alpha=alpha)

# Train the model
ridge_model.fit(X_train, y_train)

# Predict on the test set
y_pred = ridge_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

NameError: name 'X_train' is not defined

## Model 2: SVR

In [None]:
# Import necessary libraries
# NOTE: the unused `load_boston` import was dropped; that loader has been
# removed from scikit-learn and importing it raises ImportError.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Features are every column but the last; the last column is the target.
# Test columns are selected by name so they match the training feature order.
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data[X_train.columns]
y_test = test_data[y_train.name]

# Instantiate the SVR model
svr_model = SVR(kernel='linear')  # You can choose different kernels like 'rbf', 'poly', etc.

# Train the model
svr_model.fit(X_train, y_train)

# Predict on the test set
y_pred = svr_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

## Model 3: RBF Neural Network

In [None]:
# Import necessary libraries
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import Ridge

# Features are every column but the last; the last column is the target.
# Test columns are selected by name so they match the training feature order.
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data[X_train.columns]
y_test = test_data[y_train.name]

# Create an RBF regression pipeline.
# The scaler runs BEFORE the RBF feature map: the approximated kernel depends
# on distances between samples, so features on very different scales must be
# standardised first for a fixed gamma to be meaningful.
rbf_features = RBFSampler(gamma=1, random_state=42)
scaler = StandardScaler()
ridge = Ridge(alpha=1.0)
rbf_regressor = make_pipeline(scaler, rbf_features, ridge)

# Train the RBF regression model
rbf_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = rbf_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9992d859-d1f8-4007-afdd-e66541c42881' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>