# Imports and Instantiation

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

import matplotlib.pyplot as plt
import re
from rich import print
from rich.table import Table
from rich.console import Console
import seaborn as sns
from tabulate import tabulate

console = Console()

# Loading the Data



In [None]:
# Locations of the prepared train / test CSV files
train_set = "data/final_train_data.csv"
test_set = "data/final_test_data.csv"

# Load the training data (first row is the header) and discard the
# 'Number' column, which is not used for modelling
train_data = pd.read_csv(train_set, header=0).drop(columns='Number')

# Column names of the training frame, in their original order
attributes = train_data.columns.tolist()

# Load the test data the same way, then select its columns in the training
# order so both frames share an identical layout
test_data = pd.read_csv(test_set, header=0).drop(columns='Number')[attributes]

In [None]:
def standardize_features(data, features_to_standardize, scaler=None):
    """Return a copy of `data` with the given columns standardized.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns in `features_to_standardize`.
    features_to_standardize : list of str
        Names of the columns to scale; every other column is left untouched.
    scaler : fitted transformer, optional
        An already-fitted object exposing `transform` (e.g. a StandardScaler
        fitted on the training data). When omitted, a new StandardScaler is
        fitted on `data` itself, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame; `data` is not modified.
    """
    scaled_data = data.copy()
    columns = list(features_to_standardize)

    # Nothing to scale: StandardScaler rejects a frame with zero columns.
    if not columns:
        return scaled_data

    if scaler is None:
        scaled_values = StandardScaler().fit_transform(data[columns])
    else:
        scaled_values = scaler.transform(data[columns])

    # Replace the columns outright instead of writing through `.loc[:, cols]`:
    # an in-place write of floats into integer columns triggers pandas'
    # incompatible-dtype deprecation.
    scaled_data[columns] = scaled_values

    return scaled_data


# Every column except the one-hot indicators and the 'Price' target is scaled
features_to_standardize = [
    column_name
    for column_name in train_data.columns
    if 'one_hot' not in column_name and column_name != 'Price'
]

# Fit the scaling statistics on the training data only and reuse them for the
# test data, so both frames are on the same scale and no test-set statistics
# leak into preprocessing.
feature_scaler = StandardScaler().fit(train_data[features_to_standardize])

standardized_train_data = standardize_features(train_data, features_to_standardize, scaler=feature_scaler)
standardized_test_data = standardize_features(test_data, features_to_standardize, scaler=feature_scaler)

In [None]:
# Build a Styler over the training data: centre-aligned cell text and
# 11pt font for the header cells.
# NOTE(review): this styles the whole training frame rather than a preview;
# confirm that rendering every row is intended.
styled_df = (
    train_data.style
    .set_properties(**{'text-align': 'center'})
    .set_table_styles([{'selector': 'th', 'props': [('font-size', '11pt')]}])
)

# Last expression of the cell, so the notebook renders the styled table
styled_df

Unnamed: 0,one_hot_Company_Acer,one_hot_Company_Apple,one_hot_Company_Asus,one_hot_Company_Chuwi,one_hot_Company_Dell,one_hot_Company_Fujitsu,one_hot_Company_Google,one_hot_Company_HP,one_hot_Company_Huawei,one_hot_Company_LG,one_hot_Company_Lenovo,one_hot_Company_MSI,one_hot_Company_Mediacom,one_hot_Company_Microsoft,one_hot_Company_Razer,one_hot_Company_Samsung,one_hot_Company_Toshiba,one_hot_Company_Vero,one_hot_Company_Xiaomi,one_hot_TypeName_2 in 1 Convertible,one_hot_TypeName_Gaming,one_hot_TypeName_Netbook,one_hot_TypeName_Notebook,one_hot_TypeName_Ultrabook,one_hot_TypeName_Workstation,Inches,Display,Touchscreen,ScreenArea,CpuModel,ClockSpeed,Ram,MemoryComponent1,MemoryComponent2,Gpu,one_hot_OpSys_Android,one_hot_OpSys_Chrome OS,one_hot_OpSys_Linux,one_hot_OpSys_Mac OS X,one_hot_OpSys_No OS,one_hot_OpSys_Windows 10,one_hot_OpSys_Windows 10 S,one_hot_OpSys_Windows 7,one_hot_OpSys_macOS,Weight,Price
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,13.3,8,0,4096000,61,2.3,8,14,0,70,0,0,0,0,0,0,0,0,1,1.37,71378.6832
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,13.3,1,0,1296000,61,1.8,8,12,0,43,0,0,0,0,0,0,0,0,1,1.34,47895.5232
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,15.6,3,0,2073600,43,2.5,8,17,0,45,0,0,0,0,1,0,0,0,0,1.86,30636.0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,15.4,8,0,5184000,75,2.7,16,19,0,81,0,0,0,0,0,0,0,0,1,1.83,135195.336
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,13.3,8,0,4096000,61,3.1,8,17,0,72,0,0,0,0,0,0,0,0,1,1.37,96095.808
5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,15.6,1,0,1049088,25,3.0,4,9,0,14,0,0,0,0,0,1,0,0,0,2.1,21312.0
6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,15.4,8,0,5184000,75,2.2,16,18,0,76,0,0,0,1,0,0,0,0,0,2.04,114017.6016
7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,13.3,1,0,1296000,61,1.8,8,18,0,43,0,0,0,0,0,0,0,0,1,1.34,61735.536
8,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,14.0,3,0,2073600,52,1.8,16,19,0,41,0,0,0,0,0,1,0,0,0,1.3,79653.6
9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,14.0,5,0,2073600,45,1.6,8,17,0,51,0,0,0,0,0,1,0,0,0,1.6,41025.6


In [None]:
def sequential_feature_selection(train_data, num_features):
    """Choose `num_features` input columns with sequential feature selection.

    The last column of `train_data` is treated as the target and all other
    columns as candidate inputs. Candidates are scored with a 1-nearest-
    neighbour regressor using negative mean squared error under 3-fold
    cross-validation.

    Returns
    -------
    list
        Names of the selected input columns followed by the name of the
        target (last) column.
    """
    inputs = train_data.iloc[:, :-1]
    target = train_data.iloc[:, -1]

    selector = SequentialFeatureSelector(
        KNeighborsRegressor(n_neighbors=1),
        n_features_to_select=num_features,
        scoring='neg_mean_squared_error',
        cv=3,
    )
    selector.fit(inputs, target)

    # Support indices are positions among the inputs; because the target is
    # the last column, they line up with positions in the full column index.
    chosen_columns = train_data.columns[selector.get_support(indices=True)]

    # Keep the target column alongside the chosen inputs
    return [*chosen_columns, train_data.columns[-1]]

In [None]:
def ufs_feature_selection(train_data, n_features):
    """Choose the `n_features` best input columns by univariate scoring.

    The last column of `train_data` is treated as the target and all other
    columns as candidate inputs, which are ranked with `f_regression` through
    `SelectKBest`.

    Returns
    -------
    list
        Names of the selected input columns followed by the name of the
        target (last) column.
    """
    inputs = train_data.iloc[:, :-1]
    target = train_data.iloc[:, -1]

    k_best = SelectKBest(score_func=f_regression, k=n_features)
    k_best.fit(inputs, target)

    # Support indices are positions among the inputs; because the target is
    # the last column, they line up with positions in the full column index.
    chosen_columns = train_data.columns[k_best.get_support(indices=True)]

    # Keep the target column alongside the chosen inputs
    return [*chosen_columns, train_data.columns[-1]]

# Error Calculation

In [None]:
def RMSE(labels, pred):
    """Return the root mean squared error of `pred` against `labels`, rounded to 4 decimals."""
    # np.sqrt yields a NumPy scalar, so `.round` is available on the result
    return np.sqrt(mean_squared_error(labels, pred)).round(4)

def MAE(labels, pred):
    """Return the mean absolute error of `pred` against `labels`, rounded to 4 decimals."""
    # np.round accepts both NumPy scalars and built-in floats; calling
    # `.round(4)` on the metric directly fails when scikit-learn returns a
    # plain Python float, which has no such method.
    return np.round(mean_absolute_error(labels, pred), 4)

def R2E(labels, pred):
    """Return the R-squared score of `pred` against `labels`, rounded to 4 decimals."""
    # np.round accepts both NumPy scalars and built-in floats; calling
    # `.round(4)` on the metric directly fails when scikit-learn returns a
    # plain Python float, which has no such method.
    return np.round(r2_score(labels, pred), 4)

### Linear Regression Model

In [None]:
def _regression_metrics(labels, predictions):
    """Return the (RMSE, MAE, R-squared) triple for one set of predictions."""
    return (
        np.sqrt(mean_squared_error(labels, predictions)),
        mean_absolute_error(labels, predictions),
        r2_score(labels, predictions),
    )


def _print_metrics_table(fold_metrics):
    """Print the mean and standard deviation of per-fold metric triples as a rich table."""
    table = Table(title="Error Metrics", show_header=True, header_style="bold magenta")
    for heading in ("Metric", "Mean", "Std Dev"):
        table.add_column(heading, style="cyan", justify="center")

    # Transpose [(rmse, mae, r2), ...] into one sequence of fold values per metric
    metric_names = ("RMSE", "MAE", "R-squared Error")
    for metric_name, fold_values in zip(metric_names, zip(*fold_metrics)):
        table.add_row(metric_name, f"{np.mean(fold_values):.4f}", f"{np.std(fold_values):.4f}")

    console.print(table)


def linear_regression_system(train_data, test_data, n_splits=5, tolerance=5000, random_state=42):
    """Cross-validate a linear regression model and print summary error tables.

    The last column of `train_data` is the target; all other columns are
    inputs. For each shuffled K-fold split a LinearRegression model is fitted
    on the training part and scored on both the training and validation
    parts. Validation predictions falling outside the fold's training-label
    range (widened by `tolerance` on each side) are replaced by the mean
    training label before scoring.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Data to cross-validate on; target in the last column.
    test_data : pandas.DataFrame
        Currently unused; accepted so existing callers keep working.
    n_splits : int, default 5
        Number of cross-validation folds.
    tolerance : float, default 5000
        Margin added below the minimum and above the maximum training label
        when deciding whether a validation prediction is out of range.
    random_state : int, default 42
        Seed for the shuffled fold assignment.

    Returns
    -------
    None
        Results are printed through the module-level rich `console`.
    """
    # One (rmse, mae, r2) triple per fold
    train_metrics, val_metrics = [], []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train_index, val_index in kf.split(train_data):
        train_df = train_data.iloc[train_index]
        val_df = train_data.iloc[val_index]

        train_inputs, train_labels = train_df.iloc[:, :-1], train_df.iloc[:, -1]
        val_inputs = val_df.iloc[:, :-1]
        # Validation targets are rounded to 2 decimals before scoring, as in
        # the original evaluation, so reported numbers stay comparable.
        val_labels = val_df.iloc[:, -1].round(2)

        model = LinearRegression()
        model.fit(train_inputs, train_labels)

        train_predictions = model.predict(train_inputs)
        val_predictions = model.predict(val_inputs)

        # Plausible prediction range: training-label range widened by `tolerance`
        lower_bound = train_labels.min() - tolerance
        upper_bound = train_labels.max() + tolerance

        # Replace implausible validation predictions with the mean training label
        out_of_range = (val_predictions < lower_bound) | (val_predictions > upper_bound)
        val_predictions = np.where(out_of_range, train_labels.mean(), val_predictions)

        train_metrics.append(_regression_metrics(train_labels, train_predictions))
        val_metrics.append(_regression_metrics(val_labels, val_predictions))

    # Summarize the per-fold metrics for both partitions
    console.print(f"[blue]Mean and Standard Deviation of Error Metrics over {n_splits} Folds:[/blue]\n")
    console.print("[blue]Training Set:[/blue]")
    _print_metrics_table(train_metrics)

    console.print("\n[blue]Validation Set:[/blue]")
    _print_metrics_table(val_metrics)

# Baseline run: cross-validate on the loaded data without standardization
linear_regression_system(train_data, test_data)

In [None]:
# Repeat the cross-validated evaluation on the standardized copies of the data
linear_regression_system(standardized_train_data, standardized_test_data)

In [None]:
# Keep 40 input columns picked by sequential feature selection (the helper
# also appends the target column), then evaluate on that reduced column set.
# NOTE(review): selection is fitted on the full training frame before the
# K-fold split inside linear_regression_system, so validation rows influence
# which features are kept — confirm this is acceptable.
selected_features = sequential_feature_selection(standardized_train_data, 40)
train_data_selected = standardized_train_data[selected_features].copy()
test_data_selected = standardized_test_data[selected_features].copy()
linear_regression_system(train_data_selected, test_data_selected)

In [None]:
# Keep the 28 input columns ranked best by univariate selection (the helper
# also appends the target column), then evaluate on that reduced column set.
# NOTE(review): selection is fitted on the full training frame before the
# K-fold split inside linear_regression_system, so validation rows influence
# which features are kept — confirm this is acceptable.
selected_features = ufs_feature_selection(standardized_train_data, 28)
train_data_selected = standardized_train_data[selected_features].copy()
test_data_selected = standardized_test_data[selected_features].copy()
linear_regression_system(train_data_selected, test_data_selected)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9992d859-d1f8-4007-afdd-e66541c42881' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>