In [1]:
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Data Loading

In [2]:
training_data = np.load('./data/training_data.npy')
prices = np.load('data/prices.npy')

# Data Shuffling

In [3]:
training_data, prices = shuffle(training_data, prices, random_state=0)
training_data[0]

array([2.0170e+03, 1.8351e+04, 2.1900e+01, 6.2400e+02, 3.7480e+01,
       4.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00], dtype=float32)

# Normalization

In [4]:
def normalization(training_data, testing_data):
    std_training = np.std(training_data, axis=0)
    # fuel_type and transimission shouldn't be normalized
    std_training[7:] = 1
    mean = np.mean(training_data)
    new_training_data = np.divide(training_data - mean, std_training)

    new_testing_data = np.divide(testing_data - mean, std_training)
    
    return new_training_data, new_testing_data

# Model

In [5]:
class Model:
    def __init__(self, model):
        self.model = model
    
    def fit(self, train, test):
        self.model.fit(train, test)
        
    def predict(self, test):
        return self.model.predict(test)
    
    def accuracy(self, true, predict):
        return mean_squared_error(true, predict), mean_absolute_error(true, predict)
    
    def tune(self, train, test):
        param_grid = {
            'alpha': [1, 10, 100, 1000]
        }
        
        grid_search = GridSearchCV(estimator=self.model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error')
        grid_search.fit(train, test)
        
        return grid_search.best_params_['alpha']
    
    def getCoefficients(self):
        return self.model.coef_
    
    def getBias(self):
        return self.model.intercept_
    
    def mostSignificantFeature(self):
        features = [
            'Year', 'Killometers_Driven', 'Mileage',
            'Engine', 'Power', 'Seats', 'Owner_Type',
            'FuelType', 'FuelType', 'FuelType', 'FuelType', 'FuelType',
            'Transmission', 'Transmission'
        ]
        coefficients = np.abs(self.getCoefficients())
        
        most_significant_index = np.argmax(coefficients)
        
        most_significant = features[most_significant_index]
        
        return most_significant

    def leastSignificantFeature(self):
        features = [
            'Year', 'Killometers_Driven', 'Mileage',
            'Engine', 'Power', 'Seats', 'Owner_Type',
            'FuelType', 'FuelType', 'FuelType', 'FuelType', 'FuelType',
            'Transmission', 'Transmission'
        ]
        coefficients = np.abs(self.getCoefficients())

        least_significant_index = np.argmin(coefficients)

        least_significant = features[least_significant_index]

        return least_significant

# Split 
- Method: KFold with 3 splits
- Model: LinearRegression

In [6]:
def train_test_model(model_type):
    kf = KFold(n_splits=3)
    model = Model(model_type)
    accuracy = []
    alpha = []
    
    for train_index, test_index in kf.split(training_data):
        X_train, X_test = training_data[train_index], training_data[test_index]
        Y_train, Y_test = prices[train_index], prices[test_index] 
        
        X_train, X_test = normalization(X_train, X_test)
        
        model.fit(X_train, Y_train)
        accuracy.append([*model.accuracy(Y_test, model.predict(X_test))])
    
        if isinstance(model.model, Ridge):
            alpha.append(model.tune(X_train, Y_train))
            
    if isinstance(model.model, Ridge):
        print(f"BEST ALPHA: {np.mean(np.array(alpha))}")
        
    return np.array(accuracy)

In [7]:
# Linear Regression
mean_mse, mean_mae = np.mean(train_test_model(LinearRegression()), axis=0)
print(f"LINEAR\n\tAVERAGE MSE: {mean_mse:.2f}\n\tAVERAGE MAE: {mean_mae:.2f}", end="\n\n")

# Ridge Regression
mean_mse, mean_mae = np.mean(train_test_model(Ridge()), axis=0)
print(f"RIDGE\n\tAVERAGE MSE: {mean_mse:.2f}\n\tAVERAGE MAE: {mean_mae:.2f}", end="\n\n")

# Lasso Regression
mean_mse, mean_mae = np.mean(train_test_model(Lasso()), axis=0)
print(f"LASSO\n\tAVERAGE MSE: {mean_mse:.2f}\n\tAVERAGE MAE: {mean_mae:.2f}", end="\n\n")

LINEAR
	AVERAGE MSE: 3.17
	AVERAGE MAE: 1.32

BEST ALPHA: 100.0
RIDGE
	AVERAGE MSE: 3.17
	AVERAGE MAE: 1.32

LASSO
	AVERAGE MSE: 6.22
	AVERAGE MAE: 1.92


# Coefficient and Bias

In [8]:
model = Model(Ridge(alpha=100))
model.fit(training_data, prices)

print(f"COFFICIENTS: {model.getCoefficients()}\nBIAS: {model.getBias()}")

COFFICIENTS: [ 5.15551627e-01 -3.74514866e-06 -1.01766974e-01  9.83963604e-04
  3.80781442e-02  1.65120900e-01 -1.68956667e-01  0.00000000e+00
  6.68236494e-01 -6.68236554e-01  0.00000000e+00  0.00000000e+00
 -5.44795752e-01  5.44793844e-01]
BIAS: -1035.5355224609375


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


# Most and Least Significant Feature

In [9]:
most_significant = model.mostSignificantFeature()
least_significant = model.leastSignificantFeature()

print(f"MOST: {most_significant}\nLEAST: {least_significant}")

MOST: FuelType
LEAST: FuelType
