In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import random

# Regression
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn import svm
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import *
#from sklearn.neighbors import KNeighborsClassifier
from pandas import Series

from utilities.losses import compute_loss
from utilities.optimizers import gradient_descent, pso, mini_batch_gradient_descent
from sklearn.model_selection import train_test_split

# General settings
from utilities.visualization import visualize_train, visualize_test

ModuleNotFoundError: No module named 'utilities'

In [None]:
seed = 309
# Freeze the random seed
random.seed(seed)
np.random.seed(seed)
train_test_split_test_size = 0.3

# Training settings
alpha = 0.1  # step size
max_iters = 50  # max iterations


In [3]:
# Load Data from CSV
# :return: df    a panda data frame
def load_data():
    df = pd.read_csv("../data/Part 1 - regression/diamonds.csv")
    return df
#load_data()

In [4]:
def data_preprocess(data):
    """
    Data preprocess:
        1. Split the entire dataset into train and test
        2. Split outputs and inputs
        3. Standardize train and test
        4. Add intercept dummy for computation convenience
    :param data: the given dataset (format: panda DataFrame)
    :return: train_data       train data contains only inputs
             train_labels     train data contains only labels
             test_data        test data contains only inputs
             test_labels      test data contains only labels
             train_data_full       train data (full) contains both inputs and labels
             test_data_full       test data (full) contains both inputs and labels
    """
    # drop the index attributes
    data = data.drop(data.columns[0], axis=1)
    
    # replace cut column attribute 'Fair','Good','Very Good','Ideal','Premium' to 1 2 3 4 5
    data=data.replace(['Fair','Good','Very Good','Ideal','Premium'],[1,2,3,4,5]);
    # replace colour column attribute 'D','E','F','G','H','I','J' to 7,6,5,4,3,2,1
    data=data.replace(['D','E','F','G','H','I','J'],[7,6,5,4,3,2,1]);
    # replace clarity column attribute 'Fair','Good','Very Good','Ideal','Premium' to 1 2 3 4 5
    data=data.replace(['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF'],[1,2,3,4,5,6,7,8]);
    
    #print(data)
    
    # Split the data into train and test
   
    train_data, test_data = train_test_split(data, test_size = train_test_split_test_size)
    
    # Pre-process data (both train and test)
    train_data_full = train_data.copy()
    train_data = train_data.drop(["price"], axis = 1)
    train_labels = train_data_full["price"]

    test_data_full = test_data.copy()
    test_data = test_data.drop(["price"], axis = 1)
    test_labels = test_data_full["price"]

    #Standardize the inputs
    
    train_mean = train_data.mean()
    train_std = train_data.std()
    train_data = (train_data - train_mean) / train_std
    test_data = (test_data - train_mean) / train_std
    
    #print(train_data)

    return train_data, train_labels, test_data, test_labels, train_data_full, test_data_full


In [5]:
if __name__ == '__main__':
    # Settings
    # Step 1: Load Data
    data = load_data()

    # Step 2: Preprocess the data
    train_data, train_labels, test_data, test_labels, train_data_full, test_data_full = data_preprocess(data)
    
    # Step 3: Learning Start
    
    start_time = datetime.datetime.now()  # Track learning starting time
    print(train_labels.dtypes)
    #Regression
    baseline = LinearRegression()
#     baseline = KNeighborsRegressor(n_neighbors=11)
#     baseline = Ridge();
#     baseline = DecisionTreeRegressor(max_depth=9)
#     baseline = RandomForestRegressor(max_depth=9, random_state=0,n_estimators=500)
#     baseline = GradientBoostingRegressor(n_estimators= 500, max_depth= 9, min_samples_split= 2,learning_rate= 0.15, loss= 'ls')
#     baseline = SGDRegressor()
#     baseline = SVR()
#     baseline = LinearSVR()
#     baseline = MLPRegressor(learning_rate_init=0.2)

    baseline.fit(train_data,train_labels)
    
    #Prediction
    y_pred = baseline.predict(test_data)
    print(y_pred)
    
    end_time = datetime.datetime.now()  # Track learning ending time
    exection_time = (end_time - start_time).total_seconds()  # Track execution time

#   # Step 4: Results presentation
    print("Learn: execution time={t:.3f} seconds".format(t = exection_time))

    print("R2: {:.2f}".format(baseline.score(test_data,test_labels)))  # R2 should be maximize
    mse = mean_squared_error(test_labels, y_pred)
    print("MSE: {:.2f}".format(mse))
    print("RMSE: {:.2f}".format(np.sqrt(mse)))
    print("MAE: {:.2f}".format(mean_absolute_error(test_labels,y_pred)))


int64
[1662.52478923 1362.88105507 4160.22622514 ... 3345.71384124 3606.46924557
 1073.31759633]
Learn: execution time=78.684 seconds
R2: 0.50
MSE: 8164863.24
RMSE: 2857.42
MAE: 1356.06
