In [1]:
# MO444 2s/2017 - First assignment
#
#         Group 05
#
# - Anderson Rossanez (124136)
# - Bruno Branta Lopes (31470)
#

# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Functions
def load_data(filename):
    raw_data = open(filename, 'rt')
    data = np.loadtxt(raw_data, delimiter=",")
    return data

def split_Y_X(dataset):
    Y = dataset[:,0]
    X = dataset[:,1:]
    return Y, X

In [2]:
# load training data
data = load_data('year-prediction-msd-train.txt')

print('Training data size: {}'.format(len(data)))

Training data size: 463715


In [3]:
# Take a look at the data (5 first instances):
# - first column (index 0) is the song release year
# - remaining columns are the the features (indexes 1 - 90)
data_frame = pd.DataFrame(data)
data_frame.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,2004,46.50128,-18.37096,58.89824,27.28181,1.93097,-2.43167,18.64843,-5.62769,-1.47711,...,43.94716,-64.21148,13.04284,156.03785,-2.4267,51.71977,-43.56703,10.63735,24.08962,-21.41886
1,2005,41.46015,-52.3014,-4.59825,-19.28084,-11.85844,-19.54192,1.30306,-1.83185,5.98469,...,-13.48435,-154.65715,35.22429,89.53649,-3.13145,-128.2812,89.97277,-15.88139,-75.21074,-0.51139
2,2009,42.18667,-85.65863,-7.91506,-11.84193,-12.61959,-21.66749,4.97883,-11.8675,9.87342,...,42.08584,-181.77349,43.00181,87.94977,-13.70362,136.49979,140.32633,11.51422,382.79589,29.98269
3,1987,46.90244,19.86314,10.92119,4.87136,-41.17499,-19.84156,2.93308,-5.98711,3.05997,...,-2.76407,-20.31782,-75.24506,125.81801,-15.50828,-63.31002,-142.21937,-12.36699,32.45911,-17.14909
4,2009,46.64388,-81.99503,41.62851,3.65855,-9.38201,-13.51749,7.48781,-7.03302,7.06982,...,12.29244,-143.25348,91.15842,-227.85481,7.75916,-41.32376,-225.66526,-4.05081,455.39458,41.6531


In [4]:
# split train data into training/validation (80/20)
train_data, validation_data = train_test_split(data, test_size=0.2)

print('Train data size: {}'.format(len(train_data)))
print('Validation data size: {}'.format(len(validation_data)))

Train data size: 370972
Validation data size: 92743


In [5]:
# load test data
test_data = load_data('year-prediction-msd-test.txt')

print('Test data size: {}'.format(len(test_data)))

Test data size: 36285


In [6]:
# Split label (years) and features
train_data_Y, train_data_X = split_Y_X(train_data)
validation_data_Y, validation_data_X = split_Y_X(validation_data)
test_data_Y, test_data_X = split_Y_X(test_data)

In [7]:
# Create a baseline linear regression model and train it
lr_base_model = LinearRegression()
lr_base_model.fit(train_data_X, train_data_Y)

print('Number of coefficients: {}'.format(len(lr_base_model.coef_)))
print('Coefficients: {}'.format(lr_base_model.coef_))

Number of coefficients: 90
Coefficients: [  8.69173858e-01  -5.57296220e-02  -4.36559430e-02   4.52886772e-03
  -1.50833969e-02  -2.19801946e-01  -5.61995222e-03  -9.97464049e-02
  -6.83511618e-02   2.41968169e-02  -1.68286781e-01  -2.30332728e-03
   4.82442134e-02   3.51301142e-04  -4.31797823e-04   6.13460795e-04
   4.29723747e-04   1.39298428e-03   1.99278197e-03   2.16509995e-03
   6.97093760e-04  -4.73229057e-04   7.61441798e-03   2.56612448e-03
  -3.60206692e-03   3.41194493e-05   1.62588512e-03   4.03100997e-04
   9.00942807e-04  -2.15254470e-04  -1.20194456e-03  -1.47238318e-03
  -5.62595703e-03   2.53637807e-03   1.93672497e-03  -5.16795969e-03
  -2.49886475e-04   6.74450770e-04   1.47398952e-03  -1.77690581e-03
  -1.96442504e-03  -7.92792336e-04  -1.33163673e-03  -1.95691027e-03
  -3.22577161e-03   6.48437263e-03   4.59190380e-04  -2.01136901e-03
   2.19206721e-04   2.04229284e-03   2.29082764e-04  -1.90719584e-03
   1.94414419e-03  -1.25604309e-04  -2.79413664e-04   1.960665

In [8]:
# A quick comparison on some instances of the validation data with the predicted values...
print('Actual: {}'.format(validation_data_Y[0:5]))
print('Predicted: {}'.format(lr_base_model.predict(validation_data_X)[0:5]))

# ... and the errors
mean_sq_error = mean_squared_error(validation_data_Y, lr_base_model.predict(validation_data_X))
print('Mean squared error: {}'.format(mean_sq_error))
print('Root mean squared error: {}'.format(sqrt(mean_sq_error)))

Actual: [ 2008.  2006.  1995.  2001.  2005.]
Predicted: [ 2007.21805379  2001.86583498  1988.19555054  2004.03527608  1998.036657  ]
Mean squared error: 91.4544261522
Root mean squared error: 9.56318075497


In [9]:
# LR-based alternative #1: Data normalization
lr_normalized_model = LinearRegression(fit_intercept=True, normalize=True)

lr_normalized_model.fit(train_data_X, train_data_Y )

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [10]:
# Check the model errors
mean_sq_error = mean_squared_error(validation_data_Y, lr_normalized_model.predict(validation_data_X))
print('Mean squared error: {}'.format(mean_sq_error))
print('Root mean squared error: {}'.format(sqrt(mean_sq_error)))

Mean squared error: 91.4544261522
Root mean squared error: 9.56318075497
