In [1]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Functions
def load_data(filename):
    """Load a comma-delimited numeric text file into a NumPy array.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read; it is opened in text mode.

    Returns
    -------
    numpy.ndarray
        The parsed values, exactly as produced by ``np.loadtxt`` with
        ``delimiter=","``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If ``np.loadtxt`` cannot parse the contents as numbers.
    """
    # The context manager closes the handle deterministically, even when
    # np.loadtxt raises while parsing.
    with open(filename, 'rt') as raw_data:
        return np.loadtxt(raw_data, delimiter=",")

def split_Y_X(dataset):
    """Separate the target column from the feature columns.

    Column 0 of ``dataset`` is returned as the target vector and every
    column from index 1 onward as the feature matrix.

    Returns
    -------
    tuple
        ``(Y, X)`` in that order: targets first, features second.
    """
    features = dataset[:, 1:]
    target = dataset[:, 0]
    return target, features

In [2]:
# Read the training split from disk into a NumPy array
data = load_data('year-prediction-msd-train.txt')

# The first axis of the array holds one entry per training instance
print('Training data size: {}'.format(data.shape[0]))

Training data size: 463715


In [3]:
# Take a look at the data (first 5 instances):
# - first column (index 0) is the song release year
# - remaining columns are the features (indexes 1 - 90)
# Wrapping the array in a DataFrame gives a readable tabular display;
# the trailing expression is what the notebook renders as the cell output.
data_frame = pd.DataFrame(data)
data_frame.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,2004.0,46.50128,-18.37096,58.89824,27.28181,1.93097,-2.43167,18.64843,-5.62769,-1.47711,...,43.94716,-64.21148,13.04284,156.03785,-2.4267,51.71977,-43.56703,10.63735,24.08962,-21.41886
1,2005.0,41.46015,-52.3014,-4.59825,-19.28084,-11.85844,-19.54192,1.30306,-1.83185,5.98469,...,-13.48435,-154.65715,35.22429,89.53649,-3.13145,-128.2812,89.97277,-15.88139,-75.21074,-0.51139
2,2009.0,42.18667,-85.65863,-7.91506,-11.84193,-12.61959,-21.66749,4.97883,-11.8675,9.87342,...,42.08584,-181.77349,43.00181,87.94977,-13.70362,136.49979,140.32633,11.51422,382.79589,29.98269
3,1987.0,46.90244,19.86314,10.92119,4.87136,-41.17499,-19.84156,2.93308,-5.98711,3.05997,...,-2.76407,-20.31782,-75.24506,125.81801,-15.50828,-63.31002,-142.21937,-12.36699,32.45911,-17.14909
4,2009.0,46.64388,-81.99503,41.62851,3.65855,-9.38201,-13.51749,7.48781,-7.03302,7.06982,...,12.29244,-143.25348,91.15842,-227.85481,7.75916,-41.32376,-225.66526,-4.05081,455.39458,41.6531


In [4]:
# Read the held-out test split from disk into a NumPy array
test_data = load_data('year-prediction-msd-test.txt')

# The first axis of the array holds one entry per test instance
print('Test data size: {}'.format(test_data.shape[0]))

Test data size: 36285


In [5]:
# Split label (years) and features for both the training and the test sets:
# split_Y_X returns column 0 as the *_Y target vector and the remaining
# columns as the *_X feature matrix.
data_Y, data_X = split_Y_X(data)
test_data_Y, test_data_X = split_Y_X(test_data)

In [6]:
# Baseline model: a default LinearRegression trained on the training split.
# fit() returns the estimator itself, so construction and training are chained.
lr_base_model = LinearRegression().fit(data_X, data_Y)

# One learned coefficient per feature column
print('Number of coefficients: {}'.format(lr_base_model.coef_.shape[0]))

Number of coefficients: 90


In [7]:
# Run inference over the test features once and reuse the result below,
# instead of calling predict() separately for the preview and for the metric.
test_predictions = lr_base_model.predict(test_data_X)

# A quick comparison on some instances of the test data with the predicted values...
print('Actual: {}'.format(test_data_Y[0:5]))
print('Predicted: {}'.format(test_predictions[0:5]))

# ... and the errors (MSE over the whole test set, then its square root)
mean_sq_error = mean_squared_error(test_data_Y, test_predictions)
print('Mean squared error: {}'.format(mean_sq_error))
print('Root mean squared error: {}'.format(sqrt(mean_sq_error)))

Actual: [ 1989.  1989.  1987.  2004.  2001.]
Predicted: [ 1994.07731053  1990.52789291  1997.35218131  1997.84193563  2004.58195583]
Mean squared error: 91.2582355867
Root mean squared error: 9.55291764785
