In [193]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

In [194]:
f = open("FINAL_cleaned_output.csv")
f.readline()
d = np.loadtxt(f, delimiter=",", usecols = (0, 1, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18))

In [195]:
d1 = pd.DataFrame(data=d)
d1.columns = ["index","rank","year","energy","liveness","tempo","speechiness","acousticness","instrumentalness","timesig","danceability","key","duration","loudness","valence"]
print d1.describe()

             index         rank         year       energy     liveness  \
count  1274.000000  1274.000000  1274.000000  1274.000000  1274.000000   
mean    785.591052    50.484301  2007.361068     0.690180     0.177648   
std     467.282818    29.357217     4.662524     0.165599     0.135908   
min       0.000000     1.000000  2000.000000     0.016400     0.021000   
25%     375.250000    24.000000  2003.000000     0.584000     0.091500   
50%     784.500000    51.000000  2007.000000     0.709500     0.124000   
75%    1192.750000    76.000000  2011.000000     0.816750     0.225750   
max    1598.000000   100.000000  2015.000000     0.987000     0.854000   

             tempo  speechiness  acousticness  instrumentalness      timesig  \
count  1274.000000  1274.000000   1274.000000       1274.000000  1274.000000   
mean    120.143451     0.102686      0.157085          0.011261     3.982732   
std      28.494104     0.098734      0.199296          0.079076     0.253212   
min      57.1

In [196]:
x = pd.DataFrame(data=d1, columns=["energy", "liveness", "tempo", "speechiness", "acousticness", "instrumentalness", "danceability", "key", "loudness", "valence"]) 
y = pd.DataFrame(data=d1, columns=["rank"]) 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

Due to the non-boolean/non-binarized nature of the features, an array of threshold points must be passed into the BernoulliNB instance. These threshold points will be the mean values of each feature. For example, a song "has" the energy quality if its energy value is higher than the average energy of all songs in the data.

In [197]:
clf = BernoulliNB(binarize=[0.690180, 0.177648, 120.143451, 0.102686, 0.157085, 0.011261, 0.650279, 5.140502, -5.906388, 0.545966])
clf.fit(x_train, y_train)

BernoulliNB(alpha=1.0,
      binarize=[0.69018, 0.177648, 120.143451, 0.102686, 0.157085, 0.011261, 0.650279, 5.140502, -5.906388, 0.545966],
      class_prior=None, fit_prior=True)

Now to test on the test dataset.

In [198]:
diff = 0 
predictions = clf.predict(x_test)
for idx in range(y_test.size):
    test = y_test.iloc[idx][0]
    predicted = predictions[idx]
    print "test data value: ", test
    print "predicted value: ", predicted
    diff = diff + abs(test-predicted)
print "average difference ", diff / (y_test.size)

test data value:  7.0
predicted value:  62.0
test data value:  33.0
predicted value:  81.0
test data value:  85.0
predicted value:  84.0
test data value:  73.0
predicted value:  59.0
test data value:  22.0
predicted value:  96.0
test data value:  66.0
predicted value:  91.0
test data value:  14.0
predicted value:  58.0
test data value:  47.0
predicted value:  83.0
test data value:  72.0
predicted value:  62.0
test data value:  63.0
predicted value:  85.0
test data value:  52.0
predicted value:  63.0
test data value:  97.0
predicted value:  96.0
test data value:  10.0
predicted value:  60.0
test data value:  70.0
predicted value:  58.0
test data value:  44.0
predicted value:  62.0
test data value:  17.0
predicted value:  95.0
test data value:  18.0
predicted value:  63.0
test data value:  33.0
predicted value:  58.0
test data value:  30.0
predicted value:  10.0
test data value:  4.0
predicted value:  62.0
test data value:  29.0
predicted value:  83.0
test data value:  6.0
predicted valu

On average, this Bayesian model predicts ranks 34 places off.

In [199]:
x = pd.DataFrame(data=d1, columns=["energy", "acousticness", "loudness"]) 
y = pd.DataFrame(data=d1, columns=["rank"]) 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
clf = BernoulliNB(binarize=[0.690180, 0.157085, -5.906388])
clf.fit(x_train, y_train)

BernoulliNB(alpha=1.0, binarize=[0.69018, 0.157085, -5.906388],
      class_prior=None, fit_prior=True)

In [187]:
diff = 0 
predictions = clf.predict(x_test)
for idx in range(y_test.size):
    test = y_test.iloc[idx][0]
    predicted = predictions[idx]
    print "test data value: ", test
    print "predicted value: ", predicted
    diff = diff + abs(test-predicted)
print "average difference ", diff / (y_test.size)

test data value:  82.0
predicted value:  53.0
test data value:  33.0
predicted value:  2.0
test data value:  79.0
predicted value:  81.0
test data value:  43.0
predicted value:  81.0
test data value:  68.0
predicted value:  58.0
test data value:  34.0
predicted value:  69.0
test data value:  95.0
predicted value:  2.0
test data value:  29.0
predicted value:  2.0
test data value:  66.0
predicted value:  58.0
test data value:  7.0
predicted value:  81.0
test data value:  45.0
predicted value:  56.0
test data value:  4.0
predicted value:  69.0
test data value:  27.0
predicted value:  2.0
test data value:  97.0
predicted value:  2.0
test data value:  70.0
predicted value:  73.0
test data value:  27.0
predicted value:  73.0
test data value:  69.0
predicted value:  73.0
test data value:  16.0
predicted value:  69.0
test data value:  32.0
predicted value:  73.0
test data value:  22.0
predicted value:  73.0
test data value:  93.0
predicted value:  69.0
test data value:  68.0
predicted value:  

On average, this Bayesian model predicts still ranks 28 places off. Thus, neither of these models was particularly good.