In [1]:
import graphviz
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy import misc
import seaborn as sns
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, export_graphviz

### Setup graph palettes

In [2]:
# Two-colour palette applied to every chart in this notebook. Despite the
# variable name, the list order is blue ("#19B5FE") first, red ("#EF4836") second.
red_blue = ["#19B5FE", "#EF4836"]
palette = sns.color_palette(red_blue)
# Register the palette as seaborn's default and switch to the "white" style.
sns.set_palette(palette)
sns.set_style("white")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### Load Dataset
- Load data into DataFrame object from csv
- Examine overall structure
- Split train / test population

In [3]:
# Location of the song dataset, relative to the notebook.
DATA_PATH = '../input/data.csv'

# Load the CSV into a DataFrame and display its summary statistics.
data = pd.read_csv(DATA_PATH)
data.describe()

In [4]:
# Hold out 30% of the rows for testing. The split is seeded (same seed as the
# decision tree below) so every accuracy reported in this notebook is
# reproducible under Restart Kernel -> Run All.
train, test = train_test_split(data, test_size=0.30, random_state=10)
print("Training samples: {}; Test samples: {}".format(len(train), len(test)))

### Split Series Data by Sentiment
Sentiment is stored in the `target` column: 1 means the song was liked, 0 means it was not.

In [5]:
# Filter the frame once per class instead of rebuilding the boolean mask (and
# the filtered frame) for every single feature.
liked_songs = data[data['target'] == 1]
disliked_songs = data[data['target'] == 0]

# One Series per (class, feature); `pos_*` comes from target == 1 and `neg_*`
# from target == 0. These names are consumed by the plotting cells below.
pos_tempo = liked_songs['tempo']
neg_tempo = disliked_songs['tempo']
pos_dance = liked_songs['danceability']
neg_dance = disliked_songs['danceability']
pos_duration = liked_songs['duration_ms']
neg_duration = disliked_songs['duration_ms']
pos_loudness = liked_songs['loudness']
neg_loudness = disliked_songs['loudness']
pos_speechiness = liked_songs['speechiness']
neg_speechiness = disliked_songs['speechiness']
pos_valence = liked_songs['valence']
neg_valence = disliked_songs['valence']
pos_energy = liked_songs['energy']
neg_energy = disliked_songs['energy']
pos_acousticness = liked_songs['acousticness']
neg_acousticness = disliked_songs['acousticness']
pos_key = liked_songs['key']
neg_key = disliked_songs['key']
pos_instrumentalness = liked_songs['instrumentalness']
neg_instrumentalness = disliked_songs['instrumentalness']

### Sample Histogram for Song Tempo Like / Dislike distribution

In [6]:
# Overlay the liked / disliked tempo histograms on one explicit Axes so the
# plot does not depend on pyplot's implicit "current axes" state.
fig, ax = plt.subplots(figsize=(12, 6))
pos_tempo.hist(ax=ax, alpha=0.7, bins=30, label='positive')
neg_tempo.hist(ax=ax, alpha=0.7, bins=30, label='negative')
ax.set_title("Song Tempo Like/Dislike Distribution")
ax.set_xlabel('Tempo')
ax.set_ylabel('Count')
# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend(loc="upper right");

### Plot sentiment distribution for other features

In [7]:
def plot_like_dislike_hist(ax, liked, disliked, title, xlabel, bins=30, alpha=0.5):
    """Overlay the liked and disliked histograms of one feature on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes that both histograms are drawn on.
    liked, disliked : pandas Series holding the feature values of each class.
    title : title of the panel.
    xlabel : label of the x axis (the y axis is always 'Count').
    bins, alpha : forwarded to ``Series.hist`` for both histograms.
    """
    # Passing ax= explicitly guarantees both classes share the same Axes and
    # therefore the same x/y scale.
    liked.hist(ax=ax, alpha=alpha, bins=bins, label='positive')
    disliked.hist(ax=ax, alpha=alpha, bins=bins, label='negative')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.legend(loc="upper right")


# (title, x-axis label, liked values, disliked values), listed in grid order
# (left to right, top to bottom).
feature_panels = [
    ("Song Danceability Like Distribution", 'Danceability', pos_dance, neg_dance),
    ("Song Duration Like Distribution", 'Duration (ms)', pos_duration, neg_duration),
    ("Song Loudness Like Distribution", 'Loudness', pos_loudness, neg_loudness),
    ("Song Speechiness Like Distribution", 'Speechiness', pos_speechiness, neg_speechiness),
    ("Song Valence Like Distribution", 'Valence', pos_valence, neg_valence),
    ("Song Energy Like Distribution", 'Energy', pos_energy, neg_energy),
    ("Song Key Like Distribution", 'Key', pos_key, neg_key),
    ("Song Acousticness Like Distribution", 'Acousticness', pos_acousticness, neg_acousticness),
    ("Song Instrumentalness Like Distribution", 'Instrumentalness',
     pos_instrumentalness, neg_instrumentalness),
]

# Create the 3x3 grid once. Calling fig.add_subplot() a second time with the
# same position is not a reliable way to get the same Axes back: recent
# Matplotlib releases create a new Axes on top of the first one, which hides
# the histogram drawn earlier.
fig2, axes = plt.subplots(3, 3, figsize=(15, 15))
for ax, (title, xlabel, liked, disliked) in zip(axes.flat, feature_panels):
    plot_like_dislike_hist(ax, liked, disliked, title, xlabel)

# Keep titles and axis labels of neighbouring panels from overlapping.
fig2.tight_layout()

### Build a simple Decision Tree Classifier based on a set of features

In [8]:
# Decision tree with a fixed seed whose leaves must each hold at least 20
# training samples. min_samples_split is another knob that could be tuned.
c = DecisionTreeClassifier(min_samples_leaf=20, random_state=10)

In [9]:
# Columns used as model inputs; "target" is the label.
features = [
    "valence",
    "energy",
    "danceability",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "loudness",
    "duration_ms",
    "liveness",
    "tempo",
    "time_signature",
    "mode",
    "key",
]

# Feature matrices and label vectors for both halves of the split.
X_train, X_test = train[features], test[features]
y_train, y_test = train["target"], test["target"]

# fit() returns the estimator itself, so `dt` and `c` refer to the same fitted tree.
dt = c.fit(X_train, y_train)

### Run prediction on test data

In [10]:
# Predict the target of every held-out row with the fitted decision tree `c`.
y_pred = c.predict(X_test)

In [11]:
# Test accuracy of the decision tree, as a percentage rounded to one decimal.
score = 100 * accuracy_score(y_test, y_pred)
rounded_score = round(score, 1)
message = "Decision Tree Classifier Accuracy: {}%".format(rounded_score)
print(message)

In [51]:
# Sweep the decision tree's min_samples_leaf over 15..29 and record the test
# accuracy obtained for each value.
# NOTE(review): picking a hyper-parameter by its score on the test split leaks
# test information into model selection -- consider a validation split or
# cross-validation on the training data instead.
min_samples_leaf_values = list(range(15, 30))
total_performance_records = []
for leaf_size in min_samples_leaf_values:
    c_ = tree.DecisionTreeClassifier(min_samples_leaf=leaf_size, random_state=10)
    # random_state is fixed, so refitting on the same data rebuilds the same
    # tree; a single fit per setting is enough (no averaging needed).
    c_.fit(X_train, y_train)
    tst_p_ = c_.predict(X_test)
    total_performance_records.append(accuracy_score(y_test, tst_p_))
    print("Evaluate min_samples_leaf {} done, accuracy {:.2f}".format(
        leaf_size, total_performance_records[-1]))

### Build Random Forest Classifier

In [12]:
# Random forest of 100 trees. Seeded (same seed as the decision tree) so the
# accuracy reported in the next cell is reproducible between runs.
# RandomForestClassifier is imported in the notebook's top import cell.
clf = RandomForestClassifier(n_estimators=100, random_state=10)
clf.fit(X_train, y_train)

In [13]:
# Score the fitted forest on the held-out rows and report the accuracy as a
# percentage rounded to one decimal.
forest_y_pred = clf.predict(X_test)
score = 100 * accuracy_score(y_test, forest_y_pred)
rounded_score = round(score, 1)
message = "Random Forest (n_est: 100) Accuracy: {}%".format(rounded_score)
print(message)

### Build Neural Network Classifier

In [14]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [15]:

# scikit-learn estimators share one construct-then-fit interface, so a
# multi-layer perceptron is built and trained exactly like the models above.
# The seed makes the weight initialisation -- and therefore the reported
# accuracy and the weights inspected below -- reproducible between runs.
# NOTE(review): the input columns look like they live on very different scales
# (e.g. duration_ms vs. valence); MLPs usually train better on standardised
# inputs -- confirm and consider scaling the features first.
mlpc = MLPClassifier(random_state=10)
mlpc.fit(X_train, y_train)


In [16]:
# Inspect the learned weights: print the type and length of `coefs_`, then the
# type and shape of its first two entries.
print(type(mlpc.coefs_))
print(len(mlpc.coefs_))
for layer_idx in (0, 1):
    layer_weights = mlpc.coefs_[layer_idx]
    print(type(layer_weights))
    # Try to guess this shape before running the cell.
    print(layer_weights.shape)

In [17]:
# Apply the trained network to the held-out rows.
tst_preds = mlpc.predict(X_test)

# Accuracy is the share of predictions that equal the true label, i.e. the
# same quantity as np.sum(tst_preds == y_test) / float(tst_preds.size),
# reported here as a percentage rounded to one decimal.
score = 100 * accuracy_score(y_test, tst_preds)
rounded_score = round(score, 1)
message = "Neural Network Classifier Accuracy: {}%".format(rounded_score)
print(message)

In [18]:
## Stage 2
# Fix the hidden-layer width to one value, refit the same estimator ten times
# and average the resulting test accuracies.

hn = 10
mlpc_ = MLPClassifier(hidden_layer_sizes=(hn,))
perf_records_ = []
for trial in range(10):
    # fit() returns the estimator, so training and prediction can be chained.
    tst_p_ = mlpc_.fit(X_train, y_train).predict(X_test)
    n_correct = np.sum(tst_p_ == y_test)
    performance = n_correct / float(tst_p_.size)
    perf_records_.append(performance)

for label, value in (
    ("#.Hidden Neurons", hn),
    ("Accuracy", perf_records_),
    ("Avg. Accuracy", np.mean(perf_records_)),
):
    print (label, value)

In [19]:
## Stage-3
# Set one parameter-#.hidden layer to MULTIPLE values and compute the AVERAGE performance

# hidden_neuron_nums = list(range(2,10)) + list(range(10,100,10)) + list(range(100,200,25))
# #[2,3,4,5,6...9, 10, 20, 30, ... 90, 100, 125, 150, 175]
# total_performance_records = []
# for hn in hidden_neuron_nums:
#     mlpc_ = MLPClassifier(hidden_layer_sizes=(hn,))
#     perf_records_ = []
#     for i in range(10):
#          mlpc_.fit(X_train, y_train)
#          tst_p_ = mlpc_.predict(X_test)
#          performance = np.sum(tst_p_ == y_test) / float(tst_p_.size)
#          perf_records_.append(performance)
#     total_performance_records.append(np.mean(perf_records_))
#     print ("Evaluate hidden layer {} done, accuracy {:.2f}".format(
#         hn, total_performance_records[-1]))

****
### Build AdaBoost Classifier using scikit-learn

In [20]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [21]:
# Train AdaBoost with its default settings and print the fraction of held-out
# rows it classifies correctly.
abc = AdaBoostClassifier().fit(X_train, y_train)
tst_pred = abc.predict(X_test)
n_correct = np.count_nonzero(tst_pred == y_test)
print (n_correct / float(y_test.size))

In [43]:
# Gaussian Naive Bayes on the same feature matrices (X_train / X_test) and
# labels (y_train / y_test) that were built for the decision tree, so they are
# not rebuilt here. GaussianNB is imported in the notebook's top import cell.
# The model gets its own name so the random forest stored in `clf` is not
# silently replaced when cells are re-run out of order.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
target_pred = gnb.predict(X_test)

# Report accuracy in the same percentage format as the other classifiers.
score = accuracy_score(y_test, target_pred) * 100
rounded_score = round(score, 1)
print("Gaussian Naive Bayes Accuracy: {}%".format(rounded_score))

In [23]:
# features = ["valence", "energy", "danceability", "speechiness", "acousticness", "instrumentalness", "loudness","duration_ms","liveness","tempo","time_signature","mode","key"]
 
# scaled_features = {}
# for each in features:
#     mean, std = features[each].mean(), features[each].std()
#     scaled_features[each] = [mean, std]
#     features.loc[:, each] = (features[each] - mean)/std


# target = data["target"]
# train, test = train_test_split(features,target, test_size = 0.30)

# X_train = train[features]
# y_train = train["target"]

# X_test = test[features]
# y_test = test["target"]

In [41]:
# Count the non-null `target` entries per artist, call that count
# "appearances", and rank the artists from most to fewest appearances.
speaker_df = (
    data.groupby('artist')['target']
    .count()
    .reset_index()
    .rename(columns={'target': 'appearances'})
    .sort_values('appearances', ascending=False)
)
speaker_df.head(10)

In [39]:
# Bar chart of the 50 most frequent artists (speaker_df is already sorted by
# appearances, descending).
plt.figure(figsize=(15,5))
sns.barplot(x='artist', y='appearances', data=speaker_df.head(50))
plt.title("Top 50 Artists by Appearances in the Dataset")
# Fifty artist names do not fit side by side; rotate them so they stay legible.
plt.xticks(rotation=90)
plt.show()