In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

## Preliminary Analysis

In [None]:
# Load the Spotify track table from the CSV file next to the notebook.
spotifydf = pd.read_csv(filepath_or_buffer='spotify.csv')

In [None]:
# Size of the table as loaded, before any rows are removed.
print(len(spotifydf))
#169909 rows before drop
print(len(spotifydf.columns))
#19 columns before drop

# dropna returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back for the rows to actually be removed.
spotifydf = spotifydf.dropna(axis=0)
print(len(spotifydf))
# Compare with the first count above: any difference is the number of rows
# that contained at least one missing value.
spotifydf.head()

In [None]:
# Drop the Spotify id and release date columns, then save the trimmed table
# for the modelling sections below.
# The row index is written out too; it comes back as an "Unnamed: 0" column
# when the file is re-read.
spotifydf = spotifydf.drop(columns=['id', 'release_date'])
spotifydf.to_csv('spotifydf.csv')

In [None]:
# Histogram of the 'energy' column.
spotifydf.hist(column='energy')

In [None]:
# Histogram of the 'danceability' column.
spotifydf.hist(column='danceability')

In [None]:
# Histogram of the 'tempo' column.
spotifydf.hist(column='tempo')

In [None]:
# Histogram of the 'loudness' column.
spotifydf.hist(column='loudness')

In [None]:
# Scatter plot of tempo (y axis) against year (x axis).
spotifydf.plot(kind='scatter', x='year', y='tempo')

In [None]:
# Scatter plot of danceability (y axis) against year (x axis).
spotifydf.plot(kind='scatter', x='year', y='danceability')

## K-Nearest Neighbors

In [None]:
# Re-read the table saved earlier and keep only the columns used by the KNN
# model: eleven predictors plus the 'popularity' target.
spotify = pd.read_csv('spotifydf.csv')
spotify = spotify[[
    'acousticness',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'valence',
    'year',
]]
spotify

In [None]:
# pd.to_numeric returns the converted values instead of modifying its input,
# so calling it without using the result changes nothing. Convert every
# selected column and assign the result back so the conversion takes effect.
# (The previous per-column calls listed 'liveness' twice and never 'year';
# applying the conversion column-wise covers all of them.)
spotify = spotify.apply(pd.to_numeric)

spotify

In [None]:
# Turn 'popularity' into a binary label: below 80 becomes 0, 80 or above
# becomes 1. `df` is only another name for `spotify`, so `spotify` itself is
# modified.
# NOTE: re-running this cell relabels everything to 0, because after the first
# run the column only holds 0/1 and both are below 80.
df = spotify
df.loc[df['popularity'].lt(80), 'popularity'] = 0
df.loc[df['popularity'].ge(80), 'popularity'] = 1
df

In [None]:
# Split the table into the predictor matrix (every column except the label)
# and the binary popularity label.
X = spotify.drop(columns=['popularity'])
y = spotify['popularity']

In [None]:

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore every metric reported below, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler = scaler.fit(X_train)  # fit() returns the scaler itself

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Fit a k-nearest-neighbours classifier (k = 5) on the scaled training split.
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(5)
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict the popularity label for every row of the held-out split.
y_pred = classifier.predict(X=X_test)

In [None]:
# Confusion matrix followed by the per-class precision/recall/F1 report for
# the KNN predictions (true labels first, predictions second).
from sklearn.metrics import classification_report, confusion_matrix
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred),
    classification_report(y_true=y_test, y_pred=y_pred),
    sep='\n',
)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

## Support Vector Machine

In [None]:
from sklearn.svm import LinearSVR
from sklearn import metrics

In [None]:
# Reload the saved table for the SVM section and preview its first five rows.
spotifydf = pd.read_csv(filepath_or_buffer="spotifydf.csv")
spotifydf.head(n=5)

In [None]:
# Build a 0/1 list with one entry per row: 1 when popularity is strictly
# greater than 80, otherwise 0.
# NOTE(review): the KNN section labels a track popular at >= 80, this one at
# > 80 -- confirm which threshold is intended.
popular_bool = (spotifydf["popularity"] > 80).astype(int).tolist()

In [None]:
# Attach the 0/1 list built above as a new 'popular_bool' column; it is used
# as the classification target in the next cell.
spotifydf["popular_bool"] = popular_bool

In [None]:
# Feature matrix: drop the saved index column, the artists and name columns,
# and the target itself.
# 'popularity' is dropped as well: popular_bool is computed directly from it,
# so leaving it in X would hand the model the answer (target leakage).
X = spotifydf.drop(["Unnamed: 0","artists","name","popularity","popular_bool"], axis = 1)
y = spotifydf["popular_bool"]
# 50/50 split; the fixed random_state keeps the split and the reported metrics
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .5, random_state = 42)
X.head()

In [None]:
# The target is a 0/1 class label, so a linear support vector *classifier* is
# used here. A LinearSVR regressor returns real-valued predictions, which
# confusion_matrix / accuracy_score in the next cell cannot compare against
# binary labels.
# The variable keeps its original name so any later cell that refers to
# `svm_reg` still works.
# NOTE(review): the features are not standardized in this section (unlike the
# KNN section); consider scaling if the solver reports convergence warnings.
from sklearn.svm import LinearSVC
svm_reg = LinearSVC()
svm_reg.fit(X_train,y_train)
y_pred = svm_reg.predict(X_test)

In [None]:
# Confusion matrix and accuracy of the SVM predictions on the held-out half
# (true labels first, predictions second).
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
#print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))

## Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression

In [None]:
# Read the table from the same location it is written to and read from in the
# earlier sections ('spotifydf.csv' in the working directory), so the notebook
# runs top to bottom without needing a separate data/ copy.
spotify = pd.read_csv('spotifydf.csv')
# Keep the fourteen predictors plus the 'popularity' target. The explicit
# .copy() makes this an independent frame, so the in-place relabeling below
# is not an assignment on a slice of the full table.
spotify = spotify[[
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
    'valence',
    'year'
]].copy()
# Binarize the target: popularity below 70 becomes 0, 70 or above becomes 1.
# NOTE(review): this section uses 70 as the cut-off while the KNN and SVM
# sections use 80 -- confirm this is deliberate.
spotify.loc[spotify['popularity'] < 70, 'popularity'] = 0
spotify.loc[spotify['popularity'] >= 70, 'popularity'] = 1
spotify.head()

In [None]:
def spotify_logistic_regression(Xfeatures, spotify_df=spotify, test_size=0.2, random_state=None):
    """Fit a logistic regression on a random split and predict the held-out rows.

    Parameters
    ----------
    Xfeatures : list of str
        Column names of ``spotify_df`` to use as predictors.
    spotify_df : DataFrame
        Table containing the predictor columns and a ``'popularity'`` column
        used as the target. Defaults to the module-level ``spotify`` frame as
        it was when this function was defined.
    test_size : float
        Fraction of rows held out for prediction (default 0.2, the previously
        hard-coded value).
    random_state : int or None
        Seed forwarded to ``train_test_split``. ``None`` (the default) keeps
        the previous behaviour of a different split on every call; pass an
        integer to make the result reproducible.

    Returns
    -------
    tuple
        ``(Y_hat, Y_test)``: the predicted labels for the held-out rows and
        the corresponding true labels.
    """
    X = spotify_df[Xfeatures]
    Y = spotify_df['popularity']
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    model = LogisticRegression().fit(X_train, Y_train)
    Y_hat = model.predict(X_test)
    return Y_hat, Y_test

In [None]:
def logistic_regression_plot(columns, spotify_df=spotify):
    """Fit the logistic regression on ``columns`` and print its accuracy and confusion matrix.

    Parameters
    ----------
    columns : list of str
        Predictor column names passed on to ``spotify_logistic_regression``.
    spotify_df : DataFrame
        Table to train and evaluate on; defaults to the module-level
        ``spotify`` frame as it was when this function was defined.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    y_hat, y_reals = spotify_logistic_regression(columns, spotify_df=spotify_df)
    # scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
    # symmetric, but the confusion matrix is not: with the arguments swapped
    # its rows and columns are transposed.
    print("Accuracy: {}".format(accuracy_score(y_reals, y_hat)))
    print(confusion_matrix(y_reals, y_hat))

In [None]:
# Logistic regression on the whole table with every predictor except 'year'.
logistic_regression_plot(
    columns=[
        'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
        'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
        'speechiness', 'tempo', 'valence',
    ]
)

In [None]:

# Logistic regression with all fourteen predictors, restricted to rows whose
# 'year' is earlier than 1990.
logistic_regression_plot(
    columns=[
        'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
        'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
        'speechiness', 'tempo', 'valence', 'year',
    ],
    spotify_df=spotify.loc[spotify['year'] < 1990],
)

In [None]:
# Logistic regression with a reduced predictor set, restricted to rows whose
# 'year' is later than 2006. Left out of this run: acousticness, duration_ms,
# instrumentalness, key, loudness, mode, valence and year.
logistic_regression_plot(
    columns=['danceability', 'energy', 'explicit', 'liveness', 'speechiness', 'tempo'],
    spotify_df=spotify.loc[spotify['year'] > 2006],
)