# Spotify Analysis - Logistic Regression Model

### We'll fit a logistic regression model to Spotify data, to create a model to predict a songs success. In this model, a song is considered successful if it received more than the average streams of the dataset. A success was denoted with a "1", otherwise it received a "0". This metric was saved into a new column, "song_success" ("y"). The initial features ("X") of the model include various metrics such as danceability, liveliness, energy, etc. After we were able to successfully make predictions with our model, we added more features to the model to improve upon its predictive accuracy. These other metrics included artist count, in spotify charts, in spotify playlists, and bpm. 

In [8]:
# Install hvplot into Google colab notebook
!pip install hvplot



In [9]:
# Import neccessary libraries
from pathlib import Path
import pandas as pd
import hvplot.pandas as plot
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

In [28]:
# Uploaded csv file, then constructed a dataframe from the file
file_path = Path(r"C:/Users/ppate/dm_spotify.csv")
original_df = pd.read_csv(file_path)
original_df

Unnamed: 0,Date,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,11/29/2019,Blinding Lights,The Weeknd,1,2019,11,29,43899,69,3703895074,...,171,C#,Major,50,38,80,0,0,9,7
1,1/6/2017,Shape of You,Ed Sheeran,1,2017,1,6,32181,10,3562543890,...,96,C#,Minor,83,93,65,58,0,9,8
2,11/8/2018,Someone You Loved,Lewis Capaldi,1,2018,11,8,17836,53,2887241814,...,110,C#,Major,50,45,41,75,0,11,3
3,5/10/2019,Dance Monkey,Tones and I,1,2019,5,10,24529,0,2864791672,...,98,F#,Minor,82,54,59,69,0,18,10
4,10/9/2018,Sunflower - Spider-Man: Into the Spider-Verse,"Post Malone, Swae Lee",2,2018,10,9,24094,78,2808096550,...,90,D,Major,76,91,50,54,0,7,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
947,7/14/2023,Overdrive,Post Malone,1,2023,7,14,410,36,14780425,...,140,C#,Major,56,48,73,0,0,35,4
948,6/1/2023,"Gol Bolinha, Gol Quadrado 2","Mc Pedrinho, DJ 900",2,2023,6,1,293,8,11956641,...,133,B,Minor,93,68,65,42,0,12,25
949,7/13/2023,QUEMA,"Sog, Ryan Castro, Peso Pluma",3,2023,7,13,437,31,11599388,...,97,,Major,79,92,89,5,0,6,5
950,12/22/2022,Jhoome Jo Pathaan,"Arijit Singh, Vishal Dadlani, Sukriti Kakar, V...",6,2022,12,22,138,4,1365184,...,105,G,Major,82,62,74,10,0,33,7


In [29]:
# Sort the dataframe by the "Date" column in ascending order
original_df.sort_values('Date', ascending=True, inplace=True)
original_df

Unnamed: 0,Date,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
822,1/1/1930,Agudo Mï¿½ï¿½gi,"Styrx, utku INC, Thezth",3,1930,1,1,323,0,90598517,...,130,F#,Minor,65,49,80,22,4,7,5
373,1/1/1942,White Christmas,"Bing Crosby, John Scott Trotter & His Orchestr...",3,1942,1,1,11940,0,395591396,...,96,A,Major,23,19,25,91,0,40,3
323,1/1/1950,Let It Snow! Let It Snow! Let It Snow!,"Frank Sinatra, B. Swanson Quartet",2,1950,1,1,10585,0,473248298,...,143,D,Major,60,86,32,88,0,34,6
372,1/1/1952,A Holly Jolly Christmas - Single Version,Burl Ives,1,1952,1,1,7930,0,395591396,...,140,,Major,67,81,36,64,0,15,3
209,1/1/1957,Jingle Bell Rock,Bobby Helms,1,1957,1,1,10326,0,741301563,...,119,D,Major,74,78,37,84,0,6,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,9/9/2016,Say You Won't Let Go,James Arthur,1,2016,9,9,15722,16,2420461338,...,99,A#,Major,40,45,56,69,0,9,5
95,9/9/2021,Shivers,Ed Sheeran,1,2021,9,9,10147,30,1302184087,...,141,D,Major,79,82,86,28,0,4,9
361,9/9/2021,Angel Baby,Troye Sivan,1,2021,9,9,1959,9,408843328,...,145,B,Major,56,41,57,1,0,13,3
544,9/9/2022,Forget Me,Lewis Capaldi,1,2022,9,9,2520,4,239411309,...,102,C#,Minor,67,72,74,30,0,36,4


In [30]:
# Rename dataframe columns to remove unneccessary characters, referencing these
# columns will be easier moving forward 
original_df = original_df.rename(columns={'danceability_%':'danceability',
                                              'valence_%':'valence',
                                              'energy_%':'energy',
                                              'acousticness_%':'acousticness',
                                              'instrumentalness_%':'instrumentalness',
                                              'liveness_%':'liveness',
                                              'speechiness_%':'speechiness'
                                              })
original_df

Unnamed: 0,Date,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,...,bpm,key,mode,danceability,valence,energy,acousticness,instrumentalness,liveness,speechiness
822,1/1/1930,Agudo Mï¿½ï¿½gi,"Styrx, utku INC, Thezth",3,1930,1,1,323,0,90598517,...,130,F#,Minor,65,49,80,22,4,7,5
373,1/1/1942,White Christmas,"Bing Crosby, John Scott Trotter & His Orchestr...",3,1942,1,1,11940,0,395591396,...,96,A,Major,23,19,25,91,0,40,3
323,1/1/1950,Let It Snow! Let It Snow! Let It Snow!,"Frank Sinatra, B. Swanson Quartet",2,1950,1,1,10585,0,473248298,...,143,D,Major,60,86,32,88,0,34,6
372,1/1/1952,A Holly Jolly Christmas - Single Version,Burl Ives,1,1952,1,1,7930,0,395591396,...,140,,Major,67,81,36,64,0,15,3
209,1/1/1957,Jingle Bell Rock,Bobby Helms,1,1957,1,1,10326,0,741301563,...,119,D,Major,74,78,37,84,0,6,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,9/9/2016,Say You Won't Let Go,James Arthur,1,2016,9,9,15722,16,2420461338,...,99,A#,Major,40,45,56,69,0,9,5
95,9/9/2021,Shivers,Ed Sheeran,1,2021,9,9,10147,30,1302184087,...,141,D,Major,79,82,86,28,0,4,9
361,9/9/2021,Angel Baby,Troye Sivan,1,2021,9,9,1959,9,408843328,...,145,B,Major,56,41,57,1,0,13,3
544,9/9/2022,Forget Me,Lewis Capaldi,1,2022,9,9,2520,4,239411309,...,102,C#,Minor,67,72,74,30,0,36,4


In [31]:
# Check column names of the dataframe, in the next cell we will drop unncessary
# columns
original_df.columns

Index(['Date', 'track_name', 'artist(s)_name', 'artist_count', 'released_year',
       'released_month', 'released_day', 'in_spotify_playlists',
       'in_spotify_charts', 'streams', 'streamsM', 'in_apple_playlists',
       'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts',
       'in_shazam_charts', 'bpm', 'key', 'mode', 'danceability', 'valence',
       'energy', 'acousticness', 'instrumentalness', 'liveness',
       'speechiness'],
      dtype='object')

In [46]:
# Drop several uneeded columns, these columns will not be used as features in X
# for the preliminary model
original_df.drop(["released_year", "released_month", "released_day", 
                  "in_deezer_playlists", "in_deezer_charts",
                  "in_shazam_charts", "key", 'mode',
                  "in_apple_playlists", "in_apple_charts"],
                 axis = 1)
original_df

Unnamed: 0,Date,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,...,key,mode,danceability,valence,energy,acousticness,instrumentalness,liveness,speechiness,song_success
822,1/1/1930,Agudo Mï¿½ï¿½gi,"Styrx, utku INC, Thezth",3,1930,1,1,323,0,90598517,...,F#,Minor,65,49,80,22,4,7,5,
373,1/1/1942,White Christmas,"Bing Crosby, John Scott Trotter & His Orchestr...",3,1942,1,1,11940,0,395591396,...,A,Major,23,19,25,91,0,40,3,
323,1/1/1950,Let It Snow! Let It Snow! Let It Snow!,"Frank Sinatra, B. Swanson Quartet",2,1950,1,1,10585,0,473248298,...,D,Major,60,86,32,88,0,34,6,
372,1/1/1952,A Holly Jolly Christmas - Single Version,Burl Ives,1,1952,1,1,7930,0,395591396,...,,Major,67,81,36,64,0,15,3,
209,1/1/1957,Jingle Bell Rock,Bobby Helms,1,1957,1,1,10326,0,741301563,...,D,Major,74,78,37,84,0,6,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,9/9/2016,Say You Won't Let Go,James Arthur,1,2016,9,9,15722,16,2420461338,...,A#,Major,40,45,56,69,0,9,5,
95,9/9/2021,Shivers,Ed Sheeran,1,2021,9,9,10147,30,1302184087,...,D,Major,79,82,86,28,0,4,9,
361,9/9/2021,Angel Baby,Troye Sivan,1,2021,9,9,1959,9,408843328,...,B,Major,56,41,57,1,0,13,3,
544,9/9/2022,Forget Me,Lewis Capaldi,1,2022,9,9,2520,4,239411309,...,C#,Minor,67,72,74,30,0,36,4,


In [34]:
# Create an empty column in the dataframe to hold the predictions of the 
# success of a song, and review the dataframe to confirm the column was 
# added

original_df["song_success"] = np.nan
original_df

Unnamed: 0,Date,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,...,key,mode,danceability,valence,energy,acousticness,instrumentalness,liveness,speechiness,song_success
822,1/1/1930,Agudo Mï¿½ï¿½gi,"Styrx, utku INC, Thezth",3,1930,1,1,323,0,90598517,...,F#,Minor,65,49,80,22,4,7,5,
373,1/1/1942,White Christmas,"Bing Crosby, John Scott Trotter & His Orchestr...",3,1942,1,1,11940,0,395591396,...,A,Major,23,19,25,91,0,40,3,
323,1/1/1950,Let It Snow! Let It Snow! Let It Snow!,"Frank Sinatra, B. Swanson Quartet",2,1950,1,1,10585,0,473248298,...,D,Major,60,86,32,88,0,34,6,
372,1/1/1952,A Holly Jolly Christmas - Single Version,Burl Ives,1,1952,1,1,7930,0,395591396,...,,Major,67,81,36,64,0,15,3,
209,1/1/1957,Jingle Bell Rock,Bobby Helms,1,1957,1,1,10326,0,741301563,...,D,Major,74,78,37,84,0,6,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,9/9/2016,Say You Won't Let Go,James Arthur,1,2016,9,9,15722,16,2420461338,...,A#,Major,40,45,56,69,0,9,5,
95,9/9/2021,Shivers,Ed Sheeran,1,2021,9,9,10147,30,1302184087,...,D,Major,79,82,86,28,0,4,9,
361,9/9/2021,Angel Baby,Troye Sivan,1,2021,9,9,1959,9,408843328,...,B,Major,56,41,57,1,0,13,3,
544,9/9/2022,Forget Me,Lewis Capaldi,1,2022,9,9,2520,4,239411309,...,C#,Minor,67,72,74,30,0,36,4,


In [40]:
original_df.dtypes
# Turn the datatype of the "streams" column into a float
#original_df["streams"] = original_df["streams"].str.replace('[^\d]', '', regex=True).astype(float)

Date                     object
track_name               object
artist(s)_name           object
artist_count              int64
released_year             int64
released_month            int64
released_day              int64
in_spotify_playlists      int64
in_spotify_charts         int64
streams                   int64
streamsM                  int64
in_apple_playlists        int64
in_apple_charts           int64
in_deezer_playlists      object
in_deezer_charts          int64
in_shazam_charts         object
bpm                       int64
key                      object
mode                     object
danceability              int64
valence                   int64
energy                    int64
acousticness              int64
instrumentalness          int64
liveness                  int64
speechiness               int64
song_success            float64
dtype: object

In [44]:
# Calculate the average number of streams for each song in the dataframe, and 
# assign it to the variable "average_streams"

average_streams = original_df["streams"].mean()
average_streams_in_millions = original_df["streamsM"].mean()
print(f"The average amount of streams amongst the song in the dataframe is roughly {int(average_streams_in_millions)} million, {int(average_streams)} is the exact number.")

The average amount of streams amongst the song in the dataframe is roughly 514 million, 514137424 is the exact number.


In [49]:
# Loop through the Pandas dataframe and code the conditions of the column 
# "song_success". If a song received more than "average_streams",# it will
# be considered a success denoted by a '1'.

# for index, row in rearranged_df.iterrows():
#     if row["streams"] > average_streams:
#         rearranged_df.loc[index, "song_success"] = 1
#     else:
#         rearranged_df.loc[index, "song_success"] = 0    
# rearranged_df

original_df['song_success'] = (original_df['streams'] > average_streams).astype(int) 
original_df
#filtered_df = original_df[original_df['song_success'] == 1] 
#filtered_df

Unnamed: 0,Date,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,...,key,mode,danceability,valence,energy,acousticness,instrumentalness,liveness,speechiness,song_success
822,1/1/1930,Agudo Mï¿½ï¿½gi,"Styrx, utku INC, Thezth",3,1930,1,1,323,0,90598517,...,F#,Minor,65,49,80,22,4,7,5,0
373,1/1/1942,White Christmas,"Bing Crosby, John Scott Trotter & His Orchestr...",3,1942,1,1,11940,0,395591396,...,A,Major,23,19,25,91,0,40,3,0
323,1/1/1950,Let It Snow! Let It Snow! Let It Snow!,"Frank Sinatra, B. Swanson Quartet",2,1950,1,1,10585,0,473248298,...,D,Major,60,86,32,88,0,34,6,0
372,1/1/1952,A Holly Jolly Christmas - Single Version,Burl Ives,1,1952,1,1,7930,0,395591396,...,,Major,67,81,36,64,0,15,3,0
209,1/1/1957,Jingle Bell Rock,Bobby Helms,1,1957,1,1,10326,0,741301563,...,D,Major,74,78,37,84,0,6,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,9/9/2016,Say You Won't Let Go,James Arthur,1,2016,9,9,15722,16,2420461338,...,A#,Major,40,45,56,69,0,9,5,1
95,9/9/2021,Shivers,Ed Sheeran,1,2021,9,9,10147,30,1302184087,...,D,Major,79,82,86,28,0,4,9,1
361,9/9/2021,Angel Baby,Troye Sivan,1,2021,9,9,1959,9,408843328,...,B,Major,56,41,57,1,0,13,3,0
544,9/9/2022,Forget Me,Lewis Capaldi,1,2022,9,9,2520,4,239411309,...,C#,Minor,67,72,74,30,0,36,4,0


In [15]:
# Build a logistic regression model. The song will be considered a success if it
# reaches the average number of streams. A successful song will be labeled as 1, 
# and an unsuccessful song as 0
# Seperate the data into labels and features. X will be the features, and y will
# be the label.
X = rearranged_df[["danceability", "valence", "energy", "acousticness", "liveness", "instrumentalness", "speechiness"]]
y = rearranged_df["song_success"]

In [16]:
# Review the X variable Dataframe
X

Unnamed: 0,danceability,valence,energy,acousticness,liveness
0,80,89,83,31,8
1,71,61,74,7,10
2,51,32,53,17,31
3,55,58,72,11,11
4,65,23,80,14,11
...,...,...,...,...,...
948,60,24,39,57,8
949,42,7,24,83,12
950,80,81,67,4,8
951,82,67,77,8,12


In [17]:
# Review the y variable Series
y

0      141381703.0
1      133716286.0
2      140003974.0
3      800840817.0
4      303236322.0
          ...     
948     91473363.0
949    121871870.0
950     73513683.0
951    133895612.0
952     96007391.0
Name: streams, Length: 953, dtype: float64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.2,
                                                    random_state=1)

In [19]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
regression_model = LogisticRegression(random_state = 1)

# Fit the model using training data
regression_model.fit(X_train, y_train)

# Make a prediction using the testing data
y_predictions = regression_model.predict(X_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if xp.any(data != data.astype(int)):


ValueError: Unknown label type: 'continuous'

In [15]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, y_predictions)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.0