# Analysis of music data for the years 1921 to 2020

# Content


The "data.csv" file contains more than 160,000 songs collected from the Spotify Web API; data grouped by artist, year, or genre is also available in the data section.
More on Spotify audio features, click here
More on other Spotify track features, click here

Primary:
- id (Id of track generated by Spotify)
Numerical:
- acousticness (Ranges from 0 to 1)
- danceability (Ranges from 0 to 1)
- energy (Ranges from 0 to 1)
- duration_ms (Integer typically ranging from 200k to 300k)
- instrumentalness (Ranges from 0 to 1)
- valence (Ranges from 0 to 1)
- popularity (Ranges from 0 to 100)
- tempo (Float typically ranging from 50 to 150)
- liveness (Ranges from 0 to 1)
- loudness (Float typically ranging from -60 to 0)
- speechiness (Ranges from 0 to 1)
- year (Ranges from 1921 to 2020)
Dummy:
- mode (0 = Minor, 1 = Major)
- explicit (0 = No explicit content, 1 = Explicit content)
Categorical:
- key (All keys on octave encoded as values ranging from 0 to 11, starting on C as 0, C# as 1 and so on…)
- artists (List of artists mentioned)
- release_date (Date of release mostly in yyyy-mm-dd format, however precision of date may vary)
- name (Name of the song)

In [314]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from jupyterthemes import jtplot
import chart_studio
import pandas as pd
from langdetect import detect
import pycountry
import polyglot
from polyglot.detect import Detector
import re

jtplot.style(theme='onedork')

# Working plan for the blog 
    

1. Reorganize the whole dataframe 
2. Identify all the Chinese artists and titles 
3. Replace modes by characters
4. 

In [315]:
import time 

In [316]:
Chinese_eng = """忘了有多久再没听到你 对我说你 最爱的故事 我想了很久 我开始慌了 是不是我又做错了什么 你哭着对我说 童话里都是骗人的 I forgot how long it's been
Since I last heard you Tell me your favorite story
I have thought for a long time
I'm starting to panic
Wondering if I've done something wrong again
"""

for language in Detector(Chinese_eng).languages:
  print(language)

name: engelska    code: en       confidence:  51.0 read bytes:  1093
name: kinesiska   code: zh       confidence:  48.0 read bytes:  1937
name: un          code: un       confidence:   0.0 read bytes:     0


# Main dataset 

In [358]:
Spotify = pd.read_csv(r"C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/data.csv")

In [359]:
Spotify_feature_list = list(Spotify)

In [360]:
# Reload the raw track table so this cell can be re-run on its own.
Spotify = pd.read_csv(r"C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/data.csv")

# `artists` appears to hold the text of a Python list (e.g. "['A', 'B']").
# Trim the leading "['" and trailing "']" characters, then collapse the
# "', '" separators into plain commas.
# Plain literals are used instead of "\'\[" (an invalid escape sequence), and
# regex=False makes the separator a literal match regardless of the pandas
# version's default for `regex`.
Spotify['artists'] = Spotify['artists'].map(lambda x: x.lstrip("'[").rstrip("']"))
Spotify['artists'] = Spotify['artists'].str.replace("', '", ",", regex=False)

# The rest of the notebook refers to the track name as "Title".
Spotify = Spotify.rename(columns={'name': 'Title'})

In [361]:
Spotify[Spotify['artists'].str.contains('Sheeran')]

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,Title,popularity,release_date,speechiness,tempo,valence,year
7226,0.3230,Ed Sheeran,0.422,228600,0.405,0,5Ukzlujip1Slqka5OY82YS,0.000000,6,0.1500,-11.126,1,U.N.I.,56,2011-09-09,0.1010,74.130,0.578,2011
7401,0.2940,Ed Sheeran,0.812,240987,0.542,0,763p5nkQytkOE7xmsFyu8y,0.000069,7,0.1410,-5.403,0,Touch and Go,50,2013,0.0444,110.268,0.733,2013
7768,0.2510,Ed Sheeran,0.838,237333,0.492,0,2pJZ1v8HezrAoZ0Fhzby92,0.000000,1,0.2620,-5.690,0,What Do I Know?,70,2017-03-03,0.0380,115.092,0.895,2017
7846,0.4480,Ed Sheeran,0.747,191147,0.760,0,3Lfiu5sZ4M4B6JaKMBc0FU,0.000000,1,0.1530,-4.294,1,Barcelona,68,2017-03-03,0.1870,99.975,0.682,2017
8002,0.1350,"Ed Sheeran,Eminem,50 Cent",0.852,207760,0.666,1,0AtP8EkGPn6SwxKDaUuXec,0.000000,11,0.7980,-6.923,0,Remember The Name (feat. Eminem & 50 Cent),73,2019-07-12,0.1840,91.046,0.726,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160526,0.8920,Ed Sheeran,0.555,200756,0.257,0,7iBSkXB0pTvZasOLf0Qxk9,0.000001,10,0.1240,-12.637,1,Autumn Leaves - Deluxe Edition,57,2011-09-09,0.0299,89.013,0.431,2011
160743,0.7910,Ed Sheeran,0.618,404480,0.239,0,5puU24G3lHVsUXPAWW2ZpV,0.000000,8,0.1000,-8.371,0,Make It Rain,51,2013,0.0274,96.846,0.327,2013
160787,0.0113,Ed Sheeran,0.806,219840,0.608,0,6gTJaPuj8DT8RjuDJyBgzP,0.000000,1,0.6350,-7.008,1,Don't,51,2013,0.0659,95.049,0.849,2013
161335,0.2140,"Ed Sheeran,Chance the Rapper,PnB Rock",0.746,206187,0.787,1,543bCW2ruMPmxUBWirQ3MR,0.000000,4,0.0669,-6.373,1,Cross Me (feat. Chance the Rapper & PnB Rock),73,2019-05-24,0.1200,95.005,0.607,2019


In [362]:
#import google_trans_new
#from google_trans_new import google_translator  
#detector = google_translator()  
#
#def try_detect(cell):
#    try:
#        detected_lang = detector.detect(cell)
#    except:
#        detected_lang = None
#    return detected_lang
#
#Spotify['Title language'] = Spotify['Title'].apply(try_detect)
#Spotify['Artist name language'] = Spotify['artists'].apply(try_detect)

In [363]:
LIST = Spotify['artists'].unique()
LIST =pd.DataFrame(LIST)

In [364]:
print(LIST)

                                       0
0                        Carl Woitschach
1      Robert Schumann,Vladimir Horowitz
2                    Seweryn Goszczyński
3                       Francisco Canaro
4      Frédéric Chopin,Vladimir Horowitz
...                                  ...
33370                  LEGADO 7,Junior H
33371                    DripReport,Tyga
33372        Leon Bridges,Terrace Martin
33373                     Kygo,Oh Wonder
33374             Cash Cash,Andy Grammer

[33375 rows x 1 columns]


In [365]:
#Spotify_with_origin = pd.read_csv("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/Spotify_with_Origin.csv", sep=";")

In [366]:
#Spotify_with_origin1 = Spotify_with_origin[Spotify_with_origin['Title language'].notna() & Spotify_with_origin['Artist name language'].notna()].reset_index()
#from ast import literal_eval
#
#c = ['Artist name language ISO','Artist name language FULL']
#Spotify_with_origin1[c] = pd.DataFrame(Spotify_with_origin1['Artist name language'].map(literal_eval).tolist())#
#
#d = ['Title language ISO','Title language FULL']
#Spotify_with_origin1[d] = pd.DataFrame(Spotify_with_origin1['Title language'].map(literal_eval).tolist())

# Chinese subset 

In [None]:
# Start the per-origin subsets from a freshly loaded copy of the raw table.
Spotify = pd.read_csv(r"C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/data.csv")

# `artists` appears to hold the text of a Python list (e.g. "['A', 'B']").
# Trim the leading "['" and trailing "']" characters, then collapse the
# "', '" separators into plain commas.
# Plain literals replace the invalid "\[" escapes, and regex=False keeps the
# separator replacement a literal match on every pandas version.
Spotify['artists'] = Spotify['artists'].map(lambda x: x.lstrip("'[").rstrip("']"))
Spotify['artists'] = Spotify['artists'].str.replace("', '", ",", regex=False)

# The rest of the notebook refers to the track name as "Title".
Spotify = Spotify.rename(columns={'name': 'Title'})

In [None]:
#Spotify

In [None]:
import chardet
with open("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/ChineseArtists2.txt", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

# check what the character encoding might be
print(result)

In [None]:
China0 = pd.read_csv("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/ChineseArtists2.txt", sep=";", encoding='UTF-16')

In [None]:
China = pd.read_csv("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/Chinese_artists.csv", sep=";", encoding='cp1252')

In [None]:
CHINESE = pd.concat([China0,China])

In [None]:
list_chinese_artists = CHINESE['artists'].unique()

In [None]:
# Keep only tracks whose (cleaned) artist string is in the Chinese artist list.
# .copy() makes the subset an independent frame, so the 'Artist origin' column
# added afterwards does not raise pandas' SettingWithCopyWarning.
Spotify_Chinese = Spotify[Spotify['artists'].isin(list_chinese_artists)].copy()

In [None]:
Spotify_Chinese['Artist origin'] = "China/Chinese (中国/中文)"

# French Subset 

In [None]:
#French = Spotify_with_origin1[Spotify_with_origin1['Title language FULL']=='french']

In [None]:
#FRENCH1ART = FRENCH1.drop_duplicates()
#FRENCH1ART = pd.DataFrame(FRENCH1ART)

In [None]:
#FRENCH1ART.to_csv("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/French_artists.csv", sep=";", encoding="utf-8-sig")

In [None]:
French0 = pd.read_csv("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/FrenchArtists2.txt", sep=";", encoding='UTF-16')

In [None]:
list_french_artists = French0['artists'].unique()

In [None]:
# Keep only tracks whose (cleaned) artist string is in the French artist list.
# .copy() makes the subset an independent frame, so adding the origin column
# below does not raise pandas' SettingWithCopyWarning.
Spotify_French = Spotify[Spotify['artists'].isin(list_french_artists)].copy()
Spotify_French['Artist origin'] = "France (France)"

In [None]:
Spotify_French

# Swedish subset

In [None]:
#Swedish = Spotify_with_origin1[Spotify_with_origin1['Title language FULL'] == "swedish"]

Swedish music is a harder nut to crack because many artists choose to perform in English. So what we need to do is create a list of Swedish artists and bands, much as we did for the Chinese artists. 


In [None]:
with open("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/SwedishArtists2.txt", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

# check what the character encoding might be
print(result)

In [None]:
Swedish0 = pd.read_csv("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/SwedishArtists2.txt", sep=";", encoding='Windows-1252')

In [None]:
list_swedish_artists = Swedish0['artists'].unique()

In [None]:
# Keep only tracks whose (cleaned) artist string is in the Swedish artist list.
# .copy() makes the subset an independent frame, so the 'Artist origin' column
# added afterwards does not raise pandas' SettingWithCopyWarning.
Spotify_Swedish = Spotify[Spotify['artists'].isin(list_swedish_artists)].copy()

In [None]:
Spotify_Swedish['Artist origin'] = "Sweden (Sverige)"

# Korean subset 

In [None]:
#Spotify_with_origin1 = Spotify_with_origin[Spotify_with_origin['Title language'].notna() & Spotify_with_origin['Artist name language'].notna()].reset_index()

In [None]:
#Spotify_with_origin1 = Spotify_with_origin[Spotify_with_origin['Title language'].notna() & Spotify_with_origin['Artist name language'].notna()].reset_index()
#from ast import literal_eval
#
#c = ['Artist name language ISO','Artist name language FULL']
#Spotify_with_origin1[c] = pd.DataFrame(Spotify_with_origin1['Artist name language'].map(literal_eval).tolist())
#
#d = ['Title language ISO','Title language FULL']
#Spotify_with_origin1[d] = pd.DataFrame(Spotify_with_origin1['Title language'].map(literal_eval).tolist())


In [None]:
#d = ['Title language ISO','Title language FULL']
#Spotify_with_origin1[d] = pd.DataFrame(Spotify_with_origin1['Title language'].map(literal_eval).tolist())


In [None]:
with open("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/SouthKoreanArtists2.txt", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

# check what the character encoding might be
print(result)

In [None]:
Korean0 = pd.read_csv("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/SouthKoreanArtists2.txt", sep=";", encoding='UTF-16')

In [None]:
list_SouthKorean_artists = Korean0['artists'].unique()

In [None]:
# Keep only tracks whose (cleaned) artist string is in the South Korean artist list.
# .copy() makes the subset an independent frame, so the 'Artist origin' column
# added afterwards does not raise pandas' SettingWithCopyWarning.
Spotify_SouthKorean = Spotify[Spotify['artists'].isin(list_SouthKorean_artists)].copy()

In [None]:
Spotify_SouthKorean['Artist origin'] = "South Korea/Korean (대한민국/한국어)"

# American subset 

In [None]:
USA0 = pd.read_csv("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/USAArtists2.txt", sep=";", encoding='UTF-16')

In [None]:
list_USA_artists = USA0['artists'].unique()

In [None]:
# Keep only tracks whose (cleaned) artist string is in the USA artist list.
# .copy() makes the subset an independent frame, so the 'Artist origin' column
# added afterwards does not raise pandas' SettingWithCopyWarning.
Spotify_USA = Spotify[Spotify['artists'].isin(list_USA_artists)].copy()

In [None]:
Spotify_USA.shape

In [None]:
Spotify_USA['Artist origin'] = "USA"

In [None]:
Spotify_USA[Spotify_USA['artists']=='Ed Sheeran']

# UK Subset

In [None]:
with open("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/UKArtists2.txt", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

# check what the character encoding might be
print(result)

In [None]:
UK0 = pd.read_csv("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/UKArtists2.txt", sep=";", encoding='UTF-16')

In [None]:
# Unique artist names from the UK artist file (UK0). The list was previously
# built from USA0, which made the "UK" subset a duplicate of the USA subset.
list_UK_artists = UK0['artists'].unique()

In [None]:
len(list_UK_artists)

In [None]:
# Keep only tracks whose (cleaned) artist string is in the UK artist list.
# .copy() makes the subset an independent frame, so the 'Artist origin' column
# added afterwards does not raise pandas' SettingWithCopyWarning.
Spotify_UK = Spotify[Spotify['artists'].isin(list_UK_artists)].copy()

In [None]:
Spotify_UK['Artist origin'] = "UK"

# Together

In [None]:
# Per-origin subsets that are concatenated into the reduced dataset.
# NOTE(review): Spotify_Swedish is built and labelled above but is not listed
# here — confirm whether leaving the Swedish subset out is intentional.
dfs = [Spotify_Chinese, Spotify_French, Spotify_SouthKorean, Spotify_USA, Spotify_UK]

In [None]:
SPotify_Reduced = pd.concat(dfs)

In [None]:
SPotify_Reduced.shape

In [None]:
SPotify_Reduced.to_csv("C:/Users/SDEGOSSONDEVARENNE/repos/SpotifyBlog/ToySpotify/Spotify/Spotify_reduced.csv", sep=";", encoding='utf_8_sig')

In [None]:

# Label each track released in 1920-2029 with its decade as a string
# ('1920', '1930', ... '2020'); rows outside that window are left untouched.
release_year = SPotify_Reduced['year']
covered = (release_year >= 1920) & (release_year < 2030)
# Floor the year to the start of its decade, e.g. 1987 -> 1980 -> '1980'.
decade_labels = (release_year[covered] // 10 * 10).astype(int).astype(str)
# Assign by position (.to_numpy()) so repeated index labels left over from the
# concatenation cannot interfere with label alignment.
SPotify_Reduced.loc[covered, 'Song Decade'] = decade_labels.to_numpy()


In [None]:
# Dense rank of `popularity` within each artist origin (ties share a rank).
SPotify_Reduced['rank'] = SPotify_Reduced.groupby('Artist origin')['popularity'].rank(method='dense', ascending=True)

# Min-max rescale the rank inside each origin group and round to whole numbers.
# transform() always returns a result aligned row-for-row with the input frame,
# whereas the index returned by groupby().apply() depends on the pandas version
# and can fail to align when it is assigned back as a column.
# NOTE(review): the formula maps ranks onto 1..101, not 1..100, and yields NaN
# for a group with a single distinct rank — confirm both are intended.
SPotify_Reduced['Language Specific Popularity'] = (
    SPotify_Reduced.groupby('Artist origin')['rank']
    .transform(lambda x: 1 + 100 * (x - x.min()) / (x.max() - x.min()))
    .round(0)
)

In [None]:
SPotify_Reduced

In [None]:
list(SPotify_Reduced)

In [None]:
columns = ['id', 'Title', 'artists', 'release_date', 'year','Artist origin','Song Decade','Language Specific Popularity']
for col in columns:
  print(f'{col:<15}: {SPotify_Reduced[col].nunique()} unique values')

Spotify = SPotify_Reduced.drop(labels=['id', 'release_date'], axis=1)
#Spotify.shape

In [None]:
numeric_columns = Spotify.columns[Spotify.dtypes != 'object']
string_columns = Spotify.columns[Spotify.dtypes == 'object']
print(f'There are {len(numeric_columns)} numeric columns & {len(string_columns)} string columns')

In [None]:
Spotify_num = pd.DataFrame(data=Spotify, columns=numeric_columns, index=Spotify.index)
corr = np.abs(Spotify_num.corr())
fig, ax = plt.subplots(figsize=(16, 16))
cmap = sns.color_palette("Blues")
sns.heatmap(corr, cmap=cmap, square=True)
plt.title('Correlation between numerical features: abs values')
plt.show()

In [None]:
import numpy as np
# Absolute correlation of every numeric feature with popularity, strongest first.
series = np.abs(corr['popularity']).sort_values(ascending=False)
print('Feature correlated to popularity by correlation factor:')
for feature, strength in series.items():
    # Report only correlations in [0.2, 1); the upper bound drops popularity's
    # correlation with itself.
    if not (0.2 <= strength < 1):
        continue
    print(f'{feature:30} --> {strength: .2f} (abs)')

In [None]:

# Pairwise scatter plots for a random sample of tracks.
pair_vars = ['popularity', 'acousticness', 'danceability', 'energy', 'Language Specific Popularity',
             'key', 'liveness', 'loudness', 'speechiness', 'tempo']
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# cap the sample at the size of the frame.
sample_size = min(3000, len(Spotify))
sns_plot = sns.pairplot(Spotify.sample(sample_size), height=1, vars=pair_vars)
# Use a separate loop name so `sns_plot` keeps referring to the pair grid.
# NOTE(review): the [:1] slice rotates the tick labels of the first subplot
# only — confirm this is not meant to cover every subplot.
for axis in sns_plot.axes.flat[:1]:
    axis.tick_params(axis='x', labelrotation=90)
plt.show()

### 50 most popular artists in the Spotify dataset

In [None]:
list(Spotify)

In [None]:
#Spotify = pd.read_csv(r"C:/Users/k_sego/repos/TOY_jptNB/Spotify/data.csv")
# Working frame for the popularity charts: the reduced dataset without the two dummy columns.
Spotify = SPotify_Reduced.drop(['explicit','mode'],axis = 1)
# Strip any remaining list punctuation from the artist strings. Plain literals
# replace the invalid "\[" escapes, and regex=False keeps the separator
# replacement a literal match on every pandas version.
Spotify['artists'] = Spotify['artists'].map(lambda x: x.lstrip("'[").rstrip("']"))
Spotify['artists'] = Spotify['artists'].str.replace("', '", ",", regex=False)

# Era subsets reused by the charts that follow.
Spotify_2000 = Spotify.loc[Spotify['year'] >= 2000]
Spotify_1990s = Spotify.loc[(Spotify['year'] >= 1990) & (Spotify['year'] < 2000) ]
Spotify_1980s = Spotify.loc[(Spotify['year'] >= 1980) & (Spotify['year'] < 1990) ]

fig, ax1 = plt.subplots(figsize = (12, 10))
# Top 50 artists by summed track popularity over the whole working frame.
# The chart previously ranked the 1990s subset and was titled "30", which
# contradicted both the section heading and the head(50) selection.
lead_artists = Spotify.groupby('artists')['popularity'].sum().sort_values(ascending=False).head(50)
ax1 = sns.barplot(x=lead_artists.values, y=lead_artists.index, palette="Blues", orient="h", edgecolor='white', ax=ax1)
ax1.set_xlabel('Popularity (Count of presence in the dataset Spotify)', c='w', fontsize=16)
ax1.set_ylabel('Artist', c='w', fontsize=16)
ax1.set_title('50 Most Popular Artists', c='w', fontsize=14, weight = 'bold')
plt.show()

### 50 most popular artists in the Spotify dataset after 2000

In [None]:
fig, ax2 = plt.subplots(figsize = (20, 20))
# Top 50 artists by summed track popularity among tracks from 2000 onwards.
lead_artists = Spotify_2000.groupby('artists')['popularity'].sum().sort_values(ascending=False).head(50)
ax2 = sns.barplot(x=lead_artists.values, y=lead_artists.index, palette="Greens", orient="h", edgecolor='white', ax=ax2)
ax2.set_xlabel('Popularity (Count of presence in the dataset Spotify_2000)', c='w', fontsize=16)
ax2.set_ylabel('Artist', c='w', fontsize=16)
# The series is sorted descending, so these are the most (not least) popular artists.
ax2.set_title('50 Most Popular Artists since 2000', c='w', fontsize=20, weight = 'bold')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

In [None]:
#fig, ax2 = plt.subplots(figsize = (20, 20))
# 2x2 dashboard of the 30 artists with the highest summed popularity,
# for the whole dataset and for three eras.
fig1 = plt.figure(figsize=[30,30])
gs  = gridspec.GridSpec(100,100)

# Leave a gap between the panels so the long artist names on the y axis fit.
ax1 = fig1.add_subplot(gs[0:45,0:40])
ax2 = fig1.add_subplot(gs[0:45,60:100])
ax3 = fig1.add_subplot(gs[55:100,0:40])
ax4 = fig1.add_subplot(gs[55:100,60:100])

# One entry per panel: (axis, data, palette, x label, title).
# Fixes two label errors: the 2000s panel was titled "least Popular" although
# it shows the top of a descending ranking, and the 1990s x label named the
# Spotify_1980s dataset.
panels = [
    (ax1, Spotify, "Blues",
     'Popularity All time (Count of presence in the dataset Spotify)',
     '30 Most Popular Artists past 100 years'),
    (ax2, Spotify_2000, "Reds",
     'Popularity (Count of presence in the dataset Spotify_2000)',
     '30 most Popular Artists 00s'),
    (ax3, Spotify_1990s, "Greens",
     'Popularity 1990s (Count of presence in the dataset Spotify_1990s)',
     '30 most Popular Artists 90s'),
    (ax4, Spotify_1980s, "YlOrBr",
     'Popularity 1980s (Count of presence in the dataset Spotify_1980s)',
     '30 most Popular Artists 80s'),
]

for axis, frame, palette, xlabel, title in panels:
    # Sum popularity per artist and keep the 30 largest totals.
    leaders = frame.groupby('artists')['popularity'].sum().sort_values(ascending=False).head(30)
    sns.barplot(x=leaders.values, y=leaders.index, palette=palette, orient="h", edgecolor='white', ax=axis)
    axis.set_xlabel(xlabel, c='w', fontsize=16)
    axis.set_ylabel('Artist', c='w', fontsize=16)
    axis.set_title(title, c='w', fontsize=20, weight = 'bold')

#fig1.savefig('C:/Users/k_sego/repos/TOY_jptNB/Graphs/Popularity.png')
plt.show()

Let's do the same, but in an interactive plot where the decade can be picked.

#### Grouping the data by decades 

In [None]:
DecadeAverages = Spotify[['danceability','energy','liveness','acousticness', 'valence','Song Decade','speechiness']].groupby('Song Decade').mean().sort_values(by='Song Decade').reset_index()
DecadeAverages

In [None]:
plt.figure(figsize=(14,8))
plt.title("Trends Over Time", fontsize=15)
lines = ['danceability','energy','liveness','acousticness', 'valence','speechiness']
for line in lines:
    ax = sns.lineplot(x='Song Decade', y=line, data=DecadeAverages)
plt.legend(lines)

In [None]:
DecadeAverages_melted = DecadeAverages.melt(id_vars='Song Decade')
DecadeAverages_melted

In [None]:
plt.figure(figsize=(20,10))
plt.title("Trends Over Time", fontsize=12)
sns.lineplot(x='Song Decade', y='value', hue='variable', data=DecadeAverages_melted)

In [None]:
Spotify.artists.value_counts()[:40]
Spotify['popularity'].mean()
artists = Spotify.artists.value_counts().index[:40]


In [None]:
Spotify

In [None]:
Spotify_artists = Spotify[Spotify.artists.isin(artists)][['artists','Song Decade','energy']].groupby(['artists','Song Decade']).count().reset_index()
Spotify_artists.rename(columns={'energy':'count'}, inplace=True)
Spotify_artists

In [None]:
plt.figure(figsize=(20,20))
sns.lineplot(x='Song Decade', y='count', hue='artists', data=Spotify_artists)

In [None]:
# Zero-filled decade x artist scaffold in long form: one row per
# (decade, artist) pair with a `count` of 0.
# The shape is derived from the data instead of the hard-coded (10, 40), which
# raised when fewer than 40 artists were available, and the decade range now
# reaches 2020 to match the 'Song Decade' labels assigned earlier.
decades = np.arange(1920, 2030, 10)
Shaped_Spotify = pd.DataFrame(np.zeros((len(decades), len(artists))), columns=artists)
Shaped_Spotify['Song Decade'] = decades
print(Shaped_Spotify.shape)
Shaped_Spotify = Shaped_Spotify.melt(id_vars='Song Decade',var_name='artists', value_name='count')
print(Shaped_Spotify.shape)
Shaped_Spotify


In [None]:
Spotify

In [None]:
music_keys = {0: 'C', 1: 'C#,Db', 2: 'D', 3: 'D#,Eb', 4: 'E', 5: 'F', 6: 'F#,Gb', 7: 'G', 8: 'G#,Ab', 9: 'A', 10: 'A#,Bb', 11: 'B'}
Spotify['music_key'] = Spotify['key'].map(music_keys)



In [None]:
Spotify['music_key']

In [None]:
plt.figure(figsize=(20,20))
sns.countplot(x = 'music_key', data=Spotify , order=Spotify['music_key'].value_counts().index)
plt.title("Count songs by keys")
plt.show()

In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objs as go

In [None]:
list(Spotify)

What we want is a set of categorical variables to break popularity down by. Note, however, that it is the language-specific popularity that should be plotted. 

In [None]:
Spotify['Artist_origin'] = Spotify['Artist origin']
Spotify['Song_Decade'] = Spotify['Song Decade']

In [None]:
# Total global and language-specific popularity per artist / origin / decade / key.
# The two value columns are selected with a list: indexing a groupby with a
# bare tuple of column names is rejected by current pandas versions.
Spotify_g = Spotify.groupby(['artists','Artist_origin','Song_Decade','music_key'])[['popularity','Language Specific Popularity']].sum().reset_index()

In [None]:
Spotify = Spotify_g.sort_values(['Song_Decade'])

In [None]:
key_options          = Spotify["music_key"].astype(str).unique()
language_options     = Spotify["Artist_origin"].astype(str).unique()
decade_options       = Spotify["Song_Decade"].astype(str).unique()

In [None]:
colors = {
    'background': '#111111',
    'text': '#7FDBFF'
}

In [None]:
app = dash.Dash()
#app.layout = html.Div(html.H1('Heading', style={'backgroundColor':'blue'})
# Page layout: a heading, three filter dropdowns (key, origin, decade) and the
# bar chart that the callback redraws.
# Each dropdown starts on an "All ..." value. That value is now also offered as
# the first option; previously it was missing from the option list, so it
# could not be re-selected once another entry had been picked.
app.layout = html.Div([
    html.H2("Popularity of songs"),
    html.Div(
        [
            dcc.Dropdown(
                id="music_key",
                options=[{
                    'label': 'All music keys',
                    'value': 'All music keys'
                }] + [{
                    'label': i,
                    'value': i
                } for i in key_options],
                value='All music keys'
            ),
            dcc.Dropdown(
                id="Artist_origin",
                options=[{
                    'label': 'All origins',
                    'value': 'All origins'
                }] + [{
                    'label': i,
                    'value': i
                } for i in language_options],
                value='All origins'
             ),
            dcc.Dropdown(
                id="Song_Decade",
                options=[{
                    'label': 'All Decades',
                    'value': 'All Decades'
                }] + [{
                    'label': i,
                    'value': i
                } for i in decade_options],
                value='All Decades'
            ),
        ],
        style={'width': '25%',
               'display': 'inline-block'}),
     dcc.Graph(id='funnel-graph'),
    ])

@app.callback(
    dash.dependencies.Output('funnel-graph', 'figure'),
    [dash.dependencies.Input('music_key','value'),
    dash.dependencies.Input('Artist_origin','value'),
    dash.dependencies.Input('Song_Decade','value')])
def update_graph(music_key,Artist_origin,Song_Decade):
    """Build the grouped bar chart for the current dropdown selection.

    Each argument is the value of one dropdown. A value equal to that
    dropdown's "All ..." label, or ``None``, applies no filter on the
    corresponding column; any other value keeps only the rows whose column
    (compared as text) equals it.

    Returns a figure dict with two bar traces per artist: the
    language-specific popularity and the global popularity.
    """
    # (column to filter, selected value, label meaning "no filter")
    selections = (
        ('music_key', music_key, "All music keys"),
        ('Artist_origin', Artist_origin, "All origins"),
        ('Song_Decade', Song_Decade, "All Decades"),
    )

    # Apply the active filters one after another. This replaces an if/elif
    # chain that enumerated all combinations and left `Spotify_plot` unbound
    # (UnboundLocalError) whenever no branch matched, e.g. for a None value.
    Spotify_plot = Spotify
    shown = []
    for column, chosen, wildcard in selections:
        if chosen is None or chosen == wildcard:
            shown.append(wildcard)
            continue
        shown.append(chosen)
        Spotify_plot = Spotify_plot[Spotify_plot[column].astype(str) == str(chosen)]
    key_label, origin_label, decade_label = shown

    trace1 = go.Bar(x=Spotify_plot['artists'], y=Spotify_plot['Language Specific Popularity'], name='Language Specific Popularity',marker_color='pink')
    trace2 = go.Bar(x=Spotify_plot['artists'], y=Spotify_plot['popularity'], name='Global Popularity',marker_color='black')

    return {
        'data': [trace1, trace2],
        'layout':
        go.Layout(
            title='Artist popularity from {} during {}. Song in {} key'.format(origin_label,decade_label,key_label),
            barmode ='group',plot_bgcolor='rgb(30, 215, 96)',paper_bgcolor='rgb(30, 215, 96)')
    }


if __name__ == '__main__':
    app.run_server()

In [29]:
Years = Subset_spotify_full['year'].unique()
#Artist = Subset_spotify_full['artists'].unique()
min_year  = min(Years)
max_year = max(Years)

NameError: name 'Subset_spotify_full' is not defined

### evolution of genre popularity over time

In [None]:
#Spotify = pd.read_csv(r"C:/Users/k_sego/repos/TOY_jptNB/Spotify/data.csv")
df_2 = pd.read_csv(r"C:/Users/k_sego/repos/TOY_jptNB/Spotify/data_w_genres.csv")
df = pd.read_csv(r"C:/Users/k_sego/repos/TOY_jptNB/Spotify/data_by_genres.csv")

from tqdm.notebook import tqdm

def str2list(x):
    """Parse the text of a Python literal (e.g. "['rock', 'pop']") into an object.

    Returns the parsed value, or ``np.nan`` when ``x`` is not a valid literal
    (malformed text, or a non-string such as the 0 used to fill missing genres).
    """
    # Imported here because the notebook never imports `ast` at the top level.
    # The old bare `except` hid the resulting NameError, so every call
    # returned NaN.
    import ast

    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        return np.nan

In [None]:
# Columns that are identifiers or categorical; every other column of the
# genre table is treated as a feature to standardise later.
out_cols = ["genres", "artists", "mode", "count", "key"]
in_cols = [x for x in df.columns if x not in out_cols] 

# Index the genre table by genre and drop the row for the empty genre "[]".
# `axis` is passed by keyword: the positional form is rejected by pandas 2.x.
df = df.set_index("genres")[in_cols].drop("[]", axis=0)
df #genre data

#fill nan values by 0
df_2.set_index("artists", inplace=True)
#dfmi.loc[:, ('one', 'second')]
# Mark empty genre lists (the text "[]") as missing with a single .loc
# assignment; the chained form df_2["genres"][mask] = ... may write to a
# temporary copy and triggers SettingWithCopyWarning.
df_2.loc[df_2["genres"] == "[]", "genres"] = np.nan
#df_2["genres"][df_2["genres"] == "[]"] = np.nan
df_2["genres"] = df_2["genres"].fillna(0)
df_2



In [None]:
df_2

In [None]:


#standardize data
# z-score every feature column at once: subtract the column mean and divide by
# the column standard deviation.
df_2_std = df_2.copy()
feature_values = df_2[in_cols]
df_2_std[in_cols] = (feature_values - feature_values.mean()) / feature_values.std()
       

In [None]:
df_2_std

In [None]:

#extract individual genres from genre lists
# Turn each artist row into one row per genre. Rows whose `genres` cell does
# not parse to a non-empty list are dropped.
df_2_std.reset_index(inplace = True)
collist = list(df_2_std.columns)

# Parse the stored list text once per row, then keep rows with at least one genre.
parsed_genres = df_2_std["genres"].map(str2list)
has_genres = parsed_genres.map(lambda g: isinstance(g, list) and len(g) != 0)

# DataFrame.explode repeats the other columns for every list element. It
# replaces a per-row loop that copied rows with pd.concat(..., 0), a positional
# `axis` argument that pandas 2.x rejects.
df_known = (
    df_2_std.loc[has_genres, collist]
    .assign(genres=parsed_genres[has_genres])
    .explode("genres")
    .reset_index(drop=True)
)
#export

df_known.to_csv("data_each_genres.csv")
print(df_known)

In [None]:
#fill nan values by 0
# NOTE(review): no assignment to `SPotify_genre` is visible in this notebook
# (a later cell defines `Spotify_genre`, with different capitalisation), so
# this cell presumably raises NameError as written — confirm which frame is meant.
SPotify_genre.set_index("artists", inplace=True)
# Mark empty genre lists (the text "[]") as missing, then replace missing values with 0.
SPotify_genre["genres"][SPotify_genre["genres"] == "[]"] = np.nan
SPotify_genre["genres"] = SPotify_genre["genres"].fillna(0)
SPotify_genre


#standardize data
df_2_std = df_2.copy()
for col in in_cols:
    df_2_std[col] = (df_2[col]-df_2[col].mean())/df_2[col].std()
       
#extract individual genres from genre lists
df_2_std.reset_index(inplace = True)
collist = list(df_2_std.columns)
new_rows = []
for index in tqdm(range(len(df_2_std))):
    row = df_2_std.iloc[index]
    genre_list = str2list(row["genres"])
    row = pd.DataFrame(row).transpose()
    if(not(isinstance(genre_list, list) and len(genre_list) != 0)):
        pass
    else:
        if(len(genre_list) == 1):
            row["genres"] = genre_list[0]
            new_rows.append(list(row.values[0]))
        else:
            row = pd.concat([row for i in range(len(genre_list))], 0)
            row["genres"] = genre_list
            for i in range(len(genre_list)):
                new_rows.append(list(row.values[i]))
                
df_known = pd.DataFrame(new_rows, columns = collist)

In [None]:
df_known

In [None]:
# Scatter of the average 'mean' value for each 'count' value.
# NOTE(review): no visible cell creates 'count' or 'mean' columns on `Spotify`;
# this presumably expects a per-artist aggregate built elsewhere — confirm the
# input frame before running.
fig, ax = plt.subplots(figsize = (15, 3))
stat = Spotify.groupby('count')['mean'].mean().to_frame().reset_index()
ax = stat.plot(x='count', y='mean', marker='.', linestyle = '', ax=ax)
ax.set_xlabel('Count of appearances in data', fontsize=12, c='r')
ax.set_ylabel('Mean Popularity', fontsize=12, c='r')
plt.show()

## PCA on Spotify

Let's play with a smaller dataset that also includes information on the genre. 

In [None]:
Spotify_key      = pd.read_csv(r"C:/Users/k_sego/repos/TOY_jptNB/Spotify/spotify2.csv", sep=";", encoding= "iso-8859-1")
numeric_columns  = Spotify_key.columns[Spotify_key.dtypes != 'object']
string_columns   = Spotify_key.columns[Spotify_key.dtypes == 'object']
Spotify_Num      = pd.DataFrame(data=Spotify_key, columns=numeric_columns, index=Spotify_key.index)

In [None]:
Spotify_Num      = pd.DataFrame(data=Spotify_key, columns=numeric_columns, index=Spotify_key.index)
Spotify_target   = Spotify_key['Top Genre']

In [None]:
def normalize(df):
    """Return a min-max scaled copy of ``df``; the input is not modified.

    Every column is mapped onto [0, 1] as (value - min) / (max - min).
    A column whose values are all equal has a zero range; it is set to 0.0
    instead of being turned into NaN by a 0/0 division.
    """
    result = df.copy()
    for feature_name in df.columns:
        column = df[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value
        if value_range == 0:
            # Constant column: every value sits at the minimum.
            result[feature_name] = 0.0
        else:
            result[feature_name] = (column - min_value) / value_range
    return result

In [None]:
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
Spotify_key["Top Genre"] = pd.Categorical(Spotify_key["Top Genre"])
Spotify_key["Top Genre"] = Spotify_key["Top Genre"].cat.codes

In [None]:

Spotify_genre = Spotify_key.drop(['Index','Length (Duration)', 'Title','Artist','Year',], axis = 1)

In [None]:
Spotify_genre = normalize(Spotify_genre)

In [None]:
cols = Spotify_genre.shape[1]
# Feature matrix: every column except the last, converted to float.
# ndarray.astype returns a new array, so the result is kept here; the
# conversion was previously computed and then discarded.
Spotify_genre_data = Spotify_genre.values[:, 0:(cols-1)].astype(float)
# NOTE(review): the category is read from column 0, which is also the first
# feature column above — confirm which column actually holds the genre label.
Spotify_genre_category = Spotify_genre.values[:,0]
Spotify_genre_data


In [None]:
# Number of clusters
k = 12

# Number of training data
n = Spotify_genre_data.shape[0]
# Number of features in the data
c = Spotify_genre_data.shape[1]

# Generate random centers, here we use sigma and mean to ensure it represent the whole data
mean = np.mean(Spotify_genre_data, axis = 0)
std = np.std(Spotify_genre_data, axis = 0)
centers = np.random.randn(k,c)*std + mean


In [None]:
# Plot the first two feature columns coloured by category, then overlay the
# randomly initialised cluster centres as white stars.
colors=['orange', 'blue', 'green','red','black', 'yellow']
# Resolve one colour per point up front and draw all points in a single
# scatter call rather than one call per point.
point_colors = [colors[int(category)] for category in Spotify_genre_category]
plt.scatter(Spotify_genre_data[:, 0], Spotify_genre_data[:, 1], s=7, c=point_colors)
plt.scatter(centers[:,0], centers[:,1], marker='*', c='w', s=150)

In [None]:
range(n)

In [None]:
n

In [None]:
mylist= list(range(1,322))
test1 = pd.DataFrame({'score': mylist})
from scipy.stats import rankdata
import numpy as numpy

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from ipywidgets import interactive

# Directory holding the Spotify CSV exports.  Kept in one place so the
# notebook can be pointed at another checkout by editing a single line.
SPOTIFY_DATA_DIR = r"C:/Users/k_sego/repos/TOY_jptNB/Spotify"

Spotify_full         = pd.read_csv(f"{SPOTIFY_DATA_DIR}/data.csv")
Spotify_with_genres  = pd.read_csv(f"{SPOTIFY_DATA_DIR}/data_w_genres.csv")
Spotify_by_genre     = pd.read_csv(f"{SPOTIFY_DATA_DIR}/data_by_genres.csv")

In [None]:
Spotify_full

In [None]:
#Spotify_full['Year'] = pd.to_datetime(Spotify_full.year, format='%Y')

In [None]:
# Take an explicit copy so the assignment below modifies this frame and does
# not write into a slice of Spotify_full (which triggers SettingWithCopyWarning).
Subset_spotify_full = Spotify_full[['artists','year', 'energy','key','acousticness','popularity']].copy()
# Strip the leading "['" and trailing "']" characters from the artists text,
# then collapse the "', '" separators into plain commas.  The separator is
# replaced literally (regex=False) so the result does not depend on the
# pandas version's default for `regex`.
Subset_spotify_full['artists'] = (
    Subset_spotify_full['artists']
    .str.lstrip("'[")
    .str.rstrip("']")
    .str.replace("', '", ",", regex=False)
)

In [None]:
lst_col = 'genres'

def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Expand list-valued columns so that every list element gets its own row.

    Args:
        df: Source DataFrame.
        lst_cols: Name, or list-like of names, of the columns holding lists.
            When several columns are given, the row lengths are taken from
            the first one.
        fill_value: Value written into the list columns for rows whose list
            is empty.
        preserve_index: When true, keep the original index labels (repeated
            once per element); otherwise the result gets a fresh 0..n-1 index.

    Returns:
        A new DataFrame in the original row order, with the non-list columns
        repeated per element and the list columns flattened.
    """
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    non_empty = lens > 0
    # preserve original index values
    idx = np.repeat(df.index.values, lens)
    # repeat the scalar columns once per list element
    res = pd.DataFrame(
        {col: np.repeat(df[col].values, lens) for col in idx_cols},
        index=idx,
    )
    # flatten the list columns; np.concatenate rejects an empty sequence, so
    # fall back to an empty array when no row has any element
    flattened = {}
    for col in lst_cols:
        cells = df.loc[non_empty, col].values
        flattened[col] = (np.concatenate(cells) if len(cells)
                          else np.array([], dtype=object))
    res = res.assign(**flattened)
    # add back those rows that have empty lists
    if (lens == 0).any():
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent
        res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
                 .fillna(fill_value))
    # revert the original index order; a stable sort keeps the elements of
    # one list in their original sequence
    res = res.sort_index(kind="mergesort")
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res

In [None]:
list(Spotify)

In [None]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [None]:
# Distinct 'Song Decade' values and their smallest / largest entries.
Decades = Spotify['Song Decade'].unique()
#Artist = Subset_spotify_full['artists'].unique()
min_Decades, max_Decades = min(Decades), max(Decades)

In [None]:
max_Decades

In [None]:
def _default_choice(options, preferred):
    """Return ``preferred`` when it is selectable, else the first option (or None)."""
    if preferred in options:
        return preferred
    return options[0] if options else None


# Decade slider, bounded by the decades found in the data.
year = widgets.IntSlider(
    value=1960,
    min=min_Decades,
    max=max_Decades,
    step=10,
    description='Decade:',
    continuous_update=False
)

# Checkbox that switches the decade filter on or off.
use_date = widgets.Checkbox(
    description='Song Decade: ',
    value=True,
)

container = widgets.HBox(children=[use_date, year])

# Popularity selector.  A Dropdown rejects an initial value that is not among
# its options, so fall back to the first available value when 5 is absent.
popularity_options = Spotify['popularity'].unique().tolist()
textbox = widgets.Dropdown(
    description='Popularity:',
    value=_default_choice(popularity_options, 5),
    options=popularity_options
)

# Key selector, with the same fallback for the initial value.
key_options = list(Spotify['key'].unique())
origin = widgets.Dropdown(
    options=key_options,
    value=_default_choice(key_options, 5),
    description='Key:',
)


# Figure widget with two scatter traces: energy and acousticness
trace1 = go.Scatter(x=Spotify['energy'], opacity=0.75, name='Energy')
trace2 = go.Scatter(x=Spotify['acousticness'], opacity=0.75, name='Acousticness')
# NOTE(review): barmode is set although both traces are Scatter traces —
# presumably left over from a histogram/bar version; confirm it is needed.
g = go.FigureWidget(data=[trace1, trace2],
                    layout=go.Layout(
                        title=dict(
                            text='Spotify Dataset'
                        ),
                        barmode='overlay'
                    ))

In [None]:
def validate():
    """Report whether the key selected in ``origin`` occurs in ``Spotify['key']``."""
    known_keys = Spotify['key'].unique()
    return origin.value in known_keys


def response(change):
    """Refresh the traces of ``g`` from the rows matching the widget values.

    Rows of ``Spotify`` are kept when their popularity equals ``textbox.value``
    and their key equals ``origin.value``; when ``use_date`` is ticked they
    must also have ``'Song Decade'`` equal to ``year.value``.  Nothing happens
    when ``validate()`` reports an unknown key.

    Args:
        change: Change notification passed by the observed widget; not used.
    """
    if not validate():
        return

    # Vectorised boolean mask instead of a Python-level loop over every row.
    mask = (Spotify['popularity'] == textbox.value) & (Spotify['key'] == origin.value)
    if use_date.value:
        mask = mask & (Spotify['Song Decade'] == year.value)
    temp_df = Spotify[mask]

    x1 = temp_df['energy']
    x2 = temp_df['acousticness']
    # Apply all figure changes as one update.
    with g.batch_update():
        g.data[0].x = x1
        g.data[1].x = x2
        g.layout.barmode = 'overlay'
        g.layout.xaxis.title = 'Energy'
        g.layout.yaxis.title = 'Value'


# Re-run response() whenever the value of any of the four controls changes.
for _control in (origin, textbox, year, use_date):
    _control.observe(response, names="value")

In [None]:
# Second control row (key and popularity), stacked under the decade row and
# above the figure widget.
container2 = widgets.HBox(children=[origin, textbox])
widgets.VBox(children=[container, container2, g])


