In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Top 50 Spotify 2019 CSV into a DataFrame; the file is decoded as latin1
df = pd.read_csv('../input/top50spotify2019/top50.csv', encoding='latin1')

In [None]:
# Display the loaded DataFrame
df

## Potential questions/observations to find insights

### How many words in the song titles?
### How long has each artist been mainstream?
### How many songs in the top 50 does each artist have?
### Where are the artists from?
### Do songs tend towards relatively high BPM or low BPM?
### Scatter plot between BPM and energy
### Scatter plot between BPM and Danceability
### Scatter plot between BPM and Valence
### Proportion of songs above/below a Liveness value of 25
### Descriptive statistics for length
### Proportion of songs with Acousticness above/below value of 50

# --------------------------------------------------------------------------------------------------

### How many words in the song titles?

In [None]:
#We count the words in each song title, stopping at the first word that contains a bracket or a dash (so bracketed text and anything following a dash is not counted)

In [None]:
def _count_title_words(title):
    """Return how many words of *title* come before the first word with '(' or '-'.

    The title is split on whitespace. Counting stops at the first word that
    contains an opening bracket or a dash, so that word and everything after
    it is excluded from the count.
    """
    count = 0
    for word in title.split():
        if '(' in word or '-' in word:
            break
        count += 1
    return count


# Build the whole column in one assignment. Writing element by element with
# df['words_in_title'][i] = ... is chained assignment, which pandas does not
# guarantee to write back to df (and never does under Copy-on-Write).
# Applying over the column also removes the hard-coded row count of 50.
df['words_in_title'] = df['Track.Name'].apply(_count_title_words)


In [None]:
# Display the DataFrame, which now has the 'words_in_title' column
df

In [None]:
# List the column labels of the DataFrame
df.columns

The general tendency seems to be towards fewer words, which makes sense given that these are titles. It is difficult using this data to estimate whether the title length contributes to the success of the track, or if that tendency of fewer words in titles may be a general trend that does not impact success.

### How many years has this artist been known(mainstream)?

In [None]:
#These values are researched from Google and entered into 'val' manually.
#val[i] is the number of years the artist in row i of df has been mainstream.
val = [6,4,9,9,5,9,1,8,1,1,4,7,1,6,10,15,11,5,1,1,1,1,1,1,1,6,1,1,9,15,5,9,8,7,12,7,6,9,15,1,6,14,12,3,3,5,6,5,5,9]

# Assign the list as a column in a single step. The element-wise form
# df['Mainstream'][i] = val[i] is chained assignment, which pandas does not
# guarantee to write back to df. Direct assignment also raises a ValueError
# if len(val) does not match the number of rows, instead of silently
# leaving rows unset.
df['Mainstream'] = val

In [None]:
# Display the DataFrame, which now has the 'Mainstream' column
df

The numbers here seem to reflect two large groups of artists: those who are newly popular (a year or less), and those who have been popular for at least 5 years. There seem to be only a few instances of artists between 1 and 5 years. These observations may indicate that most artists are popular either by introducing a new style, which may or may not remain popular, or by having established a following and using their own unique artistic model, which has brought them success over a long period.

### How many songs in the top 50 does each artist have?

In [None]:
# Number of distinct 'Unnamed: 0' values per artist
# (presumably one value per track row, i.e. tracks per artist -- confirm)
tracks_by_artist = df.groupby('Artist.Name')
ap = tracks_by_artist['Unnamed: 0'].nunique()
print(ap)

It is not surprising to see that the greatest proportion of artists have only one track that made the list, given the huge number of diverse and popular artists in the world. Artists who have two tracks in the list show that certain artists understand the formula to create contemporary music which has a mass appeal. It is astonishing to see an artist like Ed Sheeran with 4 tracks in this list.

### What proportion of artists are American? European? Other parts of the world?

In [None]:
#We manually enter the origin of each artist, researched on Google.
#origin[i] is the region code for the artist in row i of df:
#'NA' = North America, 'LA' = Latin America, 'EU' = Europe.
origin = ['NA','LA','NA','EU','NA','EU','NA','EU','NA','NA','LA','EU','EU','LA','NA','NA','LA','NA','NA','NA','EU','NA','LA','LA','NA','NA','NA','NA','LA','LA','NA','NA','LA','NA','NA','EU','LA','EU','NA','NA','EU','NA','NA','NA','LA','NA','LA','NA','NA','EU']

# Assign the list as a column in a single step. The element-wise form
# df['Origin'][i] = origin[i] is chained assignment, which pandas does not
# guarantee to write back to df; it also first created an integer column
# (filled with 0) and then wrote strings into it.
df['Origin'] = origin
df

In [None]:
# Tally the artists per region code and plot/print each region's share.
# Each code is counted explicitly so that an unexpected or missing value in
# 'Origin' is not silently counted as Europe, and the denominator is the
# actual number of rows rather than a hard-coded 50.
origin_counts = df['Origin'].value_counts()
total_tracks = len(df)

na = int(origin_counts.get('NA', 0))
la = int(origin_counts.get('LA', 0))
eu = int(origin_counts.get('EU', 0))

slices = [na / total_tracks, la / total_tracks, eu / total_tracks]
lbl = ['North America','Latin America', 'Europe']

plt.pie(slices, labels=lbl)
print('North America: ' + str(na / total_tracks))
print('Latin America: ' + str(la / total_tracks))
print('Europe: ' + str(eu / total_tracks))

![](https://1z1euk35x7oy36s8we4dr6lo-wpengine.netdna-ssl.com/wp-content/uploads/2018/12/spotify-subscribers-by-region.png)

Image Source (Spotify): https://www.businessofapps.com/data/spotify-statistics/

We see that North America (specifically the United States) has a disproportionately large number of artists in this list (56%), far surpassing Europe and Latin America, even though Europe has a greater share of subscribers and Latin America is not far behind the US in that value. It is difficult to determine the cause of this inconsistency, but the data clearly shows that it exists. Perhaps the US has a greater market for music production, whereas Europe has a greater market for music consumption.

### Do songs tend towards faster or slower BPM?

In [None]:
# Find the smallest and largest BPM and the midpoint of that range.
# Series.min()/max() replace the manual scan, whose `elif` skipped the
# maximum check for any row that set a new minimum (so the first row could
# never become the maximum). They also cover every row instead of a
# hard-coded 50.
bpm_column = df['Beats.Per.Minute']
minimum = int(bpm_column.min())
maximum = int(bpm_column.max())

full_range = maximum - minimum
# Integer midpoint between the minimum and maximum BPM
middle = full_range // 2 + minimum

In [None]:
# Split the tracks into "fast" (BPM strictly above the midpoint) and "slow"
# (everything else) and report each group's share of all tracks.
bpm_values = df['Beats.Per.Minute']
total_tracks = len(df)

fast = int((bpm_values > middle).sum())
# Every track that is not fast counts as slow, as in the original else branch
slow = total_tracks - fast

print('Maximum BPM: '+str(maximum))
print('Middle BPM: ' + str(middle))
print('Minimum BPM: '+str(minimum))
# The threshold in the labels comes from `middle`, so the text stays correct
# if the data (and therefore the midpoint) changes
print('Proportion of fast BPM tracks (above ' + str(middle) + 'BPM): ' + str(fast / total_tracks))
print('Proportion of slow BPM tracks (' + str(middle) + 'BPM or less): ' + str(slow / total_tracks))

The middle of the range between the max and min BPM is calculated to be 137BPM. We see that 26% of tracks are above the middle, while 74% of the tracks are at or below that value. This insight can be combined with insights from the following research article: https://www.frontiersin.org/articles/10.3389/fpsyg.2018.02118/full. Please explore the article for a proper understanding; in part, it explains that higher BPM elicits more positive emotions in listeners, whereas lower BPM music elicits more negative emotions. Although specific BPM ranges are not specified in the article, it is interesting to consider the implications about the listeners of the top 50 tracks.

### Scatter plot between BPM and energy of track

In [None]:
# Pull out the two columns compared in this section
BPM, energy = df['Beats.Per.Minute'], df['Energy']

In [None]:
# Two-column array: column 0 is BPM, column 1 is energy
BPM_energy = np.column_stack([BPM, energy])

# Fit a 3-cluster k-means model on the (BPM, energy) points, then label each point
kmeans1 = KMeans(n_clusters=3).fit(BPM_energy)
y_kmeans1 = kmeans1.predict(BPM_energy)

In [None]:
# Cluster centers from kmeans1: column 0 is BPM, column 1 is energy
centers = kmeans1.cluster_centers_

# Raw data points
plt.scatter(x=BPM, y=energy)
plt.ylabel('Energy')
plt.xlabel('Beats per minute')

# Overlay the cluster centers as large, semi-transparent black markers
plt.scatter(centers[:, 0], centers[:, 1], s=200, c='black', alpha=0.5)

The gray cluster centers help identify that there are certain areas with a large proportion of points. This may indicate a correlation between BPM and energy of a track.

### Scatter plot between BPM and Danceability of track

In [None]:
# Danceability column, compared against BPM in this section
dncblt = df['Danceability']

In [None]:
# Two-column array: column 0 is BPM, column 1 is danceability
bpm_dncblt = np.column_stack((BPM,dncblt))

# Fit a 3-cluster k-means model on the (BPM, danceability) points
kmeans2 = KMeans(n_clusters = 3)
kmeans2.fit(bpm_dncblt)

# Store this model's labels under their own name. The original wrote them to
# y_kmeans1, overwriting the labels of the BPM/energy model.
y_kmeans2 = kmeans2.predict(bpm_dncblt)

In [None]:
# Cluster centers from kmeans2: column 0 is BPM, column 1 is danceability
centers2 = kmeans2.cluster_centers_

# Raw data points
plt.scatter(x=BPM, y=dncblt)
plt.ylabel('Danceability')
plt.xlabel('Beats per minute')

# Overlay the cluster centers as large, semi-transparent black markers
plt.scatter(centers2[:, 0], centers2[:, 1], s=200, c='black', alpha=0.5)

The gray cluster centers help identify that there are certain areas with a large proportion of points. This indicates a possible correlation between BPM and Danceability.

### Scatter plot between BPM and Valence of track

##### Note: A higher valence value indicates a positive mood for the track

In [None]:
# Valence column (the column label in the CSV ends with a dot)
valence = df['Valence.']

In [None]:
# Two-column array: column 0 is BPM, column 1 is valence
bpm_valence = np.column_stack([BPM, valence])

# Fit a 2-cluster k-means model on the (BPM, valence) points, then label each point
kmeans3 = KMeans(n_clusters=2).fit(bpm_valence)
y_kmeans3 = kmeans3.predict(bpm_valence)

In [None]:
# Cluster centers from kmeans3: column 0 is BPM, column 1 is valence
centers3 = kmeans3.cluster_centers_

# Raw data points
plt.scatter(x=BPM, y=valence)
plt.ylabel('Valence')
plt.xlabel('Beats per minute')

# Overlay the cluster centers as large, semi-transparent black markers
plt.scatter(centers3[:, 0], centers3[:, 1], s=200, c='black', alpha=0.5)

Although there are some potential clusters in the above graph, it would be difficult to say that there is a clear correlation between BPM and valence.

### Proportion of songs above/below a Liveness value of 25

In [None]:
# Share of tracks with a Liveness value above 25 versus 25 or below.
# The two comparisons are exact complements (> 25 and <= 25); the original
# pair (> 25 and < 26) left any value between 25 and 26 uncounted. The
# denominator is the actual number of rows rather than a hard-coded 50.
liveness = df['Liveness']
total_tracks = len(df)

high_live = int((liveness > 25).sum())
low_live = int((liveness <= 25).sum())

print('Proportion of tracks likely recorded live is: ' + str(high_live / total_tracks))
print('Proportion of tracks likely not produced live is: ' + str(low_live / total_tracks))

### Descriptive statistics for length

In [None]:
# Summary statistics for the 'Length.' column
df['Length.'].describe()

Most of the tracks appear to have a length between 160 and 240 seconds. So most of the songs are approximately between 2 minutes 40 seconds and 4 minutes long. We can speculate that these numbers reflect a track length which is not so short that it prevents the producer from fleshing out the theme of the track, but not so long that it loses the audience. It is possible that the artistic choice of the producer regarding the length of the track contributes to its success.

### Proportion of songs with Acousticness above/below value of 50

In [None]:
# Share of tracks with an Acousticness value above 50 versus 50 or below.
# The two comparisons are exact complements (> 50 and <= 50); the original
# pair (> 50 and < 51) left any value between 50 and 51 uncounted. The
# denominator is the actual number of rows rather than a hard-coded 50.
acousticness = df['Acousticness..']
total_tracks = len(df)

high_acou = int((acousticness > 50).sum())
low_acou = int((acousticness <= 50).sum())

print('Proportion of songs which we can consider acoustic: ' + str(high_acou / total_tracks))
print('Proportion of songs which we cannot consider acoustic: ' + str(low_acou / total_tracks))