In [1]:
import pandas as pd
# Limit how many rows pandas renders for a displayed frame/series; longer
# results are shown truncated with an ellipsis row.
pd.set_option('display.max_rows', 13)

In [2]:
# Dataset source: https://www.kaggle.com/leonardopena/top50spotify2019
# The first CSV column is used as the row index of the frame.
df = pd.read_csv(
    'top50.csv',
    encoding='iso-8859-1',
    index_col=0,
)

# Metadata

In [3]:
# Column labels of the frame, returned as an Index.
df.columns

Index(['Track.Name', 'Artist.Name', 'Genre', 'Beats.Per.Minute', 'Energy',
       'Danceability', 'Loudness..dB..', 'Liveness', 'Valence.', 'Length.',
       'Acousticness..', 'Speechiness.', 'Popularity'],
      dtype='object')

In [4]:
# Row labels of the frame (here the 'ID' column chosen via index_col=0 at load time).
df.index

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
           dtype='int64', name='ID')

In [5]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(50, 13)

In [6]:
# Data type of each column; text columns are reported as 'object'.
df.dtypes

Track.Name          object
Artist.Name         object
Genre               object
Beats.Per.Minute     int64
Energy               int64
Danceability         int64
Loudness..dB..       int64
Liveness             int64
Valence.             int64
Length.              int64
Acousticness..       int64
Speechiness.         int64
Popularity           int64
dtype: object

In [7]:
df.size # total number of elements (cells), i.e. rows * columns

650

# Statistical Functions

In [8]:
# Column-wise reductions; the function names are self-explanatory.
# Only the last expression of a cell is rendered, so the output below is
# the kurtosis result.
#
# numeric_only=True is passed to every statistic that is undefined for the
# text columns (Track.Name, Artist.Name, Genre). Older pandas silently
# skipped such columns; pandas >= 2.0 raises TypeError instead. The
# remaining calls (count/sum/mode/min/max) also work on text and are left
# operating on every column.
df.count()
df.sum()
df.product(numeric_only=True)

df.mean(numeric_only=True)
df.median(numeric_only=True)
df.mode()
df.min()
df.max()

df.std(numeric_only=True)
df.var(numeric_only=True)
df.skew(numeric_only=True)
df.kurtosis(numeric_only=True)

Beats.Per.Minute   -0.577184
Energy             -0.706359
Danceability        2.767259
Loudness..dB..      1.021652
Liveness            4.858902
Valence.           -0.806982
Length.             1.159347
Acousticness..      0.514041
Speechiness.        0.825746
Popularity          3.709390
dtype: float64

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,120.06,64.06,71.38,-5.66,14.66,54.6,200.96,22.16,12.48,87.5
std,30.898392,14.231913,11.92988,2.056448,11.118306,22.336024,39.143879,18.995553,11.161596,4.491489
min,85.0,32.0,29.0,-11.0,5.0,10.0,115.0,1.0,3.0,70.0
25%,96.0,55.25,67.0,-6.75,8.0,38.25,176.75,8.25,5.0,86.0
50%,104.5,66.5,73.5,-6.0,11.0,55.5,198.0,15.0,7.0,88.0
75%,137.5,74.75,79.75,-4.0,15.75,69.5,217.5,33.75,15.0,90.75
max,190.0,88.0,90.0,-2.0,58.0,95.0,309.0,75.0,46.0,95.0


In [10]:
# First quartile (25th percentile) of every numeric column.
# numeric_only=True keeps the text columns out of the computation; it was the
# implicit default before pandas 2.0, where omitting it now raises TypeError.
df.quantile(.25, numeric_only=True)

Beats.Per.Minute     96.00
Energy               55.25
Danceability         67.00
Loudness..dB..       -6.75
Liveness              8.00
Valence.             38.25
Length.             176.75
Acousticness..        8.25
Speechiness.          5.00
Popularity           86.00
Name: 0.25, dtype: float64

In [11]:
# Ranks each value against the other values in the same column (text columns
# included). Pass axis=1 to rank values within each row instead.
# Fractional ranks such as 43.5 in the output arise from tied values sharing
# an averaged rank.
df.rank()

Unnamed: 0_level_0,Track.Name,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,34.0,43.5,10.5,30.0,13.0,31.5,20.0,13.0,40.5,20.5,7.0,4.0,3.0
2,7.0,2.0,48.5,26.0,46.0,37.0,40.0,13.0,28.5,49.0,13.0,29.5,46.0
3,49.0,3.5,17.5,50.0,45.0,2.0,40.0,38.5,38.0,19.0,19.5,50.0,12.0
4,4.0,13.5,40.0,8.0,24.0,9.5,6.0,13.0,25.0,25.5,19.5,39.0,13.5
5,12.0,37.5,22.5,40.0,24.0,6.0,40.0,25.5,3.5,12.0,43.0,24.5,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
46,27.0,33.5,5.5,3.0,19.5,11.5,49.5,50.0,19.0,16.5,11.5,13.5,22.5
47,43.0,36.0,32.0,46.0,38.0,13.5,40.0,13.0,43.5,29.5,34.0,19.5,22.5
48,13.0,33.5,5.5,20.5,42.0,17.0,47.0,40.0,34.0,36.0,29.5,13.5,22.5
49,5.0,46.5,25.0,24.5,31.5,7.0,20.0,48.0,22.5,38.0,33.0,4.0,22.5


In [12]:
# Correlation coefficient for every pair of numeric columns.
# The text columns are dropped explicitly first: pandas >= 2.0 no longer skips
# them automatically and raises ValueError when asked to correlate strings.
df.select_dtypes('number').corr() # swap .corr() for .cov() to get covariance

Unnamed: 0,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity
Beats.Per.Minute,1.0,0.043756,-0.094183,0.017016,-0.167286,-0.011586,-0.139288,-0.03145,0.557052,0.196097
Energy,0.043756,1.0,0.018254,0.670794,0.162768,0.43882,0.224677,-0.339892,-0.08986,-0.080295
Danceability,-0.094183,0.018254,1.0,0.016255,-0.149636,0.172829,-0.000185,-0.098165,-0.103472,-0.071413
Loudness..dB..,0.017016,0.670794,0.016255,1.0,0.258652,0.237614,0.219219,-0.1383,-0.272213,-0.043085
Liveness,-0.167286,0.162768,-0.149636,0.258652,1.0,0.016123,0.131782,0.021328,-0.125286,0.092564
Valence.,-0.011586,0.43882,0.172829,0.237614,0.016123,1.0,-0.017782,-0.052323,-0.053242,-0.317752
Length.,-0.139288,0.224677,-0.000185,0.219219,0.131782,-0.017782,1.0,-0.076293,0.046755,-0.087639
Acousticness..,-0.03145,-0.339892,-0.098165,-0.1383,0.021328,-0.052323,-0.076293,1.0,0.008293,-0.034684
Speechiness.,0.557052,-0.08986,-0.103472,-0.272213,-0.125286,-0.053242,0.046755,0.008293,1.0,0.238553
Popularity,0.196097,-0.080295,-0.071413,-0.043085,0.092564,-0.317752,-0.087639,-0.034684,0.238553,1.0


In [13]:
# Running (cumulative) total down each selected column: every row holds the sum
# of that column's values up to and including that row.
# Pass axis=1 to accumulate across each row instead.
df[['Energy','Liveness']].cumsum()

Unnamed: 0_level_0,Energy,Liveness
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,55,8
2,136,16
3,216,32
4,281,40
5,346,51
...,...,...
46,2900,660
47,2975,668
48,3054,685
49,3124,726


In [14]:
# The same two columns without accumulation, for comparison with the
# cumulative sums shown above.
df[['Energy','Liveness']]

Unnamed: 0_level_0,Energy,Liveness
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,55,8
2,81,8
3,80,16
4,65,8
5,65,11
...,...,...
46,62,58
47,75,8
48,79,17
49,70,41


## Series Only Functions

In [15]:
# Number of rows per distinct genre, most frequent first.
df['Genre'].value_counts()

dance pop           8
pop                 7
latin               5
canadian hip hop    3
edm                 3
                   ..
pop house           1
atl hip hop         1
big room            1
r&b en espanol      1
boy band            1
Name: Genre, Length: 21, dtype: int64

In [16]:
# Distinct genre values, returned as a NumPy array.
df['Genre'].unique()

array(['canadian pop', 'reggaeton flow', 'dance pop', 'pop', 'dfw rap',
       'trap music', 'country rap', 'electropop', 'reggaeton',
       'panamanian pop', 'canadian hip hop', 'latin', 'escape room',
       'pop house', 'australian pop', 'edm', 'atl hip hop', 'big room',
       'boy band', 'r&b en espanol', 'brostep'], dtype=object)

In [17]:
# The three highest Energy values, keeping their row labels (ID).
df['Energy'].nlargest(3)

ID
35    88
29    86
26    82
Name: Energy, dtype: int64