# Data Analysis
<em>Aaron Wollman, Albin Joseph, Kelsey Richardson Blackwell, Will Huang</em>

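In this notebook, the code explores how the unemployment rate has changed over time and tests whether the musical attributes of Billboard hits (valence in particular) correlate with it.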

## Setup & Import Data

In [None]:
%matplotlib inline

In [None]:
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import numpy as np
from scipy.stats import linregress
import scipy.stats as st

In [None]:
# Constants
class Filepaths:
    """Relative paths of the CSV files this notebook reads."""

    # Every file lives under the shared data directory.
    root = "../data/"
    music_unemployment = root + "music_and_unemployment.csv"
    billboard_hits = root + "billboard_hits.csv"
    unemployment = root + "unemployment.csv"

class Music_Unemploy_Cols:
    """Column names of the combined music / unemployment table."""

    # Song identity and chart position
    track = "Track"
    artist = "Artist"
    placement = "Placement"

    # Date parts
    year = "Year"
    month = "Month"
    day = "Day"

    # Audio features
    danceability = "danceability"
    energy = "energy"
    key = "key"
    loudness = "loudness"
    speechiness = "speechiness"
    acousticness = "acousticness"
    liveness = "liveness"
    valence = "valence"
    tempo = "tempo"

    # Economic indicator
    unemploy_rate = "Unemployment Rate"

In [None]:
# Import Music and Unemployment CSV
# The file's first column is used as the DataFrame index (index_col = 0).
music_unemployment = pd.read_csv(Filepaths.music_unemployment, index_col = 0)
# Preview the first rows.
music_unemployment.head()

## Unemployment

In [None]:
class Unemploy_Cols:
    """Column names of the unemployment table."""

    year = "Year"
    month = "Month"
    rate = "Unemployment Rate"

In [None]:
# Import the unemployment CSV; its first column is used as the DataFrame index.
unemployement_time = pd.read_csv(Filepaths.unemployment, index_col = 0)
# Preview the first rows.
unemployement_time.head()

In [None]:
# Reshape the long table into a grid: one row per year, one column per month,
# each cell holding that month's unemployment rate.
# The arguments are passed by keyword because pandas 2.0 made the parameters of
# DataFrame.pivot keyword-only; the positional form raises a TypeError there.
unemployement_time_pivot = unemployement_time.pivot(
    index=Unemploy_Cols.year,
    columns=Unemploy_Cols.month,
    values=Unemploy_Cols.rate,
)
unemployement_time_pivot.head()

In [None]:
# Display the Standard Deviation per Year
# Work on a copy so the pivot used by the heatmaps keeps only its month columns.
temp = unemployement_time_pivot.copy()
# Row-wise sample standard deviation (pandas defaults to ddof=1, the same
# estimator as statistics.stdev). Missing months are skipped, so a year with
# gaps still gets a value instead of NaN.
temp['STD'] = unemployement_time_pivot.std(axis=1)
temp.head()

To show how the unemployment rate has changed over time, the code will use a heatmap.  The darker the shade of blue, the higher the unemployment rate.

In [None]:
# Show all years in one heatmap.
# Draw on an explicitly created figure/axes pair rather than the implicit current figure.
fig, ax = plt.subplots(figsize=(15, 20))
sns.heatmap(unemployement_time_pivot, cmap="Blues", ax=ax)
ax.set_title("Unemployment Rate between 1960 and 2020")
plt.show()

In [None]:
# Show the heatmap per decade.  Should be equivalent to the above graph.
# Largest and smallest rate anywhere in the pivot (column-wise max/min, then the
# max/min of those). They are passed as vmax/vmin to the per-decade heatmaps so
# every panel uses the same colour scale.
vmax=unemployement_time_pivot.max().max()
vmin=unemployement_time_pivot.min().min()

In [None]:
# Seven stacked panels (one per block of ten pivot rows) sharing the x axis.
fig, axes = plt.subplots(7, 1, figsize=(10, 20), sharex=True)
for i, axis in enumerate(axes):
    # Positional slice of ten consecutive rows of the pivot.
    # NOTE(review): the title assumes the pivot's first row is 1960 — confirm.
    data = unemployement_time_pivot.iloc[i * 10:(i + 1) * 10]
    axis.set_title(f"Unemployment in the {1960 + (i*10)}s")
    # Shared vmin/vmax keep the colour scale identical across panels.
    sns.heatmap(data, cmap="Blues", ax=axis, vmax=vmax, vmin=vmin)

plt.show()

## Song Valence

[Spotify's API](https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-features/) defines a song's valence as:

<blockquote>"A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry)."</blockquote>

For this project, this can be considered our "happiness" metric.

## Song Valence vs Unemployment

To compare the songs' valence to the unemployment rate, the code will first try to find a correlation between each date's average valence and the unemployment rate.

In [None]:
def regression_plot(dataframe, x_col, y_col):
    """Scatter-plot two columns and overlay their least-squares regression line.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding both columns.
    x_col : str
        Name of the column plotted on the x axis (independent variable).
    y_col : str
        Name of the column plotted on the y axis (dependent variable).

    Returns
    -------
    The axes object returned by ``dataframe.plot``, carrying the scatter
    points, the fitted line and a legend with the line equation and r-squared.
    """
    # Plot the scatter plot and keep the axes it was drawn on, so the fitted
    # line is guaranteed to land on the same axes.
    ax = dataframe.plot(kind="scatter", x=x_col, y=y_col)

    # Calculate the correlation coefficient and linear regression model
    x_values = dataframe[x_col]
    y_values = dataframe[y_col]
    fit = linregress(x_values, y_values)
    regress_values = x_values * fit.slope + fit.intercept

    # Build the legend text; the sign is chosen explicitly so a negative
    # intercept reads "y = 2.0x - 1.0" rather than "y = 2.0x + -1.0".
    slope = round(fit.slope, 2)
    intercept = round(fit.intercept, 2)
    sign = "-" if intercept < 0 else "+"
    equation = f"y = {slope}x {sign} {abs(intercept)}"
    eq_label = f"{equation} \nr-squared = {round(fit.rvalue * fit.rvalue, 3)}"

    (regress_line,) = ax.plot(x_values, regress_values, "r-", label=eq_label)
    ax.legend(handles=[regress_line], loc="best")
    return ax

In [None]:
# Group by the song's date
date_cols = [Music_Unemploy_Cols.year,
             Music_Unemploy_Cols.month,
             Music_Unemploy_Cols.day]
music_unemployment_gb = music_unemployment.groupby(date_cols)

# Find the average of unemployment rate and (unweighted) valence for each date.
# numeric_only=True restricts the mean to numeric columns: pandas 2.0 raises a
# TypeError for non-numeric columns (presumably Track and Artist here) instead
# of silently dropping them as older versions did.
avg_music_unemploy = music_unemployment_gb.mean(numeric_only=True)
rate_v_valence = avg_music_unemploy[[Music_Unemploy_Cols.unemploy_rate,
                                     Music_Unemploy_Cols.valence]]

# Create a Scatter Graph
regression_plot(rate_v_valence,
                Music_Unemploy_Cols.unemploy_rate,
                Music_Unemploy_Cols.valence)
plt.title("Unemployment Rate vs. Valence (Happiness)")
plt.xlabel("Unemployment Rate")
plt.ylabel("Valence (Happiness)")
plt.show()

From the above graph, there is <b>not</b> a good correlation between valence and the unemployment rate. With the above, the data doesn't take the song's placement in the Top 100 into account.  Let's try again using a weighted average of the Top 100.

In [None]:
# Create a new data point "Weighted Valence"
# Register the new column's name next to the other column-name constants.
Music_Unemploy_Cols.weighed_valence = "weighed valence"

# Weight for each row is 101 minus its chart placement, so a smaller placement
# number yields a larger weight.
weights = 101 - music_unemployment[Music_Unemploy_Cols.placement]
weighed_valence = weights * music_unemployment[Music_Unemploy_Cols.valence]

# Store the product as a new column on the table and preview it.
music_unemployment[Music_Unemploy_Cols.weighed_valence] = weighed_valence
music_unemployment.head()

In [None]:
# Group by the song's date
music_unemployment_gb = music_unemployment.groupby(date_cols)

# Find the average of unemployment rate and weighed valence for each date.
# numeric_only=True restricts the mean to numeric columns: pandas 2.0 raises a
# TypeError for non-numeric columns (presumably Track and Artist here) instead
# of silently dropping them as older versions did.
# NOTE(review): this is the plain mean of valence * weight per date; it is not
# divided by the sum of the weights — confirm that is the intended metric.
avg_music_unemploy = music_unemployment_gb.mean(numeric_only=True)
rate_v_valence = avg_music_unemploy[[Music_Unemploy_Cols.unemploy_rate,
                                     Music_Unemploy_Cols.weighed_valence]]

# Create a Scatter Graph
regression_plot(rate_v_valence,
                Music_Unemploy_Cols.unemploy_rate,
                Music_Unemploy_Cols.weighed_valence)
plt.title("Unemployment Rate vs. Valence (Happiness)")
plt.xlabel("Unemployment Rate")
plt.ylabel("Weighed Valence (Happiness)")
plt.show()

Even with a weighted average, there still isn't a good correlation between the average valence and the unemployment rate. Let's now look to see if there is another musical attribute that might correlate to unemployment.

## Finding an Alternative Music Attribute

In [None]:
# Flatten every monthly rate in the pivot into one list, year by year.
# All month columns are included: Year is the pivot's index rather than a
# column, so skipping the first column (as `.iloc[i, 1:]` did) dropped a whole
# month of data. Missing cells (NaN) are removed because a single NaN makes
# np.quantile return NaN.
all_rates = unemployement_time_pivot.to_numpy().ravel()
unemployment_rate_list = [rate for rate in all_rates if not pd.isna(rate)]
unemployment_rate_list[:5]

In [None]:
# Box plot of the flattened monthly unemployment rates, drawn on explicit axes.
fig, ax = plt.subplots()
ax.boxplot(unemployment_rate_list)
plt.show()

In [None]:
# Threshold for "high" unemployment: the 75th percentile of the collected rates.
high_unemployment_rate = np.quantile(unemployment_rate_list, .75)  # original author's note: 7.0

# These columns are written to `music_unemployment`. The earlier code used a
# variable named `data`, but the only `data` in this notebook is a slice of the
# unemployment pivot, which has none of these columns.
music_unemployment["weighed valence"] = music_unemployment["valence"] * (101 - music_unemployment["Placement"])
# Flag rows whose unemployment rate is at or above the threshold (1) or below it (0).
music_unemployment['High_Unemployment'] = music_unemployment['Unemployment Rate'].apply(
    lambda x: 1 if x >= high_unemployment_rate else 0)

music_unemployment.head()

In [None]:
# Add a "weighed <feature>" column for every audio feature: the feature value
# multiplied by (101 - Placement).
placement_weight = 101 - music_unemployment["Placement"]
for feature in ["valence", "danceability", "energy", "key", "loudness",
                "speechiness", "acousticness", "liveness", "tempo"]:
    music_unemployment[f"weighed {feature}"] = music_unemployment[feature] * placement_weight

# Flag rows whose unemployment rate is at or above the threshold (1) or below it (0).
music_unemployment['High_Unemployment'] = music_unemployment['Unemployment Rate'].apply(
    lambda rate: 1 if rate >= high_unemployment_rate else 0)

In [None]:
# Preview the table, including the columns added so far.
music_unemployment.head()

In [None]:
# Columns to average per date: the high-unemployment flag, the rate itself and
# every weighted audio feature.
group_columns = [
    'High_Unemployment', 'Unemployment Rate',
    'weighed valence', 'weighed danceability', 'weighed energy',
    'weighed key', 'weighed loudness', 'weighed speechiness',
    'weighed acousticness', 'weighed liveness', 'weighed tempo',
]
# One row per (Year, Month, Day) holding the mean of each selected column.
music_unemployment_group = (
    music_unemployment
    .groupby(['Year', 'Month', 'Day'])[group_columns]
    .mean()
)

In [None]:
# Weighted audio features to compare against the unemployment rate.
compare_list = [
    'weighed valence', 'weighed danceability', 'weighed energy',
    'weighed key', 'weighed loudness', 'weighed speechiness',
    'weighed acousticness', 'weighed liveness', 'weighed tempo',
]
# One scatter plot per feature, coloured by the high-unemployment flag.
# The loop variable `i` remains a notebook global after the loop finishes.
for i in compare_list:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(data=music_unemployment_group,
                    x='Unemployment Rate', y=i,
                    hue='High_Unemployment', ax=ax)

In [None]:
# Inspect one feature for the high-unemployment dates only.
# The column is named explicitly instead of reusing a loop variable left over
# from an earlier cell ('weighed tempo' is the last entry of compare_list, i.e.
# the value that variable holds after a top-to-bottom run), and a single .loc
# lookup replaces the chained indexing.
music_unemployment_group.loc[music_unemployment_group['High_Unemployment'] == 1, 'weighed tempo']

In [None]:
# Display the list of weighted feature names.
compare_list

In [None]:
# 3 x 3 grid of box plots: one panel per weighted feature, split by the
# high-unemployment flag. Flattening the axes array row by row fills the grid
# left-to-right, top-to-bottom.
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
for axis, feature in zip(axes.ravel(), compare_list):
    sns.boxplot(x='High_Unemployment', y=feature,
                data=music_unemployment_group[[feature, 'High_Unemployment']],
                ax=axis)
        

In [None]:
# Display the column names of the per-date averages table.
music_unemployment_group.columns

In [None]:
# One-way ANOVA per weighted feature: dates flagged as high unemployment
# versus all other dates.
statistic_list = []
pvalue_list = []
is_high = music_unemployment_group['High_Unemployment'] == 1
is_low = music_unemployment_group['High_Unemployment'] == 0
for feature in compare_list:
    group1 = music_unemployment_group.loc[is_high, feature]
    group2 = music_unemployment_group.loc[is_low, feature]
    # Run the test once and reuse the result (it was previously recomputed
    # for the statistic, the p-value and the printout).
    result = st.f_oneway(group1, group2)
    statistic_list.append(result[0])
    pvalue_list.append(result[1])
    print(f' ANOVA Result for {feature} vs. High_Unemployment\n {result}\n==================')

In [None]:
# Mark a feature as significant (1) when its p-value is at most 0.05, else 0.
siginificant_list = [int(p <= 0.05) for p in pvalue_list]
# Collect the ANOVA results into one table, one row per feature.
anova = pd.DataFrame({
    'Feature': compare_list,
    'Statistic': statistic_list,
    'Pvalue': pvalue_list,
    'Siginificant': siginificant_list,
})
anova.sort_values('Pvalue')

In [None]:
# Show the ANOVA table with rows flagged 1 in 'Siginificant' listed first.
anova.sort_values('Siginificant',ascending=False)

In [None]:
# Box plots for a hand-picked subset of the weighted features.
# NOTE(review): presumably the features the ANOVA flagged as significant —
# verify against the `anova` table.
significant_features = [
    'weighed danceability',
    'weighed energy',
    'weighed speechiness',
    'weighed acousticness',
    'weighed tempo',
]

fig, axes = plt.subplots(3, 3, figsize=(15, 15))
flat_axes = axes.ravel()  # row by row: left-to-right, top-to-bottom
for axis, feature in zip(flat_axes, significant_features):
    sns.boxplot(x='High_Unemployment', y=feature,
                data=music_unemployment_group[[feature, 'High_Unemployment']],
                ax=axis)

# Hide the grid cells that received no plot so the figure does not show
# empty framed axes.
for axis in flat_axes[len(significant_features):]:
    axis.set_visible(False)
        

In [None]:
# Aar