# ANALYSIS AND HYPOTHESIS TESTING ON SPOTIFY TRACKS

## **If you like it do upvote it!! :)**

# Imports

In [None]:
#imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset path used by read_csv can be looked up.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
# Use the fully qualified option name: the bare 'max_rows' pattern matches more than
# one option in recent pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_rows',1000)
sns.set(rc={'figure.figsize':(15,10)})
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error,mean_absolute_error
from math import sqrt





In [None]:
df = pd.read_csv('../input/spotify-dataset-19212020-160k-tracks/data_o.csv')


**check for null values**

In [None]:
df.isna().sum()

In [None]:
df1 = df.copy()
df1['duration_ms'].head()

In [None]:
# Build df1 from the untouched df so this cell is idempotent: the previous
# in-place rename made a second run fail with KeyError on 'duration_ms'.
df1 = df.rename(columns={'duration_ms':'duration_in_seconds'})
# The column is stored in milliseconds; convert it to seconds.
df1['duration_in_seconds'] = df1['duration_in_seconds']/1000
df1.info()

In [None]:
df[df.name=='We Will Rock You']

# Exploratory Data Analysis

# Number of unique values per column

In [None]:
from collections import defaultdict

# Count the distinct values in every column of df, keyed by column name.
frequency=defaultdict(lambda:0)

for column in df:
    frequency[column]=df[column].nunique()

# Bar chart of the per-column unique-value counts.
fig,ax=plt.subplots(figsize=(25, 8))

# Materialise the dict views as lists so matplotlib receives plain sequences.
ax.bar(list(frequency.keys()),list(frequency.values()))
ax.set_title('Number of unique values per column')
ax.set_xlabel('Column')
ax.set_ylabel('Unique values')

plt.show()


In [None]:
frequency

In [None]:
df1

In [None]:
df.popularity.unique()

In [None]:
df.hist(figsize=(15,15))
plt.show()

In [None]:
plt.figure(figsize=(15,8))
# Correlate only numeric/boolean columns: df1 also holds text columns (e.g. name,
# artists), and DataFrame.corr() raises on those in pandas 2.x instead of dropping them.
sns.heatmap(df1.select_dtypes(include=['number','bool']).corr(),annot=True)
plt.show()

In [None]:
# Sum popularity over all rows sharing a track name and keep the 20 largest totals.
g_pn = df1.groupby("name")['popularity'].sum().sort_values(ascending=False)[:20]
sns.set(rc={'figure.figsize':(15,8)})
# x/y are passed by keyword: recent seaborn releases reject positional data arguments.
axis = sns.barplot(x=g_pn.index, y=g_pn,palette='rocket')
axis.set_title('Top Tracks with Popularity')
axis.set_ylabel('Popularity')
axis.set_xlabel('Tracks')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Sum popularity per artists entry and keep the 20 largest totals.
g_ap = df1.groupby("artists")['popularity'].sum().sort_values(ascending=False)[:20]
# Set the figure size BEFORE drawing; setting it afterwards only affects later figures.
sns.set(rc={'figure.figsize':(12,7)})
# x/y are passed by keyword: recent seaborn releases reject positional data arguments.
axis = sns.barplot(x=g_ap.index, y=g_ap,palette='magma_r')
axis.set_title('Top Artists with Popularity')
axis.set_ylabel('Popularity')
axis.set_xlabel('Artists')
plt.xticks(rotation = 90)
plt.show()

In [None]:
columns = ["acousticness","danceability","energy","speechiness","liveness","valence"]
# Yearly mean of every listed audio feature, computed with a single groupby.
yearly_means = df1.groupby('year')[columns].mean()
plt.figure(figsize=(15,10))
for c in columns:
    # x/y are passed by keyword: recent seaborn releases reject positional data arguments.
    sns.lineplot(x=yearly_means.index,y=yearly_means[c],label=c)

plt.title('Audio characteristics over year')
plt.xlabel('Year')
plt.ylabel('Characteristics')
plt.show()

In [None]:
# Number of tracks per artists entry; keep the 20 largest counts.
g_an = df1.groupby('artists')['name'].count().sort_values(ascending=False)[:20]
# Set the figure size BEFORE drawing; setting it afterwards only affects later figures.
sns.set(rc={'figure.figsize':(12,7)})
# x/y are passed by keyword: recent seaborn releases reject positional data arguments.
axis = sns.barplot(x=g_an.index, y=g_an,palette='winter')
axis.set_title('Top artists with tracks')
axis.set_ylabel('Track count')
axis.set_xlabel('Artists')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Mean danceability per artists entry; keep the 20 highest means.
g_an = df1.groupby('artists')['danceability'].mean().sort_values(ascending=False)[:20]
# Set the figure size BEFORE drawing; setting it afterwards only affects later figures.
sns.set(rc={'figure.figsize':(12,7)})
# x/y are passed by keyword: recent seaborn releases reject positional data arguments.
axis = sns.barplot(x=g_an.index, y=g_an,palette='summer')
axis.set_title('Top artists with danceability')
axis.set_ylabel('danceability')
axis.set_xlabel('Artists')
plt.xticks(rotation = 90)
plt.show()

In [None]:
df1['year'].describe()

In [None]:
# Era boundaries used to bucket the release year.
bins = [1920,1960,2000,2020]
# include_lowest=True closes the first interval on the left, so a track dated exactly
# 1920 lands in '20s-60s' instead of becoming NaN.
df1['year_bins'] = pd.cut(df1['year'],bins,labels=['20s-60s','60s-2000','2000-2020'],include_lowest=True)
df1['year_bins'].head(10)

In [None]:
# Mean popularity per era, highest first. observed=False is spelled out because
# year_bins is categorical and the implicit default is changing across pandas versions.
g_yp = df1.groupby('year_bins',observed=False)['popularity'].mean().sort_values(ascending=False)
# Set the figure size BEFORE drawing; setting it afterwards only affects later figures.
sns.set(rc={'figure.figsize':(12,7)})
# x/y are passed by keyword: recent seaborn releases reject positional data arguments.
axis = sns.barplot(x=g_yp.index, y=g_yp,palette='autumn_r')
axis.set_title('popularity categories')
axis.set_xlabel('Categories')
axis.set_ylabel('popularity')
plt.show()

In [None]:
# Configure the default figure size first so the figure created below picks it up.
sns.set(rc={'figure.figsize':(12,7)})
plt.figure()
# x/y are passed by keyword: recent seaborn releases reject positional data arguments.
sns.barplot(x='explicit',y='popularity',data=df1,palette='rocket_r')
plt.show()

In [None]:
# Configure the default figure size first so the plot drawn below picks it up.
sns.set(rc={'figure.figsize':(12,7)})
# x/y are passed by keyword: recent seaborn releases reject positional data arguments.
sns.barplot(x='key',y='popularity',data=df1)
plt.show()

**Histogram of Popularity Column**

In [None]:
df.popularity.hist(figsize=(15,15),bins=10)

In [None]:
# Seed NumPy's global RNG so the sample below is the same on every run.
np.random.seed(6)
# Draw 500 popularity values at random from the full column
# (np.random.choice samples with replacement by default).
popular = np.random.choice(a= df['popularity'], size=500)
# Compare the sample mean with the mean of the whole column.
print("sample mean",popular.mean())
print("population mean",df['popularity'].mean())

# Standard deviation of the whole popularity column (not of the sample).
print("standard deviation",df['popularity'].std())

# Hypothesis Testing

# Z-test

**significance level (alpha) = 0.05**

In [None]:
from statsmodels.stats.weightstats import ztest

In [None]:
def ZTest(x1,value,s,alpha=0.05):
    """Run a one-sample z-test of ``x1`` against ``value`` and print the outcome.

    Parameters
    ----------
    x1 : array-like
        Sample observations, forwarded to ``ztest`` as ``x1``.
    value : float
        Mean under the null hypothesis, forwarded to ``ztest`` as ``value``.
    s : str
        Direction of the alternative hypothesis: ``'below'`` reports a
        left-tailed alternative, any other value a right-tailed one.
    alpha : float, optional
        Significance level used for the decision (default 0.05).

    Returns
    -------
    None
        The z-score, p-value and decision are printed, not returned.

    Notes
    -----
    ``ztest`` is called without an ``alternative`` argument, so the p-value is
    the one produced by its default setting.
    """
    z_value , p_value = ztest(x1 = x1,value =value )

    print('Z-score is :{}\n'.format(z_value))
    print('P-value is :{:.50f}\n'.format(p_value))

    null_rejected = p_value<alpha
    if null_rejected:
        print('Null Hypothesis Rejected\n')
    else:
        print('Null Hypothesis is not Rejected\n')

    # Only claim the alternative when the null was actually rejected AND the
    # statistic points in the requested direction; previously the message was
    # printed even when the null hypothesis had just been retained.
    if null_rejected:
        if s=='below':
            if z_value<0:
                print("Alternative Hypothesis: Accepted \t Left Tailed test \n")
        else:
            if z_value>0:
                print("Alternative Hypothesis: Accepted \t Right Tailed test \n")
        

Null Hypothesis: "White Christmas was the most popular song"

Alternate Hypothesis: "White Christmas was not the most popular song"

In [None]:
ZTest(df[df['name']=='White Christmas']['popularity'],df['popularity'].mean(),'above')

As the p-value is greater than 0.05, we cannot reject the null hypothesis,

so there is no evidence against "White Christmas" being the most popular song.

Null Hypothesis: "Frank Sinatra was the most popular artist"

Alternate Hypothesis: "Frank Sinatra was not the most popular artist"

In [None]:
ZTest(df[df['artists']=="['Frank Sinatra']"]['popularity'],df['popularity'].mean(),'above')


As the p-value is less than 0.05, we reject the null hypothesis,

so Frank Sinatra was not the most popular artist.

# t-test

**significance level (alpha) = 0.05**

In [None]:
import scipy.stats as stats
import math

def TTest(a,popmean,s,alpha=0.05):
    """Run a one-sample t-test of ``a`` against ``popmean`` and print the outcome.

    Parameters
    ----------
    a : array-like
        Sample observations, forwarded to ``scipy.stats.ttest_1samp`` as ``a``.
    popmean : float
        Mean under the null hypothesis, forwarded as ``popmean``.
    s : str
        Direction of the alternative hypothesis: ``'below'`` reports a
        left-tailed alternative, any other value a right-tailed one.
    alpha : float, optional
        Significance level used for the decision (default 0.05).

    Returns
    -------
    None
        The t-score, p-value and decision are printed, not returned.

    Notes
    -----
    ``ttest_1samp`` is called without an ``alternative`` argument, so the
    p-value is the one produced by its default setting.
    """
    tscore,p_value=stats.ttest_1samp(a= a,popmean= popmean)

    print('T-score is :{}\n'.format(tscore))
    print('P-value is :{:.50f}\n'.format(p_value))

    null_rejected = p_value<alpha
    if null_rejected:
        print('Null Hypothesis Rejected\n')
    else:
        print('Null Hypothesis is not Rejected\n')

    # Only claim the alternative when the null was actually rejected AND the
    # statistic points in the requested direction; previously the message was
    # printed even when the null hypothesis had just been retained.
    if null_rejected:
        if s=='below':
            if tscore<0:
                print("Alternative Hypothesis: Accepted \t Left Tailed test \n")
        else:
            if tscore>0:
                print("Alternative Hypothesis: Accepted \t Right Tailed test \n")
    
    
    

In [None]:
np.random.seed(40)
chrs = np.random.choice(a= df['acousticness'], size=10000)

Null Hypothesis: "Acousticness did not decrease over the years"

Alternate Hypothesis: "Acousticness decreased over the years"

In [None]:
TTest(chrs,df['acousticness'].mean(),'below')

As the p-value is greater than 0.05, we cannot reject the null hypothesis,

so there is no evidence that acousticness decreased over the years.

# Chi-Square Test and ANOVA (F-Test)

In [None]:
import statsmodels.api as sm
from scipy.stats import chi2_contingency

def chi_square_test(feature1,feature2,data=None,alpha=0.05):
    """Chi-square test of independence between two columns; prints the result.

    Parameters
    ----------
    feature1, feature2 : str
        Names of the two columns to cross-tabulate.
    data : pandas.DataFrame, optional
        Frame holding both columns. When omitted, the notebook-level ``df``
        is used, which is what the function always did before.
    alpha : float, optional
        Significance level used for the decision (default 0.05).

    Returns
    -------
    None
        The column shares, test statistic, p-value, degrees of freedom and
        decision are printed, not returned.
    """
    # Fall back to the notebook-level frame to stay compatible with existing calls.
    if data is None:
        data = df

    # Contingency table of observed counts: feature1 values x feature2 values.
    sample_table=pd.crosstab(data[feature1],data[feature2])

    # Each cell divided by its column total, i.e. a fraction of the column.
    col_sum = sample_table.sum(axis=0)
    col_percentage = (sample_table)/col_sum
    print("")
    print("Column percent are->>>>>")
    print(col_percentage)
    print("")

    stat, p, dof, expected = chi2_contingency(sample_table)
    if p<=alpha:
        mssg="Dependent (Reject Null Hypothesis)"
    else:
        mssg="Independent (Null Hypothesis holds true)"

    print('Chi-square value is: {:.2f} \tp-value: {} \tdf: {} \tOutcome: {}'.format(stat, p,dof,mssg))

In [None]:
from scipy.stats import f_oneway
def anova(sample1,sample2,sample3,*more_samples,alpha=0.05):
    """One-way ANOVA across three or more samples; prints the result.

    Parameters
    ----------
    sample1, sample2, sample3 : array-like
        The groups to compare.
    *more_samples : array-like
        Optional additional groups, forwarded to ``f_oneway`` unchanged.
    alpha : float, optional
        Significance level used for the decision (default 0.05).

    Returns
    -------
    None
        The F statistic, p-value and decision are printed, not returned.
    """
    F,p=f_oneway(sample1,sample2,sample3,*more_samples)
    # 'Accept' here means the null hypothesis was not rejected at level alpha.
    result='Reject' if p<=alpha else 'Accept'
    print('F Statistic: {:.2f} \tp-value: {:.5f} \tNull Hypothesis: {}'.format(F, p,result))

# Chi Sq Test

**significance level (alpha) = 0.05**

Null Hypothesis : There is no relation between year of release and popularity of song

In [None]:
chi_square_test('popularity','year')

# ANOVA Test

**significance level (alpha) = 0.05**

Null Hypothesis : There is no difference between the mean speechiness, energy and danceability

In [None]:
# Seed NumPy's global RNG so the three samples below are the same on every run.
np.random.seed(40)
# Draw 100 values at random from each feature column; the three draws are made
# separately, so the samples do not refer to the same tracks.
speechiness = np.random.choice(a= df['speechiness'], size=100)
energy = np.random.choice(a= df['energy'], size=100)
danceability=np.random.choice(a= df['danceability'], size=100)


In [None]:
anova(speechiness,energy,danceability)