# Additional cleaned data checks

## Import libraries

In [61]:
import pandas as pd
import scipy.stats
from sklearn.model_selection import train_test_split

## Read the data

In [62]:
# Load the processed dataset from disk, then preview five randomly sampled rows.
df = pd.read_csv("data/processed_data.csv", low_memory=False)
df.sample(5)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,duration_ms,time_signature,genre,song_name,title,text_feature
4203,0.814,0.483,1,-6.235,0,0.0415,0.00774,0.00679,0.26,0.265,114.03,audio_features,134748,3,Dark Trap,Banana Clip,,Banana Clip
35817,0.636,0.993,10,-2.543,0,0.488,0.0102,0.247,0.0441,0.53,149.774,audio_features,232842,4,trap,,I'm a fucking Headbanger,I'm a fucking Headbanger
15983,0.44,0.727,11,-7.336,1,0.212,0.0522,0.0,0.103,0.653,179.99,audio_features,192333,4,RnB,That's My,,That's My
13375,0.509,0.93,0,-6.235,0,0.0474,0.00137,0.0,0.152,0.783,154.95,audio_features,149640,4,Emo,False Pretense,,False Pretense
4666,0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,0.175,186.948,audio_features,123661,3,Underground Rap,ProductOfDrugs (Prod. The Virus and Antidote),,ProductOfDrugs (Prod. The Virus and Antidote)


## Explore the data

In [63]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41975 entries, 0 to 41974
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      41975 non-null  float64
 1   energy            41975 non-null  float64
 2   key               41975 non-null  int64  
 3   loudness          41975 non-null  float64
 4   mode              41975 non-null  int64  
 5   speechiness       41975 non-null  float64
 6   acousticness      41975 non-null  float64
 7   instrumentalness  41975 non-null  float64
 8   liveness          41975 non-null  float64
 9   valence           41975 non-null  float64
 10  tempo             41975 non-null  float64
 11  type              41975 non-null  object 
 12  duration_ms       41975 non-null  int64  
 13  time_signature    41975 non-null  int64  
 14  genre             41975 non-null  object 
 15  song_name         21234 non-null  object 
 16  title             20735 non-null  object

## Define and assert data quality

### Check for required columns and their types

In [64]:
def test_column_presence_and_type(data):

    required_columns = {
        "time_signature":   pd.api.types.is_integer_dtype,
        "key":              pd.api.types.is_integer_dtype,
        "danceability":     pd.api.types.is_float_dtype,
        "energy":           pd.api.types.is_float_dtype,
        "loudness":         pd.api.types.is_float_dtype,
        "speechiness":      pd.api.types.is_float_dtype,
        "acousticness":     pd.api.types.is_float_dtype,
        "instrumentalness": pd.api.types.is_float_dtype,
        "liveness":         pd.api.types.is_float_dtype,
        "valence":          pd.api.types.is_float_dtype,
        "tempo":            pd.api.types.is_float_dtype,
        "duration_ms":      pd.api.types.is_integer_dtype,
        "text_feature":     pd.api.types.is_string_dtype,
        "genre":            pd.api.types.is_string_dtype
    }
    
    assert set(data.columns.values).issuperset(set(required_columns.keys()))
    
    for col_name, format_verification_funct in required_columns.items():
        assert format_verification_funct(data[col_name]), \
        f"Column {col_name} failed test {format_verification_funct}"

### Check for known genres 

In [65]:
def test_genres(data):
    
    known_genres = [
        "Dark Trap",
        "Underground Rap",
        "Trap Metal",
        "Emo",
        "Rap",
        "RnB",
        "Pop",
        "Hiphop",
        "techhouse",
        "techno",
        "trance",
        "psytrance",
        "trap",
        "dnb",
        "hardstyle"
    ]
    
    assert data["genre"].isin(known_genres).all()

### Check for column ranges

In [66]:
def test_column_ranges(data):
    
    column_ranges = {
        "time_signature": (1, 5),
        "key": (0, 11),
        "danceability": (0, 1),
        "energy": (0, 1),
        "loudness": (-35, 5),
        "speechiness": (0, 1),
        "acousticness": (0, 1),
        "instrumentalness": (0, 1),
        "liveness": (0, 1),
        "valence": (0, 1),
        "tempo": (50, 250),
        "duration_ms": (20000, 1000000)
    }
    
    for col_name, (minimum, maximum) in column_ranges.items():
        assert data[col_name].dropna().between(minimum, maximum).all(), (
            f"Column {col_name} failed the test. Should be between {minimum} and {maximum}, "
            f"instead min={data[col_name].min()} and max={data[col_name].max()}" 
        )

### Kolmogorov-Smirnov test

In [67]:
# Check for correct data distribution

def test_kolmogorov_smirnov(data, ks_alpha, random_state=42):
    """Check that two random halves of ``data`` follow the same distributions.

    The frame is split 50/50 and a two-sample Kolmogorov-Smirnov test is run
    on every numeric feature listed below. Since several hypotheses are
    tested at once, the per-column threshold is tightened so the family-wise
    error rate stays at ``ks_alpha``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to validate; must contain the numeric feature columns below.
    ks_alpha : float
        Family-wise significance level (e.g. 0.05).
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the check is reproducible.
        Pass ``None`` for a different random split on every call.

    Raises
    ------
    AssertionError
        If any column's p-value is not above the corrected threshold.
    """
    # Split the frame passed by the caller, not a notebook-level global.
    sample1, sample2 = train_test_split(
        data, test_size=0.5, random_state=random_state
    )

    columns = [
        "danceability",
        "energy",
        "loudness",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
        "duration_ms"
    ]

    # Sidak correction for multiple hypothesis testing (slightly less
    # conservative than Bonferroni's ks_alpha / len(columns)).
    # See more: https://towardsdatascience.com/precision-and-recall-trade-off-and-multiple-hypothesis-testing-family-wise-error-rate-vs-false-71a85057ca2b
    alpha_prime = 1 - (1 - ks_alpha)**(1 / len(columns))

    for col in columns:
        ts, p_value = scipy.stats.ks_2samp(sample1[col], sample2[col])

        # NOTE: the p-value is the probability of obtaining a test statistic
        # (TS) equal to or more extreme than the one observed, by chance,
        # when the null hypothesis (same distribution) is true. If this
        # probability is not large enough, the dataset should be looked at
        # carefully, hence we fail.
        assert p_value > alpha_prime, (
            f"Column {col} failed the Kolmogorov-Smirnov test: "
            f"statistic={ts}, p-value={p_value} is not above "
            f"the corrected threshold {alpha_prime}"
        )

In [68]:
# Run the schema check on the loaded frame; it raises AssertionError on failure.
test_column_presence_and_type(df)

In [70]:
# Run the genre whitelist check on the loaded frame.
test_genres(df)

In [71]:
# Run the value-range check on the loaded frame.
test_column_ranges(df)

In [73]:
# Significance level handed to the KS check, which tightens it per column internally.
ks_alpha = 0.05
test_kolmogorov_smirnov(df, ks_alpha)

In [74]:
# If none of the checks above raised an error, they all passed