<a href="https://colab.research.google.com/github/pravallikai/Evolution-of-musical-trends-using-py/blob/main/notebooks/3_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, sys, pathlib
import pandas as pd

# connect to your GitHub repo
GITHUB_USER = "pravallikai"
REPO_NAME = "Evolution-of-musical-trends-using-py"

if not pathlib.Path(REPO_NAME).exists():
    !git clone https://github.com/{GITHUB_USER}/{REPO_NAME}.git

os.chdir(REPO_NAME)

print("✅ Repo connected. Files:", os.listdir("data/raw"))


Cloning into 'Evolution-of-musical-trends-using-py'...
remote: Enumerating objects: 87, done.[K
remote: Counting objects: 100% (87/87), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 87 (delta 23), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (87/87), 4.47 MiB | 6.47 MiB/s, done.
Resolving deltas: 100% (23/23), done.
✅ Repo connected. Files: ['dataset-of-1980s.csv', 'dataset-of-1970s.csv', 'dataset-of-2010s.csv', '.gitkeep', 'dataset-of-2000s.csv', 'dataset-of-1990s.csv', 'dataset-of-1960s.csv', 'mini_tracks.csv']


In [13]:
# Load the combined audio-feature table from the processed folder
# (path is relative to the repo root).
combined_csv = "data/processed/spotify_audio_features_1960_2019.csv"
df = pd.read_csv(combined_csv)

n_rows_cols = df.shape
print("✅ Loaded combined dataset:", n_rows_cols)

# Preview the first rows as the cell's rich output.
df.head()


✅ Loaded combined dataset: (41106, 13)


Unnamed: 0,year,track,artist,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,source_decade
0,1960's,Jealous Kind Of Fella,Garland Green,0.417,0.62,-7.727,0.0403,0.49,0.0,0.0779,0.845,185.655,1960s
1,1960's,Initials B.B.,Serge Gainsbourg,0.498,0.505,-12.475,0.0337,0.018,0.107,0.176,0.797,101.801,1960s
2,1960's,Melody Twist,Lord Melody,0.657,0.649,-13.392,0.038,0.846,4e-06,0.119,0.908,115.94,1960s
3,1960's,Mi Bomba Sonó,Celia Cruz,0.59,0.545,-12.058,0.104,0.706,0.0246,0.061,0.967,105.592,1960s
4,1960's,Uravu Solla,P. Susheela,0.515,0.765,-3.515,0.124,0.857,0.000872,0.213,0.906,114.617,1960s


In [14]:
# Shell escape: recursively list the contents of the data directory, showing
# at most the first 40 lines of the listing.
!ls -R data | head -40


data:
processed
raw

data/processed:
spotify_audio_features_1960_2019.csv

data/raw:
dataset-of-1960s.csv
dataset-of-1970s.csv
dataset-of-1980s.csv
dataset-of-1990s.csv
dataset-of-2000s.csv
dataset-of-2010s.csv
mini_tracks.csv


In [15]:
# Normalise the header names: trim surrounding whitespace, lower-case them,
# and replace spaces with underscores.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

# Fix the year column: values look like "1960's", so pull out the first run of
# four digits. expand=False yields a Series; rows without a match become NaN.
year_digits = df["year"].astype(str).str.extract(r"(\d{4})", expand=False)

# Go through float first so missing matches survive, then store as a nullable
# integer column.
df["year"] = year_digits.astype(float).astype("Int64")

print("✅ Columns standardized. Sample years:")
sample_years = df["year"].dropna().unique()[:10]
print(sample_years)


✅ Columns standardized. Sample years:
<IntegerArray>
[1960, 1970, 1980, 1990, 2000, 2010]
Length: 6, dtype: Int64


In [16]:
# Columns carried forward: the identifying fields first, then the audio
# features used for the analysis. Any other column in the frame is dropped.
keep_cols = ["year", "track", "artist"] + [
    "danceability", "energy", "loudness", "speechiness",
    "acousticness", "instrumentalness", "valence", "tempo",
]

# Label-based selection; raises KeyError if any listed column is absent.
df = df.loc[:, keep_cols]
print("✅ Important columns selected:", df.shape[1])

# Preview the trimmed frame.
df.head()


✅ Important columns selected: 11


Unnamed: 0,year,track,artist,danceability,energy,loudness,speechiness,acousticness,instrumentalness,valence,tempo
0,1960,Jealous Kind Of Fella,Garland Green,0.417,0.62,-7.727,0.0403,0.49,0.0,0.845,185.655
1,1960,Initials B.B.,Serge Gainsbourg,0.498,0.505,-12.475,0.0337,0.018,0.107,0.797,101.801
2,1960,Melody Twist,Lord Melody,0.657,0.649,-13.392,0.038,0.846,4e-06,0.908,115.94
3,1960,Mi Bomba Sonó,Celia Cruz,0.59,0.545,-12.058,0.104,0.706,0.0246,0.967,105.592
4,1960,Uravu Solla,P. Susheela,0.515,0.765,-3.515,0.124,0.857,0.000872,0.906,114.617


In [17]:
# Drop rows missing any of the core fields, then convert year from the
# nullable integer dtype to a plain int (safe once missing years are gone).
required_fields = ["year", "danceability", "energy", "valence", "tempo", "loudness"]
df = (
    df
    .dropna(subset=required_fields)
    .assign(year=lambda frame: frame["year"].astype(int))
)

# Save to processed folder (created if it does not exist yet).
processed_path = pathlib.Path("data/processed")
processed_path.mkdir(parents=True, exist_ok=True)
output_path = processed_path / "spotify_audio_features_1960_2019_clean.csv"

# index=False keeps the row index out of the written file.
df.to_csv(output_path, index=False)

print("✅ Clean dataset saved to:", output_path)
print("Rows after cleaning:", df.shape[0])


✅ Clean dataset saved to: data/processed/spotify_audio_features_1960_2019_clean.csv
Rows after cleaning: 41106


In [18]:
# Quick sanity report on the cleaned frame.
first_year, last_year = df["year"].min(), df["year"].max()
print("Years range:", first_year, "to", last_year)

# Share of missing values per column, as a percentage rounded to 2 decimals.
print("Missing values (%):")
missing_pct = df.isna().mean().mul(100).round(2)
print(missing_pct)

# Descriptive statistics for the main numeric audio features.
summary_cols = ["danceability", "energy", "valence", "tempo", "loudness"]
df[summary_cols].describe()


Years range: 1960 to 2010
Missing values (%):
year                0.0
track               0.0
artist              0.0
danceability        0.0
energy              0.0
loudness            0.0
speechiness         0.0
acousticness        0.0
instrumentalness    0.0
valence             0.0
tempo               0.0
dtype: float64


Unnamed: 0,danceability,energy,valence,tempo,loudness
count,41106.0,41106.0,41106.0,41106.0,41106.0
mean,0.539695,0.579545,0.54244,119.338249,-10.221525
std,0.177821,0.252628,0.267329,29.098845,5.311626
min,0.0,0.000251,0.0,0.0,-49.253
25%,0.42,0.396,0.33,97.397,-12.816
50%,0.552,0.601,0.558,117.565,-9.257
75%,0.669,0.787,0.768,136.494,-6.37425
max,0.988,1.0,0.996,241.423,3.744
