In [4]:
import pandas as pd
import os

# Directory where the CSV files are stored
directory = 'Data-Science-Project-WS2425/data/tempo§#'

# List all CSV files from the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

# List to save all dataframes
df_list = []

# Read all files and extract the years
for file in csv_files:
    df = pd.read_csv(os.path.join(directory, file))

    # Extract year from file name
    year = int(file.split('_')[1])

    # Add Year as a new column
    df['Year'] = year
    
    # Add dataframe to list
    df_list.append(df)

# Join all dataframes
df_all_years_tempo = pd.concat(df_list, ignore_index=True)

# Fix "Tempo" Column: Remove spaces, "BPM", and handle errors
df_all_years_tempo["Tempo"] = (
    df_all_years_tempo["Tempo"]
    .astype(str)  # Ensure all values are strings
    .str.strip()  # Remove leading/trailing spaces
    .str.replace(r"\s*BPM", "", regex=True)  # Remove "BPM"
    .apply(pd.to_numeric, errors="coerce")  # Convert to float, set errors to NaN
)

# Drop NaN values (optional, depending on your data)
df_all_years_tempo = df_all_years_tempo.dropna(subset=["Tempo"])

# Convert to int
df_all_years_tempo["Tempo"] = df_all_years_tempo["Tempo"].astype(int)

# Group by year and calculate the mean
df_grouped = df_all_years_tempo.groupby("Year", as_index=False)["Tempo"].mean()


# Remove "" from titles
df_all_years_tempo["Title"] = df_all_years_tempo["Title"].str.replace(r'^"|"$', '', regex=True)


print(df_all_years_tempo.head())

# Print all years in ascending order
print("\nAll Years:")
print(sorted(df_all_years_tempo["Year"].unique()))


   No.        Title                                     Artist  Tempo  Year  \
0  1.0   God's Plan                                      Drake     77  2018   
1  2.0      Perfect                                 Ed Sheeran     97  2018   
2  3.0  Meant to Be  Bebe Rexha featuring Florida Georgia Line    154  2018   
3  4.0       Havana        Camila Cabello featuring Young Thug    105  2018   
4  5.0     Rockstar            Post Malone featuring 21 Savage    160  2018   

    №  
0 NaN  
1 NaN  
2 NaN  
3 NaN  
4 NaN  

All Years:
[np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2010), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2023), np.int64(2024)]


In [32]:
import plotly.express as px

df_grouped = df_grouped.sort_values(by="Year")

# Line Chart
fig_line = px.line(df_grouped, x="Year", y="Tempo", 
                   title="Average Tempo (BPM) Of Songs In The Last 20 Years",
                   markers=True, line_shape="spline")


fig_line.update_xaxes(type='category')

fig_line.show()


In [33]:

df_all_years_tempo = df_all_years_tempo.sort_values(by="Year")

fig_violin = px.violin(df_all_years_tempo, x="Year", y="Tempo", 
                        box=True, points="all", 
                        title="Average Tempo (BPM) Of Songs In The Last 20 Years")

# X-Achse als diskrete Werte setzen
fig_violin.update_xaxes(type='category')

fig_violin.show()



In [34]:
import plotly.express as px

# Scatterplot for all the songs
fig = px.scatter(df_all_years_tempo, x="Year", y="Tempo",
                 title="Average Tempo (BPM) Of Songs In The Last 20 Years",
                 labels={"Tempo", "Year"},
                 opacity=0.3)  # Slightly transparent

# Line for the average tempo values
fig.add_scatter(x=df_grouped["Year"], y=df_grouped["Tempo"], 
                mode="lines", name="Average", line=dict(width=2))


fig.update_xaxes(type='category')

fig.show()

