In [8]:
import plotly.express as px
import pandas as pd
import os

# Directory in which the duration files are saved
directory = 'Data-Science-Project-WS2425/data/durations'

# List all CSV-files from the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

# List to save all dataframes
df_list = []

# Read all Files and extract the years
for file in csv_files:
    df = pd.read_csv(os.path.join(directory, file))
    
    # Prüfen, ob 'Duration_min' in der CSV existiert
    if 'Duration_min' not in df.columns:
        print(f"Skipping {file}: 'Duration_min' column not found")
        continue  # Überspringt die Datei, falls Spalte fehlt

    # Extract year from file name 
    year = int(file.split('_')[1])  
    
    # Add Year as a new column
    df['Year'] = year
    
    # Add data frame to list
    df_list.append(df)

# Join all data frames
if df_list:
    df_all_years = pd.concat(df_list, ignore_index=True)
else:
    raise ValueError("No valid CSV files found with 'Duration_min' column")

# Group by year and calculate mean
df_grouped = df_all_years.groupby("Year", as_index=False)["Duration_min"].mean()

# To convert into mm:ss
def format_duration(minutes):
    total_seconds = int(minutes * 60)  
    mm = total_seconds // 60  
    ss = total_seconds % 60   
    return f"{mm}:{ss:02d}"  



y_max = int(df_grouped["Duration_min"].max()) + 2  
y_ticks = list(range(0, y_max + 1))  
y_labels = [format_duration(y) for y in y_ticks]  


fig = px.bar(df_grouped, x="Year", y="Duration_min", title="Average song durations over 20 years")

# Adjust x- and y-axsis
fig.update_layout(
    xaxis=dict(
        tickmode="array",
        tickvals=df_grouped["Year"],
        tickformat=".0f",
        rangeslider=dict(visible=True),
        type="linear",
        fixedrange=True  
    ),
    yaxis=dict(
        title="Duration",
        tickvals=y_ticks,
        ticktext=y_labels,
        fixedrange=True  
    )
)

# Save as a html
fig.write_html("duration_barchart.html")


FileNotFoundError: [Errno 2] No such file or directory: '/Data-Science-Project-WS2425/data/durations'

In [12]:
import plotly.express as px
import pandas as pd
import os
import numpy as np
from dash import Dash, html, dcc, Input, Output

# Directory in which the duration files are saved
directory = 'Data-Science-Project-WS2425/data/durations'
# List all CSV-files from the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]
# List to save all dataframes
df_list = []

# Read all Files and extract the years
for file in csv_files:
    df = pd.read_csv(os.path.join(directory, file))

    # Extract the year from the file name    
    year = int(file.split('_')[1])  
    df['Year'] = year
    df_list.append(df)


# Concatenate all dataframes into one,    
df_all_years = pd.concat(df_list, ignore_index=True)

# Function to remove outliers using IQR method
def remove_outliers(df, column):
    df_copy = df.copy()
    Q1 = df_copy[column].quantile(0.25)
    Q3 = df_copy[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Filter out the rows that fall outside the valid range
    return df_copy[(df_copy[column] >= lower_bound) & (df_copy[column] <= upper_bound)]


# Helper function to convert minutes to mm:ss format
def format_duration(minutes):
    total_seconds = int(minutes * 60)  
    mm = total_seconds // 60  
    ss = total_seconds % 60   
    return f"{mm}:{ss:02d}"

# Initialize Dash app
app = Dash(__name__, external_stylesheets=['https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css'])


# Define the layout
app.layout = html.Div([
    
    html.Div([
        html.Label('Show Outliers', style={'marginRight': '10px'}),
        dcc.Checklist(
            id='outlier-toggle',
            options=[{'label': '', 'value': 'show'}], # Option to show or hide outliers
            value=['show'], # Default value is to show outliers
            inline=True
        )
    ], style={'display': 'flex', 'alignItems': 'center', 'margin': '20px 0'}),
    
    dcc.Graph(id='duration-chart')
])


# Define callback to update the chart based on the user's selection
@app.callback(
    Output('duration-chart', 'figure'),
    Input('outlier-toggle', 'value')
)
def update_chart(show_outliers):
    df_original = df_all_years  # Original dataframe 
    
    # If outliers should be shown, use the original data
    if 'show' in show_outliers:
        df_to_use = df_original
        title = "Average Song Durations Over 20 Years (With Outliers)"
    else:
        df_to_use = remove_outliers(df_original, "Duration_min")
        title = "Average Song Durations Over 20 Years (Without Outliers)"
    

    # Group data by year and calculate the average duration for each year
    df_grouped = df_to_use.groupby("Year", as_index=False)["Duration_min"].mean()
    

    # Define the maximum value for the y-axis
    y_max = 5  
    y_ticks = list(range(0, y_max))  
    y_labels = [format_duration(y) for y in y_ticks]  
    

    # Create a bar chart
    fig = px.bar(df_grouped, x="Year", y="Duration_min", title=title)
    
    fig.update_layout(
        xaxis=dict(
            tickmode="array",
            tickvals=df_grouped["Year"],
            tickformat=".0f",
            rangeslider=dict(visible=True), # Enable range slider for x-axis
            type="linear",
            fixedrange=True  # Disable zooming
        ),
        yaxis=dict(
            title="Duration",
            tickvals=y_ticks,
            ticktext=y_labels,
            fixedrange=True,  
            range=[0, y_max]  
        )
    )
    
    return fig



if __name__ == '__main__':
    app.run(debug=True,port = 8051)

