<a href="https://colab.research.google.com/github/pravallikai/Evolution-of-musical-trends-using-py/blob/main/notebooks/10_Genre_Wise_Deep_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This is the header for the notebook file, indicating its purpose.
# 10_Genre_Wise_Deep_Analysis.ipynb
# ---------------------------------------------------

# This section imports necessary libraries for data manipulation, visualization, and statistical analysis.
# Imports
import pandas as pd # Import the pandas library for data manipulation and analysis, often used for DataFrames.
import numpy as np # Import the numpy library for numerical operations, especially for array and matrix computations.
import matplotlib.pyplot as plt # Import the matplotlib.pyplot module for creating static, interactive, and animated visualizations in Python.
import seaborn as sns # Import the seaborn library for statistical data visualization, which is built on matplotlib.
import os # Import the os module for interacting with the operating system, such as managing directories and file paths.
from scipy import stats # Import the stats submodule from the scipy library, which contains many statistical functions, including ANOVA.

# Global plot appearance: seaborn's "whitegrid" theme for every figure below.
sns.set(style="whitegrid")

# Output locations, relative to the current working directory.
FIGDIR = "reports/figures/genre_analysis"  # PNG figures
TABDIR = "reports/tables"  # CSV summary tables

# Create both output folders up front; exist_ok=True keeps re-runs from failing
# when the folders are already there.
for _out_dir in (FIGDIR, TABDIR):
    os.makedirs(_out_dir, exist_ok=True)

# Locate and load the input dataset.
# Two candidate CSV files are probed in priority order: the clustered file
# first, then the plain cleaned file as a fallback.
path_clustered = "/content/spotify_audio_features_1960_2019_clustered.csv"
path_clean = "/content/spotify_audio_features_1960_2019_clean.csv"

# First candidate that exists on disk wins; None when neither is present.
# The generator is lazy, so the fallback is only checked if the first is missing.
CLEANED_PATH = next(
    (candidate for candidate in (path_clustered, path_clean) if os.path.exists(candidate)),
    None,
)

# Guard clause: without an input file nothing below can run, so report and stop.
if not CLEANED_PATH:
    error_message = (
        f"Error: Neither '{path_clustered}' nor '{path_clean}' were found. "
        "Please upload at least one of these CSV files to the '/content/' directory "
        "in your Colab environment to proceed with the analysis."
    )
    print(error_message)
    raise FileNotFoundError(error_message)

df = pd.read_csv(CLEANED_PATH)
print("Loaded:", df.shape)  # (rows, columns) of the loaded table

# Derive integer `year` and `decade` columns.
# The year is taken from the first available source, in priority order:
# `year`, then `release_year`, then the year component of `release_date`.
# Every source is converted with errors="coerce" so that unparseable or missing
# entries become NaN and are dropped below, instead of crashing astype(int).
if "year" in df.columns:
    _year_values = pd.to_numeric(df["year"], errors="coerce")
elif "release_year" in df.columns:
    _year_values = pd.to_numeric(df["release_year"], errors="coerce")
elif "release_date" in df.columns:
    _year_values = pd.to_datetime(df["release_date"], errors="coerce").dt.year
else:
    # Fail with an explicit message rather than an opaque KeyError on 'release_date'.
    raise KeyError(
        "Cannot derive 'year': none of the columns 'year', 'release_year' "
        "or 'release_date' is present in the dataset."
    )

df["year"] = _year_values
# Rows without a usable year cannot be assigned to a decade. The explicit copy
# makes the column assignments below operate on an independent frame.
df = df.dropna(subset=["year"]).copy()
df["year"] = df["year"].astype(int)
# Floor each year to the start of its decade, e.g. 1965 -> 1960.
df["decade"] = (df["year"] // 10) * 10

# Guarantee that a `genre` column exists.
# When it is absent, reuse `genres` if that column is available; otherwise
# label every row with the placeholder "Unknown".
if "genre" not in df.columns:
    df["genre"] = df["genres"] if "genres" in df.columns else "Unknown"

# Reduce a raw genre value (list, list-like string, or delimited string) to a
# single primary genre label.
def extract_primary_genre(x):
    """Return the first (primary) genre named in a raw genre value.

    Supported inputs:
      * real lists/tuples, e.g. ``['pop', 'dance']`` -> ``'pop'``
      * list-like strings, e.g. ``"['pop', 'dance']"`` -> ``'pop'``
      * ``;``- or ``,``-separated strings, e.g. ``"pop;dance"`` -> ``'pop'``
      * plain strings, returned with surrounding whitespace removed
      * any other scalar, returned as ``str(x)``

    Returns ``"Unknown"`` for missing values (None/NaN), empty sequences,
    empty or whitespace-only strings, and list-like strings with no entries
    (e.g. ``"[]"``), so that such values never show up as a genre label.
    """
    # Unwrap sequences BEFORE calling pd.isna(): on a list, pd.isna returns an
    # array, and using that array in an `if` raises for multi-element lists.
    while isinstance(x, (list, tuple)):
        if not x:
            return "Unknown"
        x = x[0]

    if not isinstance(x, str):
        # Non-string scalar: missing -> "Unknown", anything else -> its string form.
        return "Unknown" if pd.isna(x) else str(x)

    # Strip first so the bracket detection also works with surrounding whitespace.
    text = x.strip()

    if text.startswith("[") and text.endswith("]"):
        # Text that looks like a Python list repr: drop the brackets, split on
        # commas, and remove whitespace and quote characters around each entry.
        inner = text.strip("[]")
        entries = [part.strip().strip("'\"").strip() for part in inner.split(",")]
        return next((entry for entry in entries if entry), "Unknown")

    # Delimited strings: ';' takes precedence over ','.
    for separator in (";", ","):
        if separator in text:
            tokens = (token.strip() for token in text.split(separator))
            return next((token for token in tokens if token), "Unknown")

    return text or "Unknown"

# Collapse every raw genre value into one primary genre label.
df["genre_clean"] = df["genre"].apply(extract_primary_genre)

# Restrict the analysis to the topN most frequent primary genres.
topN = 8
_genre_counts = df["genre_clean"].value_counts()
top_genres = _genre_counts.nlargest(topN).index.tolist()
print("Top genres:", top_genres)

# When the only label is the placeholder, there is nothing to compare across
# genres; tell the user what to expect from the statistics further down.
if top_genres == ['Unknown']:
    print("Warning: Only 'Unknown' genre found in top categories. Genre-wise analysis will be limited.")
    print("ANOVA results will likely be NaN because ANOVA requires at least two distinct groups for comparison.")

# Independent copy of the rows that belong to one of the selected genres.
_is_top_genre = df["genre_clean"].isin(top_genres)
df_top = df.loc[_is_top_genre].copy()

# Audio features to analyse, limited to the columns actually present in df_top.
features = [
    column
    for column in (
        'energy', 'danceability', 'tempo', 'valence',
        'acousticness', 'instrumentalness', 'loudness',
    )
    if column in df_top.columns
]

# Mean of every feature for each (decade, genre) pair, flattened back into
# ordinary columns so it can be pivoted and plotted later.
_decade_genre_means = df_top.groupby(['decade', 'genre_clean'])[features].mean()
genre_decade = _decade_genre_means.reset_index()

# Persist the summary table (row index omitted from the CSV).
_summary_csv = os.path.join(TABDIR, "genre_decade_summary.csv")
genre_decade.to_csv(_summary_csv, index=False)
print("Saved genre_decade_summary.csv")

# Heatmaps: one PNG per feature with decades as rows and genres as columns.
for feat in features:
    # Reshape the long summary table into a decade x genre grid for this feature.
    pivot = genre_decade.pivot(index='decade', columns='genre_clean', values=feat)

    plt.figure(figsize=(10, 6))
    # The diverging "vlag" palette is centred on the mean of the per-genre
    # column means, so cells are coloured relative to that reference value.
    sns.heatmap(pivot, annot=True, fmt=".2f", cmap="vlag", center=pivot.mean().mean())

    # The original title contained the mojibake sequence "â€”" where an em dash
    # was intended; the dash is written as an escape so it survives re-encoding.
    plt.title(f"{feat} \u2014 Decade x Genre Mean")
    plt.xlabel("Genre")
    plt.ylabel("Decade")

    fpath = os.path.join(FIGDIR, f"genre_heatmap_{feat}.png")
    plt.tight_layout()
    plt.savefig(fpath, dpi=220)
    # Close the figure so figures do not accumulate in memory across iterations.
    plt.close()

# Line plots: one PNG per feature showing its decade-level mean for each genre.
for feat in features:
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=genre_decade, x='decade', y=feat, hue='genre_clean', marker='o')

    plt.title(f"{feat} over decades (by genre)")
    plt.xlabel("Decade")
    plt.ylabel(feat)

    # Place the legend outside the axes, to the right. `loc` names the legend
    # corner that is pinned to the anchor point; without it the corner is chosen
    # automatically and the legend can end up on top of the plotted lines.
    plt.legend(title="Genre", bbox_to_anchor=(1.05, 1), loc="upper left")

    fpath = os.path.join(FIGDIR, f"genre_line_{feat}.png")
    plt.tight_layout()
    # bbox_inches="tight" makes sure the outside legend is not cropped in the PNG.
    plt.savefig(fpath, dpi=220, bbox_inches="tight")
    # Close the figure so figures do not accumulate in memory across iterations.
    plt.close()

# Violin plots: per-genre distribution of a few selected features.
# Features that are not columns of df_top are skipped.
_violin_features = [
    name for name in ('energy', 'danceability', 'valence') if name in df_top.columns
]

for feat in _violin_features:
    fig, ax = plt.subplots(figsize=(12, 6))
    # density_norm='width' gives every violin the same maximum width;
    # inner='quartile' draws the quartile lines inside each violin.
    sns.violinplot(
        data=df_top,
        x='genre_clean',
        y=feat,
        density_norm='width',
        inner='quartile',
        ax=ax,
    )

    ax.set_title(f"{feat} distribution by genre")
    ax.set_xlabel("Genre")
    ax.set_ylabel(feat)
    # Tilt the genre names so long labels do not run into each other.
    plt.xticks(rotation=45)

    fpath = os.path.join(FIGDIR, f"genre_violin_{feat}.png")
    fig.tight_layout()
    fig.savefig(fpath, dpi=220)
    # Release the figure once it has been written to disk.
    plt.close(fig)

# One-way ANOVA per feature: do the feature's values differ across genres?
# Each genre in df_top is one group; rows of all decades are pooled together.
anova_results = []
for feat in features:
    # Non-missing values of this feature for each genre. Genres with no usable
    # values are left out so that one empty group does not turn the whole test
    # into NaN.
    groups = [
        values
        for values in (
            genre_rows[feat].dropna().to_numpy()
            for _, genre_rows in df_top.groupby('genre_clean')
        )
        if len(values) > 0
    ]

    if len(groups) < 2:
        # ANOVA compares groups with each other, so with fewer than two groups
        # there is nothing to test; record NaN explicitly.
        F, p = np.nan, np.nan
    else:
        try:
            F, p = stats.f_oneway(*groups)
        except (TypeError, ValueError) as exc:
            # Report the failure instead of swallowing it silently, then record
            # NaN so the remaining features are still processed.
            print(f"ANOVA failed for '{feat}': {exc}")
            F, p = np.nan, np.nan

    anova_results.append({'feature': feat, 'F_stat': F, 'p_value': p})

# Fixed column order; also keeps the CSV header intact when `features` is empty.
anova_df = pd.DataFrame(anova_results, columns=['feature', 'F_stat', 'p_value'])
anova_df.to_csv(os.path.join(TABDIR, "genre_anova_results.csv"), index=False)
print("Saved ANOVA results.")

# Export the rows restricted to the top genres (df_top) for use in the report.
outpath = "/content/spotify_genre_summary.csv"
df_top.to_csv(path_or_buf=outpath, index=False)  # row index omitted from the file
print("Saved:", outpath)

Loaded: (41106, 11)
Top genres: ['Unknown']
ANOVA results will likely be NaN because ANOVA requires at least two distinct groups for comparison.
Saved genre_decade_summary.csv
Saved ANOVA results.
Saved: /content/spotify_genre_summary.csv
