<a href="https://colab.research.google.com/github/slazur83/Tableau/blob/main/MusicTracks_exporter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import datetime as dt
import glob
import json
import os
import re
import shutil
from datetime import datetime

import pandas as pd
import requests
from google.colab import drive
from google.colab import files

In [None]:
def check_duplicate_rows(df):
    """Summarise how many fully duplicated rows ``df`` contains.

    A row is counted when all of its columns repeat an earlier row
    (pandas ``DataFrame.duplicated()`` with its defaults).

    Args:
        df: DataFrame to inspect; it is not modified.

    Returns:
        str: A human-readable message with the duplicate count, or a note
        that no duplicates were found.
    """
    duplicate_count = int(df.duplicated().sum())
    if duplicate_count == 0:
        return "No duplicates found in the data."
    return f"Found {duplicate_count} duplicate rows."


def check_duplicate_rows_without_column(df, column_to_exclude):
    """Find rows that are duplicates once a column is ignored, and optionally drop them.

    Rows are compared on every column except ``column_to_exclude``. When
    duplicates exist, the count is printed and the user is asked
    interactively (via ``input``) whether to remove them; the first
    occurrence of each duplicated row is always kept.

    Args:
        df: DataFrame to inspect; it is not modified.
        column_to_exclude: Column label (or list of labels) left out of the
            comparison.

    Returns:
        tuple[str, pandas.DataFrame]: A status message and the resulting
        frame. The frame is ``df`` itself when nothing was removed, or a
        de-duplicated selection of ``df`` when the user answered "Y".
        Every branch returns this same shape so callers can always unpack
        the result.
    """
    comparison_view = df.drop(columns=column_to_exclude)
    # True for every repeat of an earlier row; the first occurrence stays False.
    duplicate_mask = comparison_view.duplicated(keep='first')
    duplicate_count = int(duplicate_mask.sum())

    if duplicate_count == 0:
        return (
            f"No duplicates found in the data (excluding the specified column {column_to_exclude}).",
            df,
        )

    print(f"Found {duplicate_count} duplicate rows (excluding the specified column {column_to_exclude}).")
    answer = input("Do you want to delete replicated rows? Y/N\n")
    # Tolerate stray whitespace and lower-case answers.
    if answer.strip().upper() == "Y":
        return (
            f"Removed {duplicate_count} duplicate rows (excluding the specified column {column_to_exclude}).",
            df[~duplicate_mask],
        )
    return "No rows have been removed", df


def format_date(date_str):
    """Normalise a date string to ``YYYY-MM-DD HH:MM``.

    Two input layouts are accepted: ``'%d %b %Y, %H:%M:%S'`` and
    ``'%d %b %Y, %H:%M'``. Seconds, when present, are dropped.

    Args:
        date_str: Date text to convert, or the placeholder ``'N/A'``.

    Returns:
        str: The reformatted date; ``'N/A'`` when the placeholder was
        passed in; ``'Invalid Date Format'`` (after printing a notice) when
        neither layout matches.
    """
    if date_str == 'N/A':
        return 'N/A'

    # The layout with seconds is attempted first, then the minute-resolution one.
    for layout in ('%d %b %Y, %H:%M:%S', '%d %b %Y, %H:%M'):
        try:
            return datetime.strptime(date_str, layout).strftime('%Y-%m-%d %H:%M')
        except ValueError:
            continue

    # Neither layout matched: report it and hand back a sentinel value.
    print(f"Date format error for: {date_str}")
    return 'Invalid Date Format'


def find_highest_version_file(directory, base_filename):
    """Return the name of the CSV file in ``directory`` that matches ``base_filename``.

    ``base_filename`` may be given with or without its ``.csv`` extension
    (``'lastfm_tracks'`` and ``'lastfm_tracks.csv'`` both look for
    ``lastfm_tracks.csv``). Only an exact file name counts as a match, so
    names such as ``lastfm_tracks.csv.bak`` are ignored.

    Args:
        directory: Directory to search (not recursive).
        base_filename: File name to look for, with or without ``.csv``.

    Returns:
        str | None: The matching file name (without the directory part), or
        ``None`` when no such file exists.
    """
    extension = '.csv'
    # Accept the name with the extension already attached; otherwise the
    # pattern would demand a doubled suffix ("name.csv.csv") and never match.
    stem = base_filename[:-len(extension)] if base_filename.endswith(extension) else base_filename
    pattern = re.compile(rf'{re.escape(stem)}\.csv')

    # fullmatch rejects names that merely start with the wanted file name;
    # sorting keeps the choice independent of the directory listing order.
    candidates = sorted(name for name in os.listdir(directory) if pattern.fullmatch(name))
    return candidates[-1] if candidates else None


def convert_to_datetime(date_str):
    """Parse a ``YYYY-MM-DD HH:MM`` string into a pandas timestamp.

    Args:
        date_str: Value to parse, expected in ``'%Y-%m-%d %H:%M'`` format.

    Returns:
        The result of ``pd.to_datetime`` with ``errors='coerce'``: a
        ``Timestamp`` for well-formed input, ``NaT`` for input that does
        not match the format.
    """
    parsed = pd.to_datetime(date_str, format='%Y-%m-%d %H:%M', errors='coerce')
    # errors='coerce' converts unparseable text to NaT instead of raising, so
    # the failure has to be detected from the result rather than an exception.
    # Missing inputs (None/NaN) are not reported: there was nothing to parse.
    if parsed is pd.NaT and not pd.isna(date_str):
        print(f"Date format error for: {date_str}")
    return parsed


def extract_artist(subtitles):
    """Pull the artist name out of a ``subtitles`` entry.

    Uses the ``'name'`` value of the first element and keeps only the text
    before the first ``' - '`` separator.

    Args:
        subtitles: Expected to be a list of dicts; anything else (or an
            empty list) is treated as "no artist".

    Returns:
        str: The artist name, or ``''`` when it cannot be determined.
    """
    if not isinstance(subtitles, list) or not subtitles:
        return ''
    channel_name = subtitles[0].get('name', '')
    artist, _separator, _rest = channel_name.partition(' - ')
    return artist


def extract_song_title(title):
    """Strip the leading ``"Obejrzano: "`` label from a title.

    "Obejrzano" is Polish for "Watched". Only the leading label is removed;
    the same text appearing later in the title is left alone.

    Args:
        title: Title text. Non-string values (e.g. NaN for a missing title)
            are returned unchanged instead of raising.

    Returns:
        The title without the leading label, or the input unchanged when
        the label is absent.
    """
    prefix = "Obejrzano: "
    if isinstance(title, str) and title.startswith(prefix):
        # Slice rather than replace(): replace() would also delete any later
        # occurrence of the label inside the actual song title.
        return title[len(prefix):]
    return title

In [None]:
# Mount Google Drive so the config file and the data folders used below are reachable.
drive.mount('/content/drive')
config_file_path = "/content/drive/MyDrive/Skrypty/config.json"

# Read explicitly as UTF-8, consistent with the other open() calls in this notebook,
# so parsing does not depend on the runtime's default encoding.
with open(config_file_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# **LastFM**

In [None]:
# API credentials (the key comes from the config loaded earlier, not from the notebook)
last_fm_api_key = config['last_fm_api_key']
user_name = 'slazur83'

# Base API URL; HTTPS so the API key in the query string is not sent in clear text
base_url = "https://ws.audioscrobbler.com/2.0/"
params = {
    'method': 'user.getrecenttracks',
    'user': user_name,
    'api_key': last_fm_api_key,
    'format': 'json',
    'limit': 200,
    'page': 1
}

# CSV file setup
base_filename = "lastfm_tracks.csv"
directory = '.'  # Current directory
drive_folder = '/content/drive/MyDrive/Dane z aplikacji/LastFM/'

# Reuse the existing file name if one is present, otherwise fall back to the fixed name
csv_file = find_highest_version_file(directory, base_filename)
if csv_file is None:
    csv_file = base_filename

# Download the listening history page by page and write it straight to the CSV file
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['artist', 'track', 'album', 'playback_date'])  # Header row

    while True:
        # The timeout keeps a stalled connection from hanging the notebook indefinitely
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        # An empty page ends the loop
        tracks = data.get('recenttracks', {}).get('track', [])
        if not tracks:
            break

        for track in tracks:
            artist = track.get('artist', {}).get('#text', 'N/A')
            track_name = track.get('name', 'N/A')
            album = track.get('album', {}).get('#text', 'N/A')
            # Entries without a 'date' field are written with the 'N/A' placeholder
            playback_date = format_date(track.get('date', {}).get('#text', 'N/A'))

            writer.writerow([artist, track_name, album, playback_date])

        params['page'] += 1

print(f"Data written to {csv_file}")

# Copy the file to the Google Drive folder (the local copy is kept)
shutil.copy(csv_file, drive_folder)
print(f"The file {csv_file} has been moved to {drive_folder}.")

# Read the CSV file back from Drive into a dataframe
df_lastfm = pd.read_csv(os.path.join(drive_folder, csv_file), header=0)  # header=0: first row holds the column names
df_lastfm.columns = ['Artist', 'Track', 'Album', 'Date']  # Rename columns

# Parse the whole Date column in one vectorised call; values that do not match the
# format (placeholders, 'Invalid Date Format') become NaT
df_lastfm['Date'] = pd.to_datetime(df_lastfm['Date'], format='%Y-%m-%d %H:%M', errors='coerce')

# Convert the Date column back to the string format shared by the other sources
df_lastfm['Date'] = df_lastfm['Date'].dt.strftime('%Y-%m-%d %H:%M')
df_lastfm['Source'] = 'LastFM'
df_lastfm['Account'] = user_name


Data written to lastfm_tracks.csv
The file lastfm_tracks.csv has been moved to /content/drive/MyDrive/Dane z aplikacji/LastFM/.


# **YouTube Music**

In [None]:
# Location of the watch-history JSON file
source_file = '/content/drive/MyDrive/Dane z aplikacji/Google/riwanna85/YouTube i YouTube Music/historia/historia oglądania.json'

# Read the full history, then keep only the entries whose header is 'YouTube Music'
watch_history = pd.read_json(source_file, encoding='utf-8')
is_ytmusic = watch_history['header'] == 'YouTube Music'

# Derive the shared columns in one chained expression and keep them in the common order
df_ytmusic = (
    watch_history.loc[is_ytmusic]
    .assign(
        Artist=lambda frame: frame['subtitles'].apply(extract_artist),
        Track=lambda frame: frame['title'].apply(extract_song_title),
        Date=lambda frame: pd.to_datetime(frame['time'], format='ISO8601').dt.strftime('%Y-%m-%d %H:%M'),
        Source='YouTube Music',
        Account='riwanna85',
        Duration='N/A',
    )
    .loc[:, ['Artist', 'Track', 'Date', 'Duration', 'Source', 'Account']]
)

# **Spotify**

In [None]:
def load_spotify_data(file_pattern, account_email):
    """Load every Spotify JSON export matching a glob pattern into one DataFrame.

    Prints the number of rows loaded and the duplicate-row summary produced
    by ``check_duplicate_rows``.

    Args:
        file_pattern: Glob pattern selecting the JSON files to read.
        account_email: Account label stored in the added ``'Account'`` column.

    Returns:
        pandas.DataFrame: Rows of all matching files concatenated with a
        fresh index, plus the ``'Account'`` column.

    Raises:
        FileNotFoundError: If no file matches ``file_pattern``.
    """
    # Sorted so the row order does not depend on the filesystem's listing order.
    file_list = sorted(glob.glob(file_pattern))
    if not file_list:
        # Fail with the offending pattern instead of pd.concat's generic
        # "No objects to concatenate" error.
        raise FileNotFoundError(f'No files match the pattern: {file_pattern}')

    df_list = []
    for filename in file_list:
        with open(filename, encoding='utf-8') as inputfile:
            df_list.append(pd.read_json(inputfile))

    df_combined = pd.concat(df_list, ignore_index=True)
    df_combined['Account'] = account_email

    print(f'There are {len(df_combined)} rows in a DataFrame for {account_email}.')
    # check_duplicate_rows only returns its summary, so print it to make it visible.
    print(check_duplicate_rows(df=df_combined))
    return df_combined

# Glob pattern of the streaming-history export for each account label
spotify_exports = {
    'slazur83': '/content/drive/MyDrive/Dane z aplikacji/Spotify/slazur83@gmail.com/MyData/StreamingHistory*.json',
    'zethar182': '/content/drive/MyDrive/Dane z aplikacji/Spotify/zethar182@gmail.com/MyData/StreamingHistory*.json',
}
df1, df2 = [load_spotify_data(pattern, account) for account, pattern in spotify_exports.items()]

# Stack both accounts and tag the source
df_spotify = pd.concat([df1, df2], ignore_index=True).assign(Source='Spotify')

print(f'There are {len(df_spotify)} rows in the combined DataFrame.')

# Rows count as duplicates when every column except 'Account' matches
dedup_columns = [column for column in df_spotify.columns if column != 'Account']
shared_column_names = {
    'endTime': 'Date',
    'artistName': 'Artist',
    'trackName': 'Track',
    'msPlayed': 'Duration',
}

# De-duplicate, switch to the shared column names, and put the columns in the common order
df_spotify = (
    df_spotify
    .drop_duplicates(subset=dedup_columns)
    .rename(columns=shared_column_names)
    .loc[:, ['Artist', 'Track', 'Date', 'Duration', 'Source', 'Account']]
)

print(f'There are {len(df_spotify)} rows in the final DataFrame after removing duplicates.')

There are 20571 rows in a DataFrame for slazur83.
There are 9692 rows in a DataFrame for zethar182.
There are 30263 rows in the combined DataFrame.
There are 20646 rows in the final DataFrame after removing duplicates.


# **Deezer**

In [None]:
import pandas as pd

# Location of the Deezer export workbook
deezer_file = '/content/drive/MyDrive/Dane z aplikacji/Deezer/4519420622.xlsx'

# Workbook column names mapped to the names shared with the other sources
deezer_column_names = {
    'Song Title': 'Track',
    'Album Title': 'Album',
    'Listening Time': 'Duration',
    'Platform Name': 'Platform',
}

# Read the listening-history sheet, rename its columns, drop 'ISRC',
# and tag every row with its source and account
df_deezer = (
    pd.read_excel(deezer_file, sheet_name="10_listeningHistory")
    .rename(columns=deezer_column_names)
    .drop(columns=['ISRC'])
    .assign(Source='Deezer', Account='slazur83')
)

# **Final DataFrame**

In [None]:
# Stack the four source frames and check that no rows were lost along the way
source_frames = [df_spotify, df_ytmusic, df_lastfm, df_deezer]
merged = pd.concat(source_frames, ignore_index=True)
total_length = sum(len(frame) for frame in source_frames)

assert len(merged) == total_length, "Not all data is merged"
print('All data is merged')

# Missing dates: a handful of rows is dropped, a larger number stops the run
nulls = merged['Date'].isna().sum()

if nulls == 0:
    print('No null values found in the Date column.')
elif nulls < 15:
    merged = merged.dropna(subset=['Date'])
    print(f'Dropped {nulls} rows with missing Date values.')
else:
    raise ValueError(f'There are {nulls} null values in the Date column that need attention.')

# Final column order; 'Date' is converted to datetime and the rows sorted chronologically
columns_order = ['Date', 'Artist', 'Track', 'Album', 'Duration', 'Source', 'Account', 'Platform', 'Platform Model', 'IP Address']
merged = (
    merged[columns_order]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by='Date')
)

# Report the date range covered by each source frame
date_range_report = [
    ('Spotify', df_spotify),
    ('Deezer', df_deezer),
    ('LastFM', df_lastfm),
    ('YouTube Music', df_ytmusic),
]
for label, frame in date_range_report:
    print(f"{label}: {frame['Date'].min()} to {frame['Date'].max()}")

All data is merged
No null values found in the Date column.
Spotify: 2022-10-26 18:09 to 2023-10-09 19:53
Deezer: 2021-08-06 17:11:48 to 2023-03-05 19:16:53
LastFM: 2021-12-23 07:36 to 2024-08-16 14:51
YouTube Music: 2024-03-10 18:13 to 2024-08-05 16:20


In [None]:
# Destination of the combined data set on Google Drive
output_path = '/content/drive/MyDrive/Skrypty/Tableau/Outputs/music_tracks.csv'

# Write the merged frame without the index column, then confirm where it went
merged.to_csv(path_or_buf=output_path, index=False)
print(f'Data successfully exported to {output_path}')

Data successfully exported to /content/drive/MyDrive/Skrypty/Tableau/Outputs/music_tracks.csv
