In [8]:
import pandas as pd

In [9]:
# Build a graph-ready dataset from the Wikipedia music table: this cell loads the
# table and pulls out the columns the later cells turn into edges.

# Load the CSV and immediately discard rows whose 'MENTIONED_ARTISTS' is null,
# since those rows have nothing to parse.
df = pd.read_csv('wikipedia_music.csv').dropna(subset=['MENTIONED_ARTISTS'])

# Number of rows that survived the null filter
total_rows = len(df)

# Raw column values, kept in row order so they can be zipped together
artist_names = df['ARTIST_NAME'].values
mentioned_artists = df['MENTIONED_ARTISTS'].values

# Accumulator for the parsed result tuples
data = list()

In [12]:
# Parse each row's "Artist:count;Artist:count;..." string into
# (source, target, number_of_mentions) tuples collected in `data`.
#
# `data` is reset here so that re-running this cell cannot append the same
# tuples a second time; any later aggregation that sums the counts would
# otherwise silently report inflated numbers.
data = []

for artist_name, mentioned_artists_str in zip(artist_names, mentioned_artists):
    # str() keeps a non-string cell from aborting the loop: it simply fails the
    # "name:count" parse below and is reported like any other malformed entry.
    for mentioned_artist_and_count in str(mentioned_artists_str).split(';'):
        # A trailing or doubled ';' yields an empty fragment; that is not an error.
        if not mentioned_artist_and_count.strip():
            continue
        try:
            # Split on the LAST ':' only, because artist names may contain ':'.
            mentioned_artist, count = mentioned_artist_and_count.rsplit(':', 1)
            data.append((artist_name, mentioned_artist, int(count)))
        except ValueError as e:
            # Raised when the fragment has no ':' or the count is not an integer.
            # Handled per entry so one bad fragment does not discard the
            # remaining valid entries of the same row.
            print(f"Error processing: {mentioned_artists_str}")
            print(e)

In [11]:
# Turn the collected (source, target, count) tuples into an edge table, then
# collapse repeated (source, target) pairs into one row by summing their counts.
out_df = (
    pd.DataFrame(data, columns=['source', 'target', 'number_of_mentions'])
    .groupby(['source', 'target'], as_index=False)
    .sum()
)

# Show a preview of the edge table and its size
print(out_df.head())
print(f'Number of rows: {len(out_df)}')

  source           target  number_of_mentions
0    !!!      Cake (band)                   1
1    !!!  LCD Soundsystem                   1
2    !!!        Nate Dogg                   1
3    !!!        Nic Offer                   6
4    !!!          Out Hud                   3
Number of rows: 280923


In [13]:
# Persist the aggregated edge table as CSV; index=False leaves the row index out of the file.
out_df.to_csv(path_or_buf='wikipedia_music_graph.csv', index=False)