Used the xmltodict library to get details of artists' aliases in a dataframe alongside their artist ids and names.
Source: https://python.plainenglish.io/saving-xml-content-to-a-pandas-dataframe-using-xmltodict-b6fab32a5100. Then removed all instances from the edge list where an artist is shown as collaborating with themselves.

In [4]:
#pip install xmltodict

Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0
Note: you may need to restart the kernel to use updated packages.


In [67]:
# to help with decompressing discogs data dump file
import gzip

import xmltodict
import pandas as pd

In [68]:
# import list of artist ids and artist names
df_cleaned = pd.read_csv('df_artists_cleaned.csv', index_col=0)

In [69]:
df_cleaned.head()

Unnamed: 0,artistId,artistName
0,194,Various
1,417,Photek
2,113467,Angela Winbush
3,61671,T-Ski Valley
4,69205,Lady D


In [65]:
#import requests

In [70]:
# parsing the Discogs artists file 
artists_file = xmltodict.parse(gzip.GzipFile('discogs_20180901_artists.xml.gz'))

In [71]:
# refining the data to look at the artists
artists_file = artists_file['artists']['artist']

In [72]:
# first value in artists dataset
artists_file[0]

{'images': {'image': {'@height': '450',
   '@type': 'secondary',
   '@uri': '',
   '@uri150': '',
   '@width': '600'}},
 'id': '1',
 'name': 'The Persuader',
 'realname': 'Jesper Dahlbäck',
 'profile': None,
 'data_quality': 'Correct',
 'namevariations': {'name': ['Persuader', 'The Presuader']},
 'aliases': {'name': [{'@id': '19541', '#text': 'Dick Track'},
   {'@id': '278760', '#text': 'Faxid'},
   {'@id': '16055', '#text': 'Groove Machine'},
   {'@id': '196957', '#text': "Janne Me' Amazonen"},
   {'@id': '239', '#text': 'Jesper Dahlbäck'},
   {'@id': '25227', '#text': 'Lenk'},
   {'@id': '439150', '#text': 'The Pinguin Man'}]}}

In [73]:
# function to get an artist's id, name and alias, and put these in a dataframe
def get_artist_data(artist):
    data= []
    artist_id = artist["id"]
    artist_name = artist["name"]
    try:
        aliases = artist['aliases']['name']
    except:
        aliases = "none available"
    data.append({"id": artist_id, "name": artist_name, "aliases": aliases})
    df = pd.DataFrame(data)
    return df

In [19]:
#get_artist_data(artists_file[0])

In [74]:
# function that takes the xml file and returns a list of the artist dataframes for each artist
def get_all_artists(file):
    df_list = []
    for x in file:        
        df_list.append(get_artist_data(x))
    return df_list

In [75]:
all_artists = get_all_artists(artists_file)

In [58]:
#all_artists[6]

In [76]:
# combine all the artist dataframes into one dataframe
all_artists = pd.concat(all_artists, ignore_index =True, axis=0)

In [77]:
all_artists

Unnamed: 0,id,name,aliases
0,1,The Persuader,"[{'@id': '19541', '#text': 'Dick Track'}, {'@i..."
1,2,Mr. James Barth & A.D.,"[{'@id': '384581', '#text': 'ADCL'}, {'@id': '..."
2,3,Josh Wink,"[{'@id': '370936', '#text': 'Accent (3)'}, {'@..."
3,4,Johannes Heil,"[{'@id': '120636', '#text': 'Age Beats'}, {'@i..."
4,5,Heiko Laux,"[{'@id': '22282', '#text': 'Apathism'}, {'@id'..."
...,...,...,...
5834070,6682187,Katri Roukala,none available
5834071,6682188,Ines Sirén,none available
5834072,6682189,Sauli Nurila,none available
5834073,6682190,Ville Tainio,none available


In [78]:
all_artists.to_csv('artists_aliases.csv')

In [79]:
# load in our file which includes artist ids, artist names, and aliases
df_artists_aliases = pd.read_csv('artists_aliases.csv')

In [80]:
#make a series of the artist ids for those artists we are interested in
series_id = df_cleaned['artistId']

In [81]:
# create a dataframe of the artists with aliases for only those artists we are interested in
df_reconciled = df_artists_aliases[df_artists_aliases['id'].isin(series_id)]

In [82]:
df_reconciled = df_reconciled.drop(columns = 'Unnamed: 0')

In [83]:
df_reconciled

Unnamed: 0,id,name,aliases
373,417,Photek,"[{'@id': '2015', '#text': 'Aquarius'}, {'@id':..."
701,776,DJ Cam,"[{'@id': '1103667', '#text': 'Bouncer Crew'}, ..."
877,968,The Herbaliser,none available
905,997,Roey Marquis II,"[{'@id': '3277539', '#text': 'Calogero Randazz..."
988,1095,Andrés,"[{'@id': '2517289', '#text': 'A Drummer From D..."
...,...,...,...
5768751,6615320,Oilworks,none available
5794809,6641987,DJ Matto (2),none available
5805659,6653120,Lo-Go,none available
5833277,6681383,james joyce (7),none available


In [84]:
# reset the index in the new dataframe
df_reconciled.reset_index(drop=True, inplace=True)

In [85]:
df_reconciled.head()

Unnamed: 0,id,name,aliases
0,417,Photek,"[{'@id': '2015', '#text': 'Aquarius'}, {'@id':..."
1,776,DJ Cam,"[{'@id': '1103667', '#text': 'Bouncer Crew'}, ..."
2,968,The Herbaliser,none available
3,997,Roey Marquis II,"[{'@id': '3277539', '#text': 'Calogero Randazz..."
4,1095,Andrés,"[{'@id': '2517289', '#text': 'A Drummer From D..."


In [35]:
#df_reconciled.to_csv('artists_aliases.csv')

In [86]:
df_edges_names = pd.read_csv('df_edge_list_names.csv')

In [87]:
df_edges_names.head()

Unnamed: 0.1,Unnamed: 0,Names_lists1,Names_lists2
0,0,Lady D,Mr Egg Germ
1,1,Lady D,T Monique
2,2,Mr Egg Germ,T Monique
3,3,T.C. Izlam,The Phunky Alien
4,4,Various,Mood


In [88]:
df_reconciled[df_reconciled['id']==198518]

Unnamed: 0,id,name,aliases
1991,198518,GZA,"[{'@id': '606584', '#text': 'Gary Grice'}, {'@..."


In [90]:
df_edge_count = pd.read_csv('edge_count.csv', index_col=0)

In [91]:
df_edge_count.head()

Unnamed: 0,V1,V2,No times collaborated
3746,198518,242071,12
4828,331629,1065010,11
664,25495,390896,11
3863,209294,79578,9
2966,144758,164246,8


In [93]:
#df_reconciled.loc[df_reconciled['aliases'].str.contains("331629", case=False)]['id']

In [94]:
# checking whether an artist is shown as sharing an edge with an id associated with one of their own aliases
def checkAliasCollaborations(edge_list, edge_count):
    for ind in edge_count.index:
        value = str(edge_count['V1'][ind])
        comparison = df_reconciled.loc[df_reconciled['aliases'].str.contains(str(edge_count['V2'][ind]), case=False)]['id'].to_string(index=False)
        if value == comparison:
        # Get indexes where name column has value from column V1 and value from column V2 where are same artist
            indexNames = edge_list[(edge_list['V1'] == int(value)) & (edge_list['V2'] == int(edge_count['V2'][ind]))].index
        # Delete these row indexes from dataFrame
            edge_list.drop(indexNames , inplace=True)
    return edge_list

In [95]:
df_updated_edges = pd.read_csv('discogs_edges.csv')
print("length of df before: ", len(df_updated_edges))
df_edge_latest = checkAliasCollaborations(df_updated_edges, df_edge_count)
print("length of df after: ", len(df_edge_latest))

length of df before:  9118
length of df after:  8981


In [96]:
# new updated edge list with alias collaborations removed
df_edge_latest.to_csv('df_edges_cleaned.csv')