In [1]:
import pandas as pd
import os

In [6]:
# Location of the dataset. The directory can be overridden with the
# ARTISTS_DATA_DIR environment variable so the notebook is not tied to one
# machine; when the variable is unset, the original local folder is used.
data_dir = os.environ.get('ARTISTS_DATA_DIR', r'C:\Users\annes\OneDrive\Bureau\data')
# CSV file with one row of metadata per artist.
path_to_artists = os.path.join(data_dir, 'artists.csv')
# Folder whose entries are listed to build the filenames dataframe.
path_to_images = os.path.join(data_dir, 'resized')

In [8]:
# Build the two dataframes: the artists metadata (first CSV column used as the
# index) and the list of entries found in the images folder.
df_artists = pd.read_csv(path_to_artists, index_col=0)
# Entries whose name contains 'ipynb' are skipped
# (presumably the .ipynb_checkpoints folder -- TODO confirm).
df_filenames = pd.DataFrame({
    'filename': list(filter(lambda entry: 'ipynb' not in entry, os.listdir(path_to_images)))
})

In [9]:
# Rebuild the artist name from each filename: keep everything before the last
# underscore (e.g. 'Albrecht_Dürer_1.jpg' -> 'Albrecht_Dürer'). A name without
# any underscore yields an empty string.
df_filenames['artist'] = df_filenames['filename'].apply(
    lambda filename: filename.rpartition('_')[0]
)

In [10]:
# Build a dictionary mapping each distinct artist name to its 0-based position
# in alphabetical (sorted) order.
artists_sorted = sorted(df_filenames['artist'].unique())
ranks = dict(zip(artists_sorted, range(len(artists_sorted))))

In [11]:
# Add a 'rank' column holding, for every file, the alphabetical position of
# its artist as looked up in the `ranks` dictionary.
df_filenames['rank'] = df_filenames['artist'].map(ranks.get)
df_filenames.head()

Unnamed: 0,filename,artist,rank
0,Albrecht_Dürer_1.jpg,Albrecht_Dürer,0
1,Albrecht_Dürer_10.jpg,Albrecht_Dürer,0
2,Albrecht_Dürer_100.jpg,Albrecht_Dürer,0
3,Albrecht_Dürer_101.jpg,Albrecht_Dürer,0
4,Albrecht_Dürer_102.jpg,Albrecht_Dürer,0


In [12]:
# Create the same alphabetical 'rank' column on the artists dataset.
# method='dense' yields consecutive whole-number ranks (tied names share one
# rank), which matches the enumerate(sorted(unique names)) ranking built for
# the filenames. The default method='average' gives fractional ranks on ties,
# which astype(int) would silently truncate.
# Subtracting 1 makes the rank 0-based, like the filenames side.
df_artists['rank'] = df_artists['name'].rank(method='dense').astype(int) - 1
df_artists.sort_values('rank').head()

Unnamed: 0_level_0,name,years,genre,nationality,bio,wikipedia,paintings,rank
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
19,Albrecht Dürer,1471 - 1528,Northern Renaissance,German,Albrecht Dürer (; German: [ˈʔalbʁɛçt ˈdyːʁɐ]; ...,http://en.wikipedia.org/wiki/Albrecht_Dürer,328,0
20,Alfred Sisley,1839 - 1899,Impressionism,"French,British",Alfred Sisley (; French: [sislɛ]; 30 October 1...,http://en.wikipedia.org/wiki/Alfred_Sisley,259,1
0,Amedeo Modigliani,1884 - 1920,Expressionism,Italian,Amedeo Clemente Modigliani (Italian pronunciat...,http://en.wikipedia.org/wiki/Amedeo_Modigliani,193,2
7,Andrei Rublev,1360 - 1430,Byzantine Art,Russian,"Andrei Rublev (Russian: Андре́й Рублёв, IPA: [...",http://en.wikipedia.org/wiki/Andrei_Rublev,99,3
45,Andy Warhol,1928 – 1987,Pop Art,American,"Andy Warhol (; born Andrew Warhola; August 6, ...",https://en.wikipedia.org/wiki/Andy_Warhol,181,4


In [13]:
# Drop the columns that are not used afterwards.
df_artists = df_artists.drop(
    columns=['years', 'nationality', 'bio', 'wikipedia', 'paintings']
)
# Create the genre columns: split the comma-separated 'genre' string into
# one column per genre (the split must produce exactly three columns).
df_artists[['genre_1', 'genre_2', 'genre_3']] = (
    df_artists['genre'].str.split(pat=",", expand=True)
)
df_artists.head()

Unnamed: 0_level_0,name,genre,rank,genre_1,genre_2,genre_3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Amedeo Modigliani,Expressionism,2,Expressionism,,
1,Vasiliy Kandinskiy,"Expressionism,Abstractionism",47,Expressionism,Abstractionism,
2,Diego Rivera,"Social Realism,Muralism",8,Social Realism,Muralism,
3,Claude Monet,Impressionism,7,Impressionism,,
4,Rene Magritte,"Surrealism,Impressionism",43,Surrealism,Impressionism,


In [14]:
# Merge the two datasets on the alphabetical 'rank' key (inner join).
# validate='many_to_one' makes pandas raise a MergeError if a rank appears more
# than once on the artists side, instead of silently duplicating image rows.
df_final = pd.merge(
    left=df_filenames,
    right=df_artists,
    on='rank',
    how='inner',
    validate='many_to_one',
)
df_final.head(20)

Unnamed: 0,filename,artist,rank,name,genre,genre_1,genre_2,genre_3
0,Albrecht_Dürer_1.jpg,Albrecht_Dürer,0,Albrecht Dürer,Northern Renaissance,Northern Renaissance,,
1,Albrecht_Dürer_10.jpg,Albrecht_Dürer,0,Albrecht Dürer,Northern Renaissance,Northern Renaissance,,
2,Albrecht_Dürer_100.jpg,Albrecht_Dürer,0,Albrecht Dürer,Northern Renaissance,Northern Renaissance,,
3,Albrecht_Dürer_101.jpg,Albrecht_Dürer,0,Albrecht Dürer,Northern Renaissance,Northern Renaissance,,
4,Albrecht_Dürer_102.jpg,Albrecht_Dürer,0,Albrecht Dürer,Northern Renaissance,Northern Renaissance,,
5,Albrecht_Dürer_103.jpg,Albrecht_Dürer,0,Albrecht Dürer,Northern Renaissance,Northern Renaissance,,
6,Albrecht_Dürer_104.jpg,Albrecht_Dürer,0,Albrecht Dürer,Northern Renaissance,Northern Renaissance,,
7,Albrecht_Dürer_105.jpg,Albrecht_Dürer,0,Albrecht Dürer,Northern Renaissance,Northern Renaissance,,
8,Albrecht_Dürer_106.jpg,Albrecht_Dürer,0,Albrecht Dürer,Northern Renaissance,Northern Renaissance,,
9,Albrecht_Dürer_107.jpg,Albrecht_Dürer,0,Albrecht Dürer,Northern Renaissance,Northern Renaissance,,


In [15]:
# Display the (rows, columns) shape of the merged dataframe.
df_final.shape

(8355, 8)

In [16]:
# Display the (rows, columns) shape of the filenames dataframe
# (presumably to compare its row count with the merged dataframe's).
df_filenames.shape

(8355, 3)