# Repertoire

In [None]:
# Mount Google Drive when the notebook runs on Google Colab.
try:
  # google.colab can only be imported inside a Colab runtime.
  from google.colab import drive
  drive.mount('/gdrive')
except ImportError:
  # Only the "not on Colab" case is handled here; a bare `except:` would also
  # hide keyboard interrupts and genuine mount failures behind this message.
  print("vous n'êtes encore  sur Google Colab :(")

Mounted at /gdrive


In [None]:
# Project root on the mounted drive; every other directory is derived from it.
REP_PROJET = '/gdrive/My Drive/Colab Notebooks/Dedoublement/'

# Input, intermediate and output sub-directories (trailing slash included so
# file names can be appended directly).
REP_INPUT, REP_INTERMED, REP_OUTPUT = (
    REP_PROJET + sous_rep for sous_rep in ('Input/', 'Intermed/', 'Output/')
)

# MODULES

In [None]:
# Work from the project folder so that the relative paths used below
# (requirements.txt, 00_functions.ipynb) resolve.
%cd '/gdrive/My Drive/Colab Notebooks/Dedoublement/'
# Print the current directory to confirm the change.
!pwd

/gdrive/My Drive/Colab Notebooks/Dedoublement
/gdrive/My Drive/Colab Notebooks/Dedoublement


In [None]:
%%capture
# Install the dependencies listed in requirements.txt (resolved relative to the
# current directory); %%capture keeps pip's output out of the notebook.
! pip install -r requirements.txt

In [None]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from functools import partial

# FUNCTIONS

In [None]:
# Execute the helper notebook from the current directory.
# NOTE(review): recode_pressage and knn_country, called later in this notebook,
# are presumably defined there — confirm.
%run "00_functions.ipynb"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Inputs Discogs

Merge the extraction and aggregate data to artist + title level

In [None]:
# Load the pickled Discogs baseline table and drop the release-date column.
baseline = pd.read_pickle(REP_INTERMED + 'baseline.sav')
baseline = baseline.drop(columns=['released_date'])

# Title-case the country labels (e.g. 'Us', 'Uk', 'Nan' in the output below).
baseline['country'] = baseline['country'].str.title()

In [None]:
# Share of rows whose country is the 'Nan' placeholder, as a percentage with
# two decimals. The previous version closed round() too early: the ratio was
# rounded to an integer and the literal 2 was printed as an extra argument;
# the ratio was also never scaled to the "%" announced in the label.
print(" frequence de country manquant en %: ", round(100 * (baseline['country'] == 'Nan').mean(), 2))

 frequence de country manquant en %:  0 2


In [None]:
# Number of rows per distinct country string, most frequent first; multi-country
# strings (joined with '¤') show up as their own categories.
baseline['country'].value_counts().reset_index()

Unnamed: 0,index,country
0,Us,1838701
1,Uk,897272
2,Germany,674060
3,France,423732
4,Italy,350008
...,...,...
189903,Uk ¤ Canada ¤ Us ¤ Australia ¤ New Zealand ¤ G...,1
189904,Germany ¤ Us ¤ Scandinavia ¤ Canada ¤ Italy ¤ ...,1
189905,Us ¤ Uk ¤ Canada ¤ Netherlands ¤ New Zealand ¤...,1
189906,Uk ¤ Us ¤ Canada ¤ Australia ¤ Germany ¤ Brazi...,1


# Recuperer l'ensemble des pays

In [None]:
# Distinct raw country strings, with the 'Usa' spelling folded into 'Us'.
list_temp = [
    valeur.replace('Usa', 'Us')
    for valeur in baseline['country'].unique().tolist()
]

In [None]:
# Split every raw country string into its individual country names.
# The separators '¤', '&' and ';' are first turned into commas, then the space
# before/after a comma is removed, and the string is split on ','.
elements = []
for country in list_temp:
    normalise = country
    for ancien, nouveau in (('¤', ','), ('&', ','), (';', ','), (' ,', ','), (', ', ',')):
        normalise = normalise.replace(ancien, nouveau)
    elements.append(normalise.split(','))

# Flatten into the set of distinct names (empty tokens are still included here).
list_country = {mot for element in elements for mot in element}


In [None]:
# Drop the empty token produced by the split. discard() is used instead of
# remove() so the cell does not raise KeyError when '' is absent (for example
# when the cell is executed a second time).
list_country.discard('')
# 'Nan' is deliberately kept for now: the baseline dictionary built later still
# needs a 'Nan' entry.
#list_country.remove('Nan')

In [None]:
#with open(REP_INTERMED + 'list_country.pkl','wb') as f :
#     pickle.dump(list_country, f)

**!!!!!!!!!!! La cellule ci-dessous sert à batcher les index du dataframe cdandlp par pays ! Pas besoin de l'exécuter une deuxième fois, car l'output sera sauvegardé en pkl pour faciliter sa lecture.** Temps d'exécution : 2 min

In [None]:
# list_baseline_country =[ baseline[baseline['country'].isin([i, 'Nan'])]  for i in list_country ]
# with open(REP_INTERMED + 'list_baseline_country.pkl','wb') as f :
#      pickle.dump(list_baseline_country, f)

# Inputs CDandLP

Scope : 
+ on articles for which we do not know the correspondence on the discogs (id_release=0) 
+ on articles where a sql query can be used to perfectly match discogs and cdandlp data

## Données brutes

In [None]:
import pandas as pd

# Columns that were once candidates for removal (kept for reference, unused):
#to_drop = ['n_ident',	'n_categ1',	'n_categ2',	'code_upload_not_null',	'courtdesc_lg1',	'courtdesc_lg2',	'format'	,'annee',	'label']

# Load the pickled CDandLP table.
df = pd.read_pickle(REP_INTERMED + 'base_cdandlp.sav')

# Matching text = artist, one space, title (missing values propagate as NaN).
df['text'] = df['artiste'].add(' ').add(df['titre'])
df.head()

Unnamed: 0,n_ref,artiste,titre,pressage,text
0,113543790,Francis Cabrel,Les Chemins De Traverse Canada,canada,Francis Cabrel Les Chemins De Traverse Canada
1,119486700,Rina Ketty,La Madone Aux Fleurs+3,,Rina Ketty La Madone Aux Fleurs+3
2,117183052,Patrick Moraz,Future Memories : Patrick Moraz Live On Tv,67.435 - france,Patrick Moraz Future Memories : Patrick Moraz ...
3,113717854,Anal Vomit,Demoniac Flagellations,MAP009 - Peru,Anal Vomit Demoniac Flagellations
4,115051043,Eddie Palmieri,Sueño,USA,Eddie Palmieri Sueño


In [None]:
# Fraction (not percentage) of rows with no 'pressage' value, two decimals.
print(" frequence de pressage manquant en : ", round(int(df['pressage'].isna().sum()) / len(df), 2))

 frequence de pressage manquant en :  0.17


## Data cleaning

In [None]:
# Normalise 'pressage': title-case, trim surrounding whitespace, and mark
# missing values with the 'Vide' placeholder.
df['pressage'] = df['pressage'].str.title().str.strip().fillna('Vide')

# Project helper defined outside this cell.
# NOTE(review): presumably adds the 'pays_clean' column read further down — confirm.
df = recode_pressage(df)



## Dict of Dataframes

In [None]:
# Group the baseline rows by country: each key is a country present in
# list_country, each value the list of
# (country, artist, title, text_CLEAN, master_id) tuples for that country.
dict_country_baseline = {}
for ligne in zip(baseline['country'], baseline['artist'], baseline['title'],
                 baseline['text_CLEAN'], baseline['master_id']):
    # Rows whose country string is not a single known country are skipped.
    if ligne[0] in list_country:
        dict_country_baseline.setdefault(ligne[0], []).append(ligne)

# Persist the grouping so it can be reloaded without recomputing it.
with open(REP_INTERMED + 'dict_country_baseline.pkl', 'wb') as f:
    pickle.dump(dict_country_baseline, f)

In [None]:
# 'Nan' is not a usable pressing country, so it is dropped from the set before
# grouping. discard() is used instead of remove() so that running this cell a
# second time does not raise KeyError.
list_country.discard('Nan')

# Group the CDandLP rows by cleaned pressing country: each value is the list of
# (pays_clean, text, n_ref, artiste, titre) tuples for that country.
dict_country_cdandlp = {}
for pressage, text, n_ref, artiste, titre in zip(df['pays_clean'], df['text'], df['n_ref'], df['artiste'], df['titre']):
    # Rows whose country is not in list_country are skipped.
    if pressage in list_country:
        dict_country_cdandlp.setdefault(pressage, []).append((pressage, text, n_ref, artiste, titre))

# Persist the grouping so it can be reloaded without recomputing it.
with open(REP_INTERMED + 'dict_country_cdandlp.pkl', 'wb') as f:
    pickle.dump(dict_country_cdandlp, f)

**!!!!!!! The cell below writes its outputs to disk**

In [None]:
# Column layouts of the tuples stored in the per-country dictionaries.
# They are declared here because this cell runs before the reload cell further
# down, which was the only place defining them: on a fresh top-to-bottom run
# this cell raised NameError.
col_baseline = ['country', 'artist', 'title', 'text_CLEAN', 'master_id']
col_cdandlp = ['pays_clean', 'text', 'n_ref', 'artiste', 'titre']

# Candidate items: baseline rows with an unknown country ('Nan') followed by
# the rows for France.
# NOTE(review): the reload cell concatenates France first and derives the
# results file name from df_items.loc[0, 'country']; here 'Nan' comes first —
# confirm knn_country does not depend on the row order.
df_items = pd.concat(
    [
        pd.DataFrame(dict_country_baseline['Nan'], columns=col_baseline),
        pd.DataFrame(dict_country_baseline['France'], columns=col_baseline),
    ],
    ignore_index=True,
)

# Query rows: CDandLP articles pressed in France.
df_sample = pd.DataFrame(dict_country_cdandlp['France'], columns=col_cdandlp)

# Project helper defined outside this cell.
answer = knn_country(df_items, df_sample)

# Exécutions : prochain passage

Les codes ci-après permettent de recharger les résultats des étapes précédentes sans avoir à exécuter de nouveau les cellules précédentes.

In [None]:
# Per-country tables: column layouts of the tuples stored in each dictionary.
col_baseline = ['country', 'artist', 'title', 'text_CLEAN', 'master_id']
col_cdandlp = ['pays_clean', 'text', 'n_ref', 'artiste', 'titre']

# Reload the two per-country groupings saved by the earlier cells.
# NOTE: pickle files must come from a trusted source (loading runs arbitrary code).
with open(REP_INTERMED + 'dict_country_baseline.pkl', 'rb') as f:
    dict_country_baseline = pickle.load(f)

with open(REP_INTERMED + 'dict_country_cdandlp.pkl', 'rb') as f:
    dict_country_cdandlp = pickle.load(f)

# Candidate items: France rows first, then the rows with unknown country.
df_items = pd.concat(
    [pd.DataFrame(dict_country_baseline[cle], columns=col_baseline) for cle in ('France', 'Nan')],
    ignore_index=True,
)

# Query rows: CDandLP articles pressed in France.
df_sample = pd.DataFrame(dict_country_cdandlp['France'], columns=col_cdandlp)

# Results file: its name is prefixed with the country of the first item row.
with open(REP_INTERMED + df_items.at[0, 'country'] + '_results_sim.pkl', 'rb') as f:
    answer = pickle.load(f)

:

In [None]:
# Stack the reloaded results into a single frame and preview the first rows.
# NOTE(review): assumes `answer` is a list of DataFrames (one per query) — confirm.
tmp = pd.concat(answer)
tmp.head()

Unnamed: 0,id_query,master_id,artist,title,similarity,rank,levenshtein,jaro_winkler,jaccard,overlap,hamming,fuzzy_partial
0,117183052,117183052,Patrick Moraz,Future Memories : Patrick Moraz Live On Tv,-1.0,0,1.0,1.0,1.0,1.0,1.0,1.0
1,117183052,1306989,Patrick Fiori,4 Mots,65.678958,1,0.485714,0.692696,0.346154,0.947368,0.176471,0.63
2,117183052,0,Tripis,Crazy Memories,65.678958,2,0.405797,0.622004,0.352941,1.0,0.019608,0.56
3,117183052,0,Patrick Forgas,Monks,63.564173,3,0.463768,0.828023,0.326923,0.944444,0.215686,0.67
4,117183052,0,Patrick Fiori - Patrick Bruel,Corsica,63.157959,4,0.581395,0.822409,0.535714,0.857143,0.215686,0.6
