In [None]:
import pickle as pickle
import re
import string
import warnings
from datetime import datetime
from functools import partial

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from matplotlib.ticker import PercentFormatter
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, roc_auc_score , accuracy_score, precision_score, recall_score,  f1_score, roc_curve, auc
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

# Snowball stemmer was chosen in favor of Porter Stemmer which is a bit more aggressive and tends to remove too much from a word
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("stopwords")

# unidecode is the library needed for ASCII folding
from unidecode import unidecode
# Compact Language Detector v3 is a very fast and performant algorithm by Google for language detection: more info here: https://pypi.org/project/pycld3/
import cld3

from textdistance import damerau_levenshtein, jaro_winkler, sorensen_dice, jaccard, overlap, ratcliff_obershelp,hamming
from ngram import NGram
import Levenshtein
from fuzzywuzzy import fuzz

# Strategy for text treatment

1. Language detection - for now the default language is English and we switch to French if cld3 detects it. Should we consider other languages too?
2. Remove punctuation and special characters
3. Tokenization
4. Stop-word removal - stop-word removal is language-based and is done before the stemming, otherwise the stop words might not be detected. Note: I think we could drop this step here, because the texts are too short and are written in different languages.
5. Stemming - stemming is performed instead of lemmatization, as we're going to be working mainly with names and not even entire sentences. Since lemmatizing depends on the sentence context, it would not be a good option here.
6. ASCII folding

In [None]:
# Tokens that make_text_prep_func discards by default (used as its stop-word list).
name_column_blacklist = [
    "feat",
    "and",
    "featuring",
    "et",
    "+",
    "&",
    "vs",
]

# Regex pattern -> replacement pairs that make_text_prep_func applies, in this
# order, to the ASCII-folded text.
name_column_regex_replace = dict(
    [
        (r"\'", ""),
        (r"\s+", " "),
    ]
)

In [None]:
# One Snowball stemmer per supported language, instantiated once and shared
# by every call to the text-preparation function.
STEMMER_EN, STEMMER_FR = (
    SnowballStemmer(language=lang) for lang in ("english", "french")
)

In [None]:
def make_text_prep_func(row, 
                        word_blacklist=name_column_blacklist,
                        regex_replace=name_column_regex_replace, 
                        colonne=None) :
    """
      This function treats the input string by going through the following steps:
      1. Language detection
      2. Remove punctuation and special characters
      3. Tokenization
      4. Stop-word removal
      5. Stemming
      6. ASCII folding

      Arguments:
      row -- The input to be treated. When `colonne` is None, `row` itself is
             converted with str(); otherwise `row[colonne]` is used.
      word_blacklist -- Tokens removed after tokenization (acts as the
             stop-word list for every language).
      regex_replace {dict} -- Mapping regex pattern -> replacement, applied in
             iteration order to the ASCII-folded result.
      colonne -- Optional key/column name to read the text from `row`.

     Returns:
      str -- The treated version of the string, stripped of surrounding whitespace.
    """
    raw_text = row if colonne is None else row[colonne]

    # str() on both paths so that non-string cells do not crash on .lower();
    # convert to lowercase, just to be sure :)
    s = str(raw_text).lower()

    # English stemmer by default, French stemmer when cld3 detects French.
    # cld3.get_language may return None when it cannot produce a prediction
    # (e.g. for empty text), so guard before indexing the result.
    s_lang = cld3.get_language(s)
    is_french = s_lang is not None and s_lang[0] == "fr"
    stemmer = STEMMER_FR if is_french else STEMMER_EN

    # the same blacklist is used as stop-word list whatever the language
    stop_words = word_blacklist

    # replace every punctuation character by a space
    s_clean = s.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # tokenize the string into words
    s_tokens = word_tokenize(s_clean)

    # remove the stop-word tokens, then stem what is left
    s_tokens_stemmed = [stemmer.stem(word) for word in s_tokens if word not in stop_words]

    # join the stemmed tokens together and ASCII fold
    s_ascii = unidecode(" ".join(s_tokens_stemmed))

    for regex, replace in regex_replace.items():
        s_ascii = re.sub(regex, replace, s_ascii)

    return s_ascii.strip()

## Similarity functions

In [None]:
def compound_similarity(row: pd.Series, col1: str, col2: str) -> list:
    
    """Compute six string-similarity scores between two text fields of a row.

    The scores are returned in this fixed order (callers label them
    'levenshtein', 'jaro_winkler', 'jaccard', 'overlap', 'hamming',
    'fuzzy_partial'):
         - Levenshtein.ratio
         - textdistance jaro_winkler normalized similarity
         - textdistance jaccard normalized similarity
         - textdistance overlap normalized similarity
         - textdistance hamming normalized similarity
         - fuzzywuzzy partial_ratio, rescaled from 0-100 to 0-1

    Arguments:
      row -- Mapping-like row (e.g. the Series passed by DataFrame.apply(axis=1)).
      col1 {str} -- Key of the first string in `row`.
      col2 {str} -- Key of the second string in `row`.

    Returns:
      list -- The six scores listed above. None and NaN inputs are treated as
        empty strings; when both strings are empty the result is six zeros,
        so the output always has the same length.
    """

    def _as_text(value):
        # None and float NaN (pandas' missing-value marker) count as empty text
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return value

    s1 = _as_text(row[col1])
    s2 = _as_text(row[col2])

    if s1 == "" and s2 == "":
        # Keep the six-score shape: callers stack the results into six columns.
        return [0.0] * 6

    scores = [
        Levenshtein.ratio(s1, s2),
        jaro_winkler.normalized_similarity(s1, s2),
        jaccard.normalized_similarity(s1, s2),
        overlap.normalized_similarity(s1, s2),
        hamming.normalized_similarity(s1, s2),
        fuzz.partial_ratio(s1, s2) / 100,
    ]

    return scores

# ML functions

In [None]:
def customize_corr(df: pd.DataFrame) :
    
    """ 
      Draw the lower triangle of the correlation matrix of `df` as an
      annotated heatmap.

    Arguments:
        df - dataframe with features

    Returns:
        None. The heatmap is drawn on a newly created matplotlib figure.
    """

    # compute the correlation matrix once and reuse it for mask and heatmap
    corr = df.corr()

    plt.figure(figsize=(16, 10))

    # mask the upper triangle (diagonal included) so only the lower part is shown
    mask = np.triu(np.ones_like(corr, dtype=bool))
    heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='magma')
    heatmap.set_title('Lower Correlation Matrix', fontdict={'fontsize':18}, pad=16)

In [None]:
def confusio_matrix(y_test, y_predicted):
    """
    Print accuracy, recall and precision, then plot the confusion matrix of a
    binary classification.

    Arguments:
     y_test -- reference labels
     y_predicted -- predicted labels, passed as-is to sklearn's confusion_matrix

    Returns:
     None. The metrics are printed and the matrix is shown with matplotlib.

    Note:
     The matrix is unpacked into exactly four cells (tn, fp, fn, tp), so the
     unpacking raises ValueError when the matrix is not 2x2.
    """
    # compute the matrix once and reuse it for both the metrics and the plot
    cm = confusion_matrix(y_test, y_predicted)
    tn, fp, fn, tp = cm.astype(int).ravel()

    def _ratio(numerator, denominator):
        # an empty denominator yields nan without emitting a divide warning
        return round(numerator / denominator, 2) if denominator else float("nan")

    print("Accuracy:", _ratio(tp + tn, tp + tn + fp + fn))
    print("Recall:", _ratio(tp, tp + fn))
    print("precision:", _ratio(tp, tp + fp))

    plt.figure(figsize=(5,5))
    plt.clf()
    plt.imshow(cm, interpolation='nearest',cmap=plt.cm.Wistia)
    classNames = ['Negative','Positive']
    plt.title('Matrice de confusion')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(classNames))
    plt.xticks(tick_marks, classNames, rotation=45)
    plt.yticks(tick_marks, classNames)

    # annotate each cell with its name and its count
    cell_names = [['TN','FP'], ['FN', 'TP']]
    for i in range(2):
        for j in range(2):
            plt.text(j,i, str(cell_names[i][j])+" = "+str(cm[i][j]))
    plt.show()

In [None]:
def  courbe_roc(colonne_ref, colonne_model) :
  
    """ 
      Plot the ROC curve, display the AUC and the optimal threshold for
      individual assignment.
      By default, in ML algorithms, we assign 1 or 0 based on a threshold of 0.5.
      Here the optimal threshold is the ROC point closest to the top-left
      corner (FPR = 0, TPR = 1), i.e. the one that minimises the false positive
      rate while maximising the true positive rate.

    Arguments:

        colonne_ref   - column with labels (ref)
        colonne_model - probability of the model 

    Returns:
        The optimal threshold (previously nothing was returned although the
        description promised it).
    """

    fpr, tpr, thresholds = roc_curve(colonne_ref,colonne_model)
    roc_auc = auc(fpr, tpr)

    # index of the ROC point with the smallest squared distance to (0, 1)
    best = np.argmin((0-fpr)**2 + (1-tpr)**2)
    optimal_thr = thresholds[best]
    optimal_tpr = tpr[best]
    optimal_fpr = fpr[best]

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='orange',lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.scatter(optimal_fpr, optimal_tpr, color="red", lw=lw, label=f"Opt. similarity threshold: {optimal_thr}\nOpt. TPR:{round(optimal_tpr, 2)}, Opt. FPR: {round(optimal_fpr, 2)}")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Matching  ROC curve')
    plt.legend(loc="lower right")
    plt.show()

    return optimal_thr

In [None]:
def create_vectorizer_and_vectorized_db(df_items, analyzer='word',ngram_range=(1,1)):

    """
    Fit a TF vectorizer (use_idf=False) on the reference base, pickle both the
    vectorized base and the vectorizer, and return them.

    Arguments:
        df_items    - reference dataframe; its 'text_CLEAN' column is vectorized
        analyzer    - 'word' for word tokens; any other value builds a
                      character n-gram vectorizer on the text with spaces removed
        ngram_range - n-gram range used by the character vectorizer and in the
                      names of the pickle files.
                      NOTE(review): it is not passed to the 'word' vectorizer,
                      which therefore always uses the library default.

    Returns:
        (X_train, vectorizer) - the vectorized reference base and the fitted
        vectorizer (previously nothing was returned although the description
        promised it).

    Side effects:
        Writes '<analyzer>_<ngram suffix>_Xtrain.pkl' and
        '<analyzer>_<ngram suffix>_vectorizer.pkl' under REP_INTERMED.
    """

    # build the vectorizer and the vectorized base
    print("**** creation du vectorizer")
    if analyzer == 'word':
        vectorizer = TfidfVectorizer(analyzer = 'word', use_idf = False)
        X_train = vectorizer.fit_transform(df_items['text_CLEAN'])
    else:
        vectorizer = TfidfVectorizer(analyzer = 'char', ngram_range=ngram_range , use_idf = False)
        X_train = vectorizer.fit_transform(df_items['text_CLEAN'].str.replace(' ', '', regex=False))

    # vocabulary_ is used for the count because get_feature_names() no longer
    # exists in recent scikit-learn releases
    print("nb features:", len(vectorizer.vocabulary_))

    print("**** sauvegarde des fichiers en pickle pour la prochaine fois")
    # e.g. (1, 1) -> "1_1"
    suffix_gram = re.sub(r"[(,)]", "", str(ngram_range)).replace(' ','_')
    prefix = REP_INTERMED + analyzer + '_' + suffix_gram
    with open(prefix + '_Xtrain.pkl', 'wb') as f1:
        pickle.dump(X_train, f1)

    with open(prefix + '_vectorizer.pkl', 'wb') as f2:
        pickle.dump(vectorizer, f2)

    return X_train, vectorizer

In [None]:
def reload_data_and_vectorizer(analyzer = None, ngram_range=None) :
    """
    Reload a pickled vectorized base and its vectorizer from REP_INTERMED.

    Arguments:
        analyzer    - analyzer name used in the file names; None together with
                      ngram_range=None selects the un-prefixed files
                      'Xtrain.pkl' and 'vectorizer.pkl'
        ngram_range - n-gram range used in the file names

    Returns:
        (X_train, vectorizer) - the unpickled objects.

    SECURITY: pickle.load can execute arbitrary code; only load files that
    this project produced itself.
    """
    if analyzer is None and ngram_range is None:
        prefix = REP_INTERMED
    else:
        # e.g. (1, 1) -> "1_1"
        suffix_gram = re.sub(r"[(,)]", "", str(ngram_range)).replace(' ','_')
        prefix = REP_INTERMED + analyzer + '_' + suffix_gram + '_'

    with open(prefix + 'Xtrain.pkl', 'rb') as f:
        X_train = pickle.load(f)

    with open(prefix + 'vectorizer.pkl', 'rb') as handle:
        vectorizer = pickle.load(handle)

    return X_train, vectorizer


def text_to_vect(source_query, vectorizer):
    """Normalise a raw query text and transform it with a fitted vectorizer.

    Arguments:
        source_query - raw query text
        vectorizer   - fitted vectorizer exposing transform()

    Returns:
        The result of vectorizer.transform on the single normalised query.
    """
    # same normalisation as the one applied with the module-level defaults
    cleaned_query = make_text_prep_func(
        source_query,
        name_column_blacklist,
        name_column_regex_replace,
    )

    # the vectorizer expects an iterable of documents, hence the one-item list
    return vectorizer.transform([cleaned_query])

In [None]:
def knn_scratch(source_query, vectorizer, X_train, baseline, sample, nb_top):
    """
    Find the `nb_top` reference rows most similar (cosine) to one query and
    enrich them with string-similarity scores.

    Arguments:
        source_query - pair [row label in `sample`, raw query text]
        vectorizer   - fitted vectorizer used to transform the query
        X_train      - vectorized reference base
        baseline     - reference dataframe; must contain an 'index' column
                       holding the row position in X_train, plus 'text_CLEAN',
                       'master_id', 'artist' and 'title'
        sample       - dataframe of queries ('n_ref', 'text', 'artiste', 'titre')
        nb_top       - number of neighbours to keep (capped to len(baseline))

    Returns:
        pd.DataFrame - first row describes the query itself (similarity -1,
        rank 0, every string score 1), followed by the neighbours sorted by
        decreasing similarity (expressed on a 0-100 scale).
    """
    # source_query = [0, 'Bob Marley Is this love']
    nb_top = min(nb_top, len(baseline))
    indice, raw_query = source_query[0], source_query[1]

    # normalisation then vectorisation of the query text
    query = make_text_prep_func(raw_query, name_column_blacklist, name_column_regex_replace)
    vectorized_querie = vectorizer.transform([query])

    # similarity between every reference row and the query: one column array
    mat_sim = cosine_similarity(X_train, vectorized_querie)

    def top_k(x, k):
        # positions of the k largest values of x, ordered by increasing value
        ind = np.argpartition(x, -1 * k)[-1 * k:]
        return ind[np.argsort(x[ind])]

    topsim = np.apply_along_axis(lambda x: top_k(x, nb_top), 0, mat_sim)
    topsim2 = pd.DataFrame(topsim, columns=['index'])
    # direct 2-D indexing instead of float() on a one-element array
    topsim2['similarity'] = mat_sim[topsim2['index'].to_numpy(), 0] * 100.0

    # bring back the original information of the selected reference rows
    df_nearest = pd.merge(baseline, topsim2, on = 'index', how='inner')
    df_nearest["rank"] = df_nearest['similarity'].rank(method ='min',ascending = False).astype(int)
    df_nearest['id_query'] = sample.loc[indice,'n_ref']
    df_nearest['text_query'] = make_text_prep_func(sample.loc[indice,'text'])

    distances = ['levenshtein', 'jaro_winkler', 'jaccard', 'overlap', 'hamming', 'fuzzy_partial']
    col_list_values = df_nearest.apply(lambda p:compound_similarity(p, "text_CLEAN", "text_query"), axis=1)

    output_columns = ['id_query', 'master_id', 'artist', 'title', 'similarity', 'rank']
    df_nearest = pd.concat(
        [
            df_nearest[output_columns],
            pd.DataFrame(np.column_stack(col_list_values).T, columns=distances),
        ],
        axis=1,
    )
    df_nearest.sort_values(by='similarity', ascending=False, inplace=True)

    # Header row identifying the query. np.array coerces these mixed values to
    # one common dtype, exactly as before.
    new_row = pd.DataFrame(
        np.array([[
            sample.loc[indice, 'n_ref'],
            sample.loc[indice, 'n_ref'],
            sample.loc[indice, 'artiste'],
            sample.loc[indice, 'titre'],
            float(-1),
            0,
            1, 1, 1, 1, 1, 1,
        ]]),
        columns=output_columns + distances,
    )

    # DataFrame.append no longer exists in pandas 2.x; pd.concat is equivalent
    return pd.concat([new_row, df_nearest], ignore_index=True)

In [None]:
def customize_output(i, results, df_baseline, sample) :

    """
    A function to make the output more appealing and informative.

    Arguments:
        i : index for row identification (label in `sample`, position in `results`)
        results : unsupervised knn api results (list of (distances, indexes) pairs)
        df_baseline : discogs data; must contain 'text_CLEAN', 'master_id',
                      'artist' and 'title'
        sample  :  sample of cdandlp datasets, size = 1000

    Return :
        pd.DataFrame - first row describes the query itself (distance -1,
        rank 0, every string score 1), followed by its neighbours sorted by
        increasing distance.

    Side effect :
        an 'index' column (copy of the index) is added to `df_baseline`.
    """
    distances, indices = results[i]

    # turn the raw result into a two-column dataframe
    temp = pd.concat([ pd.DataFrame(distances).T, pd.DataFrame(indices).T ], axis=1 )
    temp.columns = ['distance','index']
    temp['id_query'] = sample.loc[i,'n_ref']

    # enrich the neighbours with the reference data
    indexes = indices.flatten().tolist()
    df_baseline['index'] = df_baseline.index

    neighbors_sklearn_api = pd.merge(
        df_baseline[df_baseline['index'].isin(indexes)],
        temp,
        on='index',
        how='inner',
    ).sort_values(by='distance')

    # rank 1 = smallest distance
    neighbors_sklearn_api["rank"] = neighbors_sklearn_api['distance'].rank(method ='min',ascending=True).astype(int)

    neighbors_sklearn_api['text_query'] = make_text_prep_func(sample.loc[i,'text'])
    # fresh 0..n-1 index so the score columns below align row by row
    neighbors_sklearn_api.reset_index(drop=True, inplace=True)
    colonnes = ['levenshtein', 'jaro_winkler', 'jaccard', 'overlap', 'hamming', 'fuzzy_partial']

    col_list_values = neighbors_sklearn_api.apply(lambda p:compound_similarity(p, "text_CLEAN", "text_query"), axis=1)
    neighbors_sklearn_api[colonnes] = pd.DataFrame(np.column_stack(col_list_values).T, columns=colonnes)

    # keep only the output columns, in their final order
    column_order = ['id_query', 'master_id', 'artist', 'title', 'distance','rank']  +  colonnes
    neighbors_sklearn_api = neighbors_sklearn_api[column_order]

    # Header row identifying the query. np.array coerces these mixed values to
    # one common dtype, exactly as before.
    new_row = pd.DataFrame(
        np.array([[
            sample.loc[i, 'n_ref'],
            sample.loc[i, 'n_ref'],
            sample.loc[i, 'artiste'],
            sample.loc[i, 'titre'],
            float(-1),
            0, 1, 1, 1, 1, 1, 1,
        ]]),
        columns=column_order,
    )

    # DataFrame.append no longer exists in pandas 2.x; pd.concat is equivalent
    return pd.concat([new_row, neighbors_sklearn_api], ignore_index=True)

In [None]:
def customize_output1(i, results, df_baseline, sample) :

    """
    A function to make the output more appealing and informative.

    Arguments:
        i : index for row identification (position in `results`, row of `sample`)
        results : unsupervised knn api results (list of (distances, indexes) pairs)
        df_baseline : discogs data 
        sample  :  sample of cdandlp datasets, size = 1000; its first column is
                   used as the query identifier and 'artiste' / 'titre' build
                   the QUERY text

    Return :
        pd.DataFrame - the neighbours of query `i`, sorted by increasing
        distance, with 'rank' (1 = nearest), 'id_query' and 'QUERY' columns.

    Side effect :
        an 'index' column (copy of the index) is added to `df_baseline`.
    """
    distances, indices = results[i]

    # turn the raw result into a two-column dataframe
    temp = pd.concat([ pd.DataFrame(distances).T, pd.DataFrame(indices).T ], axis=1 )
    temp.columns = ['distance','index']
    temp['id_query'] = sample.iloc[i,0]

    # enrich the neighbours with the reference data
    indexes = indices.flatten().tolist()
    df_baseline['index'] = df_baseline.index

    neighbors_sklearn_api = pd.merge(
        df_baseline[df_baseline['index'].isin(indexes)],
        temp,
        on='index',
        how='inner',
    ).sort_values(by='distance')

    # A smaller distance means a closer neighbour, so rank in ascending order
    # (rank 1 = nearest), consistent with the sort order above.
    neighbors_sklearn_api["rank"] = neighbors_sklearn_api['distance'].rank(method ='min',ascending=True).astype(int)
    neighbors_sklearn_api["QUERY"] = sample.loc[i, 'artiste'] + " " + sample.loc[i, 'titre']
    neighbors_sklearn_api.drop(['index'],axis=1,inplace=True)

    return neighbors_sklearn_api

In [None]:
def create_vectorizer(i, df_items, df_sample):

    """
    JUST FOR BLOCKING

    Fit a character-bigram vectorizer on `df_items`, query the nearest
    neighbours (cosine distance) of every text of `df_sample`, and pickle the
    raw results.

    Arguments:
        i         - batch identifier, used in the output file name and printed
        df_items  - reference dataframe with a 'text_CLEAN' column
        df_sample - queries dataframe with a 'text' column, addressed with the
                    labels 0..len(df_sample)-1

    Returns:
        None. When `df_items` is empty a message is printed and nothing is
        written; otherwise the list of kneighbors results is pickled to
        REP_INTERMED + 'batch_<i>_results_sim_cosine.pkl'.
    """

    if df_items.empty:
        print('DataFrame is empty!')
        return

    # build the vectorizer and the vectorized base (spaces removed beforehand)
    vectorizer = TfidfVectorizer(analyzer = 'char', ngram_range=(2,2) , use_idf = False)
    X_train = vectorizer.fit_transform(df_items['text_CLEAN'].str.replace(' ', '', regex=False))

    path = REP_INTERMED + 'batch_' + str(i) + '_results_sim_cosine.pkl'

    # kneighbors rejects n_neighbors larger than the number of fitted rows,
    # so cap it for small batches
    n_neighbors = min(20, X_train.shape[0])
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine",algorithm='brute', n_jobs=-1).fit(X_train)

    # a distinct loop variable avoids confusion with the batch identifier `i`
    result = [
        nbrs.kneighbors(text_to_vect(df_sample.loc[row_label, 'text'], vectorizer))
        for row_label in range(len(df_sample))
    ]

    print(i)

    with open(path, 'wb') as f:
        pickle.dump(result, f)





In [None]:
def knn_words_distance(indice, baseline, sample, nb_top):
    """
    Rank the rows of `baseline` against one query using string similarities only.

    Arguments:
        indice   - label of the query row in `sample`
        baseline - candidate rows; must contain 'text_CLEAN', 'text_query',
                   'master_id', 'artist' and 'title'
        sample   - dataframe of queries ('n_ref', 'artiste', 'titre')
        nb_top   - only rows with rank < nb_top are returned

    Returns:
        pd.DataFrame - first row describes the query itself (rank 0, every
        score 1), followed by the candidates sorted by decreasing
        'levenshtein' score. `baseline` is left untouched.
    """
    distances = ['levenshtein', 'jaro_winkler', 'jaccard', 'overlap', 'hamming', 'fuzzy_partial']
    id_query = sample.loc[indice, 'n_ref']

    # compute the six similarity scores of every candidate
    col_list_values = baseline.apply(lambda p:compound_similarity(p, "text_CLEAN", "text_query"), axis=1)
    # reuse the candidates' index so the concat below aligns row by row
    scores = pd.DataFrame(np.column_stack(col_list_values).T, columns=distances, index=baseline.index)

    # merge the scores with the identification columns, without mutating `baseline`
    identity = baseline[['master_id', 'artist', 'title']].copy()
    identity.insert(0, 'id_query', id_query)
    df_nearest = pd.concat([identity, scores], axis=1)
    df_nearest.sort_values(by='levenshtein', ascending=False, inplace=True)
    df_nearest["rank"] = df_nearest['levenshtein'].rank(method ='min',ascending = False).astype(int)

    # First row for the query's identification. It carries rank 0: without a
    # rank its value would be NaN and the final filter would always drop it.
    query_row = {
        'id_query': id_query,
        'master_id': id_query,
        'artist': sample.loc[indice, 'artiste'],
        'title': sample.loc[indice, 'titre'],
    }
    query_row.update({name: 1.0 for name in distances})
    query_row['rank'] = 0
    new_row = pd.DataFrame([query_row])

    # DataFrame.append no longer exists in pandas 2.x; pd.concat is equivalent
    df_nearest = pd.concat([new_row, df_nearest], ignore_index=True)

    return df_nearest[df_nearest["rank"] < nb_top]

In [None]:
def knn_country(df_items, df_sample) :
    """
    Run knn_scratch for every query of `df_sample` against `df_items` and
    pickle the list of results.

    Arguments:
        df_items  - reference dataframe with 'text_CLEAN' and 'country' columns
        df_sample - queries dataframe with a 'text' column

    Returns:
        list - one knn_scratch result per query, in the order of `df_sample`.

    Side effects:
        Adds an 'index' column (row position) to `df_items` and writes
        REP_INTERMED + '<country of the first row>_results_sim.pkl'.
    """
    # training and vectorisation (character bigrams on the text without spaces)
    vectorizer = TfidfVectorizer(analyzer = 'char', ngram_range=(2,2) , use_idf = False)
    X_train = vectorizer.fit_transform(df_items['text_CLEAN'].str.replace(' ', '', regex=False))

    # knn_scratch joins on the row position in X_train, so store positions
    # rather than index labels (identical for a default 0..n-1 index)
    df_items['index'] = np.arange(len(df_items))
    baseline = df_items.copy()

    # knn_scratch addresses the sample by label: give it a 0..n-1 index
    sample = df_sample.reset_index(drop=True)

    answer = [
        knn_scratch([position, text], vectorizer=vectorizer, X_train=X_train,
                    baseline=baseline, sample=sample, nb_top=20)
        for position, text in enumerate(sample['text'])
    ]

    # export, named after the country of the first reference row
    country = df_items['country'].iloc[0]
    with open(REP_INTERMED + country + '_results_sim.pkl', 'wb') as f:
        pickle.dump(answer, f)
    return answer

In [None]:
def knn_scratch_(source_query, vectorizer, X_train, baseline, sample, nb_top, indices):
    """
    Same search as knn_scratch, restricted to a subset of the reference rows.

    Arguments:
        source_query - pair [row label in `sample`, raw query text]
        vectorizer   - fitted vectorizer used to transform the query
        X_train      - vectorized reference base
        baseline     - dataframe describing the selected rows; its 'index'
                       column must hold the position within X_train[indices]
        sample       - dataframe of queries
        nb_top       - number of neighbours to keep
        indices      - rows of X_train to search in

    Returns:
        pd.DataFrame - see knn_scratch.
    """
    # The former body duplicated knn_scratch line for line; the only
    # difference was the row selection applied to X_train, so delegate.
    return knn_scratch(source_query, vectorizer, X_train[indices], baseline, sample, nb_top)

In [None]:
#def knn_words(df_items, df_sample, indices) :
#
#    df_items['index'] = df_items.index
#    answer = knn_scratch_([0, df_sample.loc[0,'text']] ,indices =indices ,vectorizer=vectorizer, X_train=X_train, baseline=df_items.copy(), sample =df_sample.copy(), nb_top=20)
#    return answer


In [None]:
def level1_knn_words(index, df, baseline, dict_words):
    """
    Run the blocked knn for one query: candidates are the reference rows that
    share at least one word with the query text.

    Arguments:
        index : row identification (label in `df`)
        df  :  sample of cdandlp datasets, size = 1000
        baseline : discogs data, addressed by row position
        dict_words  : dictionary word -> list of row positions in `baseline`

    Return :
        pd.DataFrame - knn_scratch_ output, or a single header row describing
        the query when no word of the query is found in `dict_words`.

    Note :
        relies on the module-level variables `vectorizer` and `X_train`, which
        must be defined before calling this function.
    """
    distances = ['levenshtein', 'jaro_winkler', 'jaccard', 'overlap', 'hamming', 'fuzzy_partial']

    row = df.loc[index, 'text']
    # retrieve the list of indices by word, skipping the unknown words
    indices_list = [
        found
        for found in (dict_words.get(key) for key in row.split())
        if found is not None
    ]
    # convert the list of lists to a flat list without duplicates
    indices_list_flat = list({item for sublist in indices_list for item in sublist})

    if indices_list_flat == []:
        # no candidate at all: return only the header row describing the query
        new_row = pd.DataFrame(
            np.array([[
                df.loc[index, 'n_ref'],
                df.loc[index, 'n_ref'],
                df.loc[index, 'artiste'],
                df.loc[index, 'titre'],
                float(-1), 0, 1, 1, 1, 1, 1, 1,
            ]]),
            columns=['id_query', 'master_id', 'artist', 'title', 'similarity','rank'] + distances,
        )
        return new_row

    # restrict the baseline to the candidates; after the reset, 'index' is the
    # position of each candidate inside X_train[indices_list_flat]
    df_baseline = baseline.iloc[indices_list_flat,].reset_index(drop=True)
    df_baseline['index'] = df_baseline.index

    # select the query by label, like every other access to `df` in this
    # function, and renumber it so that it sits at label 0
    df_sample = df.loc[[index]].reset_index(drop=True)

    answer = knn_scratch_(
        [0, df_sample.loc[0,'text']],
        indices=indices_list_flat,
        vectorizer=vectorizer,
        X_train=X_train,
        baseline=df_baseline.copy(),
        sample=df_sample.copy(),
        nb_top=50,
    )
    return answer


In [None]:
def customize_output_blocking(i, results, df_baseline, sample) :

    """
    A function to make the output more appealing and informative.

    Arguments:
        i : index for row identification (label in `sample`)
        results : unsupervised knn api result for this query, as a single
                  (distances, indexes) pair
        df_baseline : discogs data; must contain 'text_CLEAN', 'master_id',
                      'artist' and 'title'
        sample  :  sample of cdandlp datasets, size = 1000

    Return :
        pd.DataFrame - first row describes the query itself (distance -1,
        rank 0, every string score 1), followed by its neighbours sorted by
        increasing distance.

    Side effect :
        an 'index' column (copy of the index) is added to `df_baseline`.
    """
    distances, indices = results

    # turn the raw result into a two-column dataframe
    temp = pd.concat([ pd.DataFrame(distances).T, pd.DataFrame(indices).T ], axis=1 )
    temp.columns = ['distance','index']
    temp['id_query'] = sample.loc[i,'n_ref']

    # enrich the neighbours with the reference data
    indexes = indices.flatten().tolist()
    df_baseline['index'] = df_baseline.index

    neighbors_sklearn_api = pd.merge(
        df_baseline[df_baseline['index'].isin(indexes)],
        temp,
        on='index',
        how='inner',
    ).sort_values(by='distance')

    # rank 1 = smallest distance
    neighbors_sklearn_api["rank"] = neighbors_sklearn_api['distance'].rank(method ='min',ascending=True).astype(int)

    neighbors_sklearn_api['text_query'] = make_text_prep_func(sample.loc[i,'text'])
    # fresh 0..n-1 index so the score columns below align row by row
    neighbors_sklearn_api.reset_index(drop=True, inplace=True)
    colonnes = ['levenshtein', 'jaro_winkler', 'jaccard', 'overlap', 'hamming', 'fuzzy_partial']

    col_list_values = neighbors_sklearn_api.apply(lambda p:compound_similarity(p, "text_CLEAN", "text_query"), axis=1)
    neighbors_sklearn_api[colonnes] = pd.DataFrame(np.column_stack(col_list_values).T, columns=colonnes)

    # keep only the output columns, in their final order
    column_order = ['id_query', 'master_id', 'artist', 'title', 'distance','rank']  +  colonnes
    neighbors_sklearn_api = neighbors_sklearn_api[column_order]

    # Header row identifying the query. np.array coerces these mixed values to
    # one common dtype, exactly as before.
    new_row = pd.DataFrame(
        np.array([[
            sample.loc[i, 'n_ref'],
            sample.loc[i, 'n_ref'],
            sample.loc[i, 'artiste'],
            sample.loc[i, 'titre'],
            float(-1),
            0, 1, 1, 1, 1, 1, 1,
        ]]),
        columns=column_order,
    )

    # DataFrame.append no longer exists in pandas 2.x; pd.concat is equivalent
    return pd.concat([new_row, neighbors_sklearn_api], ignore_index=True)

# Recode pressage

In [None]:
# Patterns handed to ``DataFrame.replace(..., regex=True)``.
# NOTE(review): the keys are regular expressions searched anywhere in the cell,
# so '.' is a wildcard and short keys such as 'Al', 'Eu' or 'Ukr' also hit
# inside longer names. Several later rules ('Europerope', 'Ukraineaine',
# 'Albanialemagn', ...) appear to exist only to repair those artefacts, so the
# keys are deliberately kept exactly as they were.
_PRESSAGE_REGEX_MAP = {
    'Suisse': 'Switzerland',
    'Swiss': 'Switzerland',
    'Union Europeenne': 'Europe',
    'SB': 'Solomon Islands',
    'Fl': 'Liechtenstein',
    'Ukr': 'Ukraine',
    'Ukraineaineaineaineaineaine': 'Ukraine',
    'Ussrsia': 'Ussr',
    'French': 'France',
    'Francais': 'France',
    'Français': 'France',
    'Fr -': 'France',
    'Russia': 'Ussr',
    'United Kingdom': 'Uk',
    'Angleterre': 'Uk',
    'Royaume Uni': 'Uk',
    'Italie': 'Italy',
    'Italia': 'Italy',
    'Anglais': 'Uk',
    'England': 'Uk',
    'Usa': 'Us',
    'U.S.A': 'Us',
    'Us.': 'Us',
    'Belgique': 'Belgium',
    'Belge': 'Belgium',
    'España': 'Spain',
    'Espagne': 'Spain',
    'Europerope': 'Europe',
    'Grèce': 'Greece',
    'U.K.': 'Uk',
    'Holland': 'Netherlands',
    'Netherlandse': 'Netherlands',
    'Deutschland': 'Germany',
    'Germany.': 'Germany',
    '. Germany': 'Germany',
    'Japon': 'Japan',
    'E.U': 'Europe',
    'Al': 'Albania',
    'Eu': 'Europe',
    'London': 'Uk',
    'Gb': 'Uk',
    'é': 'e',
    'Ru': 'Ussr',
    'Nederland': 'Netherlands',
    'Brasil': 'Brazil',
    'Bresil': 'Brazil',
    'Coreen': 'South Korea',
    'Australiia': 'Australia',
    'Australie': 'Australia',
    'Pays Bas': 'Netherlands',
    'Etats Unis': 'Us',
    'United States': 'Us',
    'Esp': 'Spain',
    'G.B': 'Uk',
    'Swe': 'Sweden',
    'Original': '',
    '(Original)': '',
}

# Ordered (pattern, replacement) pairs applied with ``Series.str.replace``.
# They are run as regular expressions, which is what the implicit default did
# on the pandas 1.x line; only the first two contain regex metacharacters.
_PRESSAGE_SUBSTRING_STEPS = (
    # digits
    (r'\d+', ''),
    # needless words
    ('(Original)', ''),
    ('Made In', ''),
    ('Original', ''),
    ('Limited', ''),
    ('Press', ''),
    ('Biem', ''),
    # brute recoding of artefacts
    ('Europerope', 'Europe'),
    ('Netherlandse', 'Netherlands'),
    ('Nl', 'Netherlands'),
    ('Dutch', 'Netherlands'),
    ('Swedenden', 'Sweden'),
    ('Albanialemagn', 'Albania'),
    ('Albanial', 'Albania'),
    ('Albaniae', 'Albania'),
    ('WesGermany', 'Germany'),
)

# Whole-cell rewrites applied after surrounding whitespace has been stripped.
_PRESSAGE_EXACT_BEFORE_TRIM = {
    'German': 'Germany',
    'Gdr': 'Germany',
    'Bel': 'Belgium',
    'European Union': 'Europe',
    'Swedenden': 'Sweden',
    'Italyn': 'Italy',
    'U.K': 'Uk',
    'Netherlandsais': 'Netherlands',
    'Netherlandsland': 'Netherlands',
    'Albaniamand': 'Albania',
    'Franc -': 'France -',
    'Stereo Germany': 'Germany',
    'U.S': 'Us',
    'Fra': 'France',
    'Deutchland - Deutchland': ' Germany',
    'Autriche': 'Austria',
    'Made In Us': 'Us',
    'Canadien': 'Canada',
    'Can': 'Canada',
    'Pl': 'Poland',
    '- Ue': 'Europe',
    'Epc': 'Europe',
    'Ita': 'Italy',
}

# Whole-cell rewrites applied after leading/trailing '.', '(', ')', '-' and
# spaces have been trimmed.
_PRESSAGE_EXACT_AFTER_TRIM = {
    'U.S': 'Us',
    'Hol': 'Netherlands',
    'Ho': 'Netherlands',
    'Ukraineaine': 'Ukraine',
    'Atl Germany': 'Germany',
    'Fr': 'France',
    'Ecc': 'Europe',
    'Eec': 'Europe',
    'Albaniagerie': 'Albania',
    'Ue': 'Europe',
    'U.E': 'Europe',
    'Ca': 'Canada',
    'Europeropa': 'Europe',
    'The Netherlands': 'Netherlands',
    'Holl': 'Netherlands',
    'Netherland': 'Netherlands',
    'France C': 'France',
    'German': 'Germany',
    'De': 'Germany',
    'Ger': 'Germany',
    'France Sans': 'France',
    'Türkiye': 'Turkey',
}


def _replace_whole_cells(values, table):
    """Replace cells that are exactly equal to a key of ``table``; others (NaN included) are kept."""
    for source, target in table.items():
        values = values.mask(values == source, target)
    return values


def _first_known_country(parts, known):
    """Return the first stripped entry of ``parts`` found in ``known``, else the string 'NaN'."""
    for part in parts:
        candidate = part.strip()
        if candidate in known:
            # Return the stripped text: the membership test is done on it, so
            # handing back the raw piece would leak surrounding spaces.
            return candidate
    return 'NaN'


def recode_pressage(df, countries=None):
    """Normalise the free-text ``pressage`` column and derive a country from it.

    The steps run in a fixed order, because later rules repair artefacts left
    by earlier ones:

    1. regex substring mapping (``_PRESSAGE_REGEX_MAP``);
    2. removal of digits and filler words, then substring fixes
       (``_PRESSAGE_SUBSTRING_STEPS``);
    3. whitespace strip, ``'Usr'`` -> ``'Ussr'``, whole-cell fixes;
    4. trim of ``'.()- '`` at both ends, second round of whole-cell fixes;
    5. split on ``'-'`` into ``pays`` and pick the first known country.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pressage`` column holding strings (missing values
        are tolerated). The caller's frame is not modified.
    countries : container of str, optional
        Accepted country names. When omitted, the notebook-level
        ``list_country`` (defined outside this cell) is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``pressage`` cleaned plus two added columns:
        ``pays`` (list of the '-'-separated pieces, empty list for missing
        values) and ``pays_clean`` (first piece that is a known country,
        stripped, or the string ``'NaN'`` when none is).
    """
    if countries is None:
        countries = list_country  # notebook global, resolved at call time

    # DataFrame.replace returns a new object, so everything below works on a copy.
    df = df.replace({'pressage': _PRESSAGE_REGEX_MAP}, regex=True)

    pressage = df['pressage']
    # ``regex`` is spelled out: the implicit default flipped to False in
    # pandas 2.0, which would silently stop the digit removal.
    for pattern, replacement in _PRESSAGE_SUBSTRING_STEPS:
        pressage = pressage.str.replace(pattern, replacement, regex=True)

    pressage = pressage.str.strip()
    pressage = pressage.str.replace('Usr', 'Ussr', regex=False)
    pressage = _replace_whole_cells(pressage, _PRESSAGE_EXACT_BEFORE_TRIM)

    pressage = pressage.str.strip('.()- ')
    pressage = _replace_whole_cells(pressage, _PRESSAGE_EXACT_AFTER_TRIM)

    df['pressage'] = pressage
    # Missing values have no ``split``; give them an empty list of pieces
    # instead of raising AttributeError.
    df['pays'] = pressage.apply(
        lambda cell: cell.split('-') if isinstance(cell, str) else []
    )
    df['pays_clean'] = df['pays'].apply(
        lambda parts: _first_known_country(parts, countries)
    )

    return df