### Mapping the values with original data 

#### 1. Feature variations for sherlock data

In [17]:
import pandas as pd
import json
from collections import defaultdict
import numpy as np
import random

#### 2. Reading the mapping and features files

In [2]:
### Mapping workbook; later cells read its `type` and `Variations` columns
mapping_path = r'data/Column_Mapping.xlsx'
mapping_file = pd.read_excel(mapping_path)

In [3]:
### Load the JSON file holding the Sherlock feature records.
# The path uses a forward slash, matching the mapping-file path above. A
# backslash is only a separator on Windows; elsewhere it is treated as part of
# the file name and the open() call fails. Forward slashes work on both.
# The file is opened in binary mode; json.load() accepts bytes input and
# detects the UTF encoding itself.
with open('data/corrected_combined_with_features.json', 'rb') as f:
    feats_json = json.load(f)

#### 3.1.  Data related changes - Mapping File

In [4]:
### Reshape the mapping from wide format (one comma-separated `Variations`
### cell per type) to long format (one row per type/variation pair).
variation_lists = mapping_file.Variations.str.split(',').tolist()
split_df = (
    # One row per type, one column per variation position; types with fewer
    # variations than the longest list are padded with missing values.
    pd.DataFrame(variation_lists, index=mapping_file.type)
    .stack()
    # Drop the padding explicitly instead of relying on stack() doing it
    # implicitly: the `future_stack` implementation keeps missing values.
    .dropna()
    .reset_index()
)
split_df.columns = ['type', 'index', 'variation']
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the previous frame (avoids SettingWithCopyWarning).
split_df = split_df[['type', 'variation']].copy()
# Normalise both columns so later lookups are case- and whitespace-insensitive.
split_df['variation'] = split_df['variation'].str.strip().str.lower()
split_df['type'] = split_df['type'].str.strip().str.lower()

In [5]:
# Pair every distinct type with itself so the type name also counts as one of
# its own variations.
unique_types = split_df['type'].drop_duplicates()
original_type = pd.DataFrame({'type': unique_types, 'variation': unique_types})

In [6]:
# Combine the (type, variation) pairs with the (type, type) pairs, remove
# repeated pairs, renumber the rows, and strip whitespace from the variations.
final_df = (
    pd.concat([split_df, original_type])
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(variation=lambda frame: frame['variation'].str.strip())
)

In [7]:
# Collect the variations of each type into a list, in row order.
# A defaultdict is used so a type without any entry looks up as an empty list.
key_dict = defaultdict(list)

for type_name, variation in zip(final_df['type'], final_df['variation']):
    key_dict[type_name].append(variation)

#### 3.2.  Data related changes - Sherlock File

In [8]:
# Build the Sherlock frame and normalise its `type` column (lower-case, no
# surrounding whitespace), the same normalisation applied to the mapping types.
sherlock_data = pd.DataFrame(feats_json)
normalised_types = sherlock_data['type'].str.lower().str.strip()
sherlock_data['type'] = normalised_types

In [9]:
# Number of Sherlock rows per type, as a plain {type: count} dictionary.
type_frequencies = sherlock_data['type'].value_counts()
sherlock_count = type_frequencies.to_dict()

#### 4. Logic Mapping

In [10]:
# For every Sherlock type, build a list holding one variation label per row of
# that type. Each variation is repeated the same number of times, in blocks
# ([a, a, b, b, ...]); if the row count is not a multiple of the number of
# variations, the leading variations each receive one extra copy at the end.
sherlock_key_map = {}

for keys, values in sherlock_count.items():
    # .get() rather than key_dict[keys]: indexing a defaultdict would insert an
    # empty entry for every type that has no mapping.
    variations = key_dict.get(keys, [])
    if variations:
        # Integer arithmetic throughout. The previous version passed the float
        # `values / len(variations)` to np.repeat as the repeat count, which
        # numpy refuses to cast to an integer, and produced numpy.str_ items
        # mixed with plain str.
        repeats, remainder = divmod(int(values), len(variations))
        final_output = [label for label in variations for _ in range(repeats)]
        final_output += variations[:remainder]
    else:
        # No known variations: leave the list empty so the row-count
        # validation in the next section reports this type.
        final_output = []

    sherlock_key_map[keys] = final_output

#### 5. Validating the results

In [11]:
# Report every type whose generated label list does not contain exactly one
# label per Sherlock row of that type.
for key, value in sherlock_count.items():
    label_total = len(sherlock_key_map[key])
    if label_total != value:
        print('{} is not matched in row count'.format(key))

#### 6. Randomly shuffle the variations for each type

In [25]:
# Shuffle the label list of every type so the variations are spread over the
# rows instead of appearing in blocks.
# A dedicated, seeded generator makes the shuffle reproducible across kernel
# restarts without touching the global `random` state.
SHUFFLE_SEED = 42
shuffle_rng = random.Random(SHUFFLE_SEED)

type_feats_mapping = {}

for keys, values in sherlock_key_map.items():
    # Shuffle a copy: random.shuffle works in place, so shuffling `values`
    # directly would also reorder the lists stored in sherlock_key_map and
    # re-running this cell would reshuffle already-shuffled data.
    shuffled = list(values)
    shuffle_rng.shuffle(shuffled)
    type_feats_mapping[keys] = shuffled

#### 7. Mapping features of Sherlock data with values

In [31]:
# Tabulate the mapping for inspection: column 0 is the type, column 1 is its
# shuffled list of variation labels.
mapping_pairs = list(type_feats_mapping.items())
pd.DataFrame(mapping_pairs)

Unnamed: 0,0,1
0,jockey,"[jockey information, jockey name, jockey infor..."
1,plays,"[plays, plays, plays, plays, plays, plays, pla..."
2,album,"[music name, music collection, music collectio..."
3,club,"[group name, club name, league name, club, all..."
4,owner,"[co founder name, co founder, owner, owner, co..."
...,...,...
73,grades,"[class name, grade name, section name, class n..."
74,currency,"[currency name (country), currency type, curre..."
75,sex,"[gender type, gender code, gender name, gender..."
76,isbn,"[isbn, isbn identifier, isbn, isbn identifier,..."
