# Split and Merge Translation files

Workflow: https://docs.google.com/document/d/11d5D5CPlr6I2BqP8z0p2o_dYQC9AYUyxcLGUq1WYCfk/edit#

In [1]:
import pandas as pd
import json
import os

In [2]:
# Relative path to the dataset's datapackage.json.
# The following cells inspect its contents with jq before it is loaded in Python.

datapackagejson = '../../datapackage.json'

In [3]:
!jq 'keys' $datapackagejson

[1;39m[
  [0;32m"author"[0m[1;39m,
  [0;32m"description"[0m[1;39m,
  [0;32m"language"[0m[1;39m,
  [0;32m"license"[0m[1;39m,
  [0;32m"name"[0m[1;39m,
  [0;32m"resources"[0m[1;39m,
  [0;32m"title"[0m[1;39m,
  [0;32m"translations"[0m[1;39m,
  [0;32m"version"[0m[1;39m
[1;39m][0m


In [4]:
!jq '.translations' $datapackagejson

[1;39m{
  [0m[34;1m"id"[0m[1;39m: [0m[0;32m"nl-nl"[0m[1;39m,
  [0m[34;1m"name"[0m[1;39m: [0m[0;32m"Nederlands (Nederland)"[0m[1;39m
[1;39m}[0m


In [5]:
!jq '.resources[1]' $datapackagejson

[1;39m{
  [0m[34;1m"path"[0m[1;39m: [0m[0;32m"ddf--datapoints--company_size--by--company--anno.csv"[0m[1;39m,
  [0m[34;1m"name"[0m[1;39m: [0m[0;32m"ddf--datapoints--company_size--by--company--anno"[0m[1;39m,
  [0m[34;1m"schema"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"fields"[0m[1;39m: [0m[1;39m[
      [1;39m{
        [0m[34;1m"name"[0m[1;39m: [0m[0;32m"company"[0m[1;39m
      [1;39m}[0m[1;39m,
      [1;39m{
        [0m[34;1m"name"[0m[1;39m: [0m[0;32m"anno"[0m[1;39m
      [1;39m}[0m[1;39m,
      [1;39m{
        [0m[34;1m"name"[0m[1;39m: [0m[0;32m"company_size"[0m[1;39m
      [1;39m}[0m[1;39m
    [1;39m][0m[1;39m,
    [0m[34;1m"primaryKey"[0m[1;39m: [0m[1;39m[
      [0;32m"company"[0m[1;39m,
      [0;32m"anno"[0m[1;39m
    [1;39m][0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m


In [7]:
# Load the package descriptor into a dict.
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of relying on the platform's default text encoding.
with open(datapackagejson, encoding='utf-8') as f:
    datapackage = json.load(f)

# split

Let's say we are going to create a `zh_cn` folder

In [16]:
# Name of the translation folder that is created under ../../lang/
lang = 'zh_cn'

In [17]:
# Create the language folder. exist_ok=True keeps the cell re-runnable:
# without it a second run raises FileExistsError.
os.makedirs(os.path.join('../../lang', lang), exist_ok=True)

In [9]:
# Load the concepts table; its concept_type column is used later to pick
# the columns whose concept type is 'string'.
concepts = pd.read_csv('../../ddf--concepts.csv')

In [10]:
# Index the table by concept id so a concept's type can be looked up by name.
# The guard makes the cell idempotent: once 'concept' is the index it is no
# longer a column, and calling set_index('concept') again would raise KeyError.
if concepts.index.name != 'concept':
    concepts = concepts.set_index('concept')

In [11]:
concepts.head()

Unnamed: 0_level_0,concept_type,domain
concept,Unnamed: 1_level_1,Unnamed: 2_level_1
company,entity_domain,
english_speaking_company,entity_set,company
foundation,entity_set,company
name,string,
anno,time,year


In [29]:
# Split: for every resource listed in datapackage.json, write one CSV per
# translatable column to <basepath>/<resource path>/<column>.csv.
# A column is translatable when ddf--concepts declares its concept_type as 'string'.

basepath = os.path.join('../../lang', lang)

# columns don't need to be translated
# if a concept is not in ddf--concept, you should mask it here.
excluded_cols = ['concept', 'concept_type', 'domain']

for res in datapackage['resources']:
    path = res['path']
    key = res['schema']['primaryKey']
    # primaryKey may be a single column name or a list of names.
    key_cols = [key] if isinstance(key, str) else list(key)

    # Read every cell as text and disable NA detection so values are copied
    # verbatim: type inference would otherwise turn e.g. "NA" into a missing
    # value, "01" into 1, or integer keys with gaps into floats.
    df = pd.read_csv(os.path.join('../../', path), dtype=str, keep_default_na=False)

    for c in df.columns:
        if c in excluded_cols:
            continue
        if c.startswith('is--'):  # it will be boolean, skip
            continue

        # Only the concept lookup is allowed to miss; keeping the try block
        # this narrow stops unrelated KeyErrors (e.g. a primary-key column
        # absent from the file) from being reported as "concept not found".
        try:
            concept_type = concepts.loc[c, 'concept_type']
        except KeyError:
            print('concept not found: ' + c)
            continue

        # Key columns become the index of every split file, so they cannot
        # also be selected as the value column.
        if concept_type == 'string' and c not in key_cols:
            os.makedirs(os.path.join(basepath, path), exist_ok=True)
            df.set_index(key)[[c]].to_csv(os.path.join(basepath, path, '{}.csv'.format(c)))

concept not found: full_name
concept not found: full_name
concept not found: size


In [30]:
!tree ../../lang/zh_cn/

../../lang/zh_cn/
├── ddf--datapoints--company_size_string--by--company--anno.csv
│   └── company_size_string.csv
├── ddf--entities--company--english_speaking.csv
│   └── name.csv
├── ddf--entities--company.csv
│   ├── country.csv
│   └── name.csv
└── ddf--entities--region.csv
    └── name.csv

4 directories, 5 files


After we finish translating the files, we should merge the split files back into one file per resource.

In [31]:
# Show every (dirpath, dirnames, filenames) triple found under the language folder.
list(os.walk(basepath))

[('../../lang/zh_cn',
  ['ddf--datapoints--company_size_string--by--company--anno.csv',
   'ddf--entities--company--english_speaking.csv',
   'ddf--entities--company.csv',
   'ddf--entities--region.csv'],
  []),
 ('../../lang/zh_cn/ddf--datapoints--company_size_string--by--company--anno.csv',
  [],
  ['company_size_string.csv']),
 ('../../lang/zh_cn/ddf--entities--company--english_speaking.csv',
  [],
  ['name.csv']),
 ('../../lang/zh_cn/ddf--entities--company.csv',
  [],
  ['country.csv', 'name.csv']),
 ('../../lang/zh_cn/ddf--entities--region.csv', [], ['name.csv'])]

In [39]:
import shutil

In [40]:
# Merge: every folder directly under basepath holds the per-column files of one
# resource. Join them side by side on the resource's primary key and replace
# the folder with a single CSV file of the same name.

# Primary key of each resource, looked up by the resource path (= folder name).
primary_keys = {res['path']: res['schema']['primaryKey'] for res in datapackage['resources']}

for resource_name in next(os.walk(basepath))[1]:
    dir_path = os.path.join(basepath, resource_name)

    # os.listdir returns names in arbitrary order; sort for a stable column
    # order and ignore anything that is not a split CSV file.
    csv_names = sorted(x for x in os.listdir(dir_path) if x.endswith('.csv'))
    if not csv_names:
        # Nothing to merge; leave the folder untouched (pd.concat would raise
        # on an empty list).
        continue

    key = primary_keys.get(resource_name)
    to_concat = []
    for x in csv_names:
        # Read as text with NA detection off so translated values are kept verbatim.
        part = pd.read_csv(os.path.join(dir_path, x), dtype=str, keep_default_na=False)
        # Index on the full primary key so multi-key resources (datapoints)
        # align row by row; fall back to the first column when the folder is
        # not listed in datapackage.json.
        index_cols = list(part.columns[:1]) if key is None else key
        to_concat.append(part.set_index(index_cols))

    df = pd.concat(to_concat, axis=1)

    # Write the merged file next to the folder first, and only then remove the
    # folder and move the file into place, so a failed write cannot destroy
    # the translated files.
    tmp_path = dir_path + '.tmp'
    df.to_csv(tmp_path)
    shutil.rmtree(dir_path)
    os.replace(tmp_path, dir_path)