In [1]:
import pandas as pd
from pathlib import Path

from geopy.geocoders import Nominatim
import numpy as np
from tqdm import tqdm_notebook as tqdm


In [2]:
DATA_DIR = Path("/home/knikaido/work/atma10/data/")

In [12]:
# data load
production_place = pd.read_csv(DATA_DIR / 'production_place.csv')
production_place.head()

Unnamed: 0,object_id,name
0,0012765f7a97ccc3e9e9,Netherlands
1,00133be3ff222c9b74b0,Amsterdam
2,00133be3ff222c9b74b0,Northern Netherlands
3,0017be8caa87206532cb,Amsterdam
4,001b2b8c9d3aa1534dfe,Suriname


### 地名を国名の粒度になおして，one_hot化して保存
https://www.guruguru.science/competitions/16/discussions/970ced6d-f974-4979-8f04-dbcf1c2f51a0/

In [13]:
def place2country(address):
    geolocator = Nominatim(user_agent='sample', timeout=200)
    loc = geolocator.geocode(address, language='en')
    coordinates = (loc.latitude, loc.longitude)
    location = geolocator.reverse(coordinates, language='en')
    country = location.raw['address']['country']
    return country

In [14]:
place_list = production_place['name'].unique()
country_dict = {}
for place in tqdm(place_list):
    try:
        country = place2country(place)
        country_dict[place] = country
    except:
        # 国名を取得できない場合はnan
        print(place)
        country_dict[place] = np.nan

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/144 [00:00<?, ?it/s]

Indonesian Archipelago


In [15]:
production_place['country_name'] = production_place['name'].map(country_dict)
production_place

Unnamed: 0,object_id,name,country_name
0,0012765f7a97ccc3e9e9,Netherlands,Netherlands
1,00133be3ff222c9b74b0,Amsterdam,Netherlands
2,00133be3ff222c9b74b0,Northern Netherlands,Netherlands
3,0017be8caa87206532cb,Amsterdam,Netherlands
4,001b2b8c9d3aa1534dfe,Suriname,Suriname
...,...,...,...
17681,fff08e76cbb969eaddc7,Northern Netherlands,Netherlands
17682,fff08e76cbb969eaddc7,Antwerp,Belgium
17683,fffbe07b997bec00e203,Great Britain,United Kingdom
17684,fffd43b134ba7197d890,London,United Kingdom


In [24]:
production_place['country_name'].unique()

array(['Netherlands', 'Suriname', 'Belgium', 'Germany', 'France',
       'United Kingdom', 'Italy', 'Japan', 'Indonesia', 'Norway',
       'United States', 'India', 'Iran', 'Denmark', 'Switzerland',
       'Austria', 'Spain', 'China', 'Poland', 'Sri Lanka', 'Russia', nan,
       'Greece', 'Canada'], dtype=object)

In [17]:
production_place[production_place['name']=='Indonesian Archipelago']['country_name'] = 'Indonesia'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
production_place.drop('name', axis=1, inplace=True)

In [19]:
production_place_country = production_place[~production_place.duplicated(subset=['object_id', 'country_name'])]
production_place_country.reset_index(drop=True, inplace=True)
production_place_country

Unnamed: 0,object_id,country_name
0,0012765f7a97ccc3e9e9,Netherlands
1,00133be3ff222c9b74b0,Netherlands
2,0017be8caa87206532cb,Netherlands
3,001b2b8c9d3aa1534dfe,Suriname
4,001c52ae28ec106d9cd5,Netherlands
...,...,...
16373,fff08e76cbb969eaddc7,Netherlands
16374,fff08e76cbb969eaddc7,Belgium
16375,fffbe07b997bec00e203,United Kingdom
16376,fffd43b134ba7197d890,United Kingdom


In [23]:
production_place_country['country_name'].unique()

array(['Netherlands', 'Suriname', 'Belgium', 'Germany', 'France',
       'United Kingdom', 'Italy', 'Japan', 'Indonesia', 'Norway',
       'United States', 'India', 'Iran', 'Denmark', 'Switzerland',
       'Austria', 'Spain', 'China', 'Poland', 'Sri Lanka', 'Russia', nan,
       'Greece', 'Canada'], dtype=object)

In [20]:
production_place_country.to_csv(DATA_DIR / 'production_place_country_raw.csv', index=False)

In [36]:
production_onehot_country = pd.get_dummies(production_place_country['country_name'])
production_onehot_country['object_id'] = production_place_country['object_id']
production_onehot_country

Unnamed: 0,Austria,Belgium,Canada,China,Denmark,France,Germany,Greece,India,Indonesia,...,Norway,Poland,Russia,Spain,Sri Lanka,Suriname,Switzerland,United Kingdom,United States,object_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0012765f7a97ccc3e9e9
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,00133be3ff222c9b74b0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0017be8caa87206532cb
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,001b2b8c9d3aa1534dfe
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,001c52ae28ec106d9cd5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16373,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fff08e76cbb969eaddc7
16374,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fff08e76cbb969eaddc7
16375,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,fffbe07b997bec00e203
16376,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,fffd43b134ba7197d890


In [47]:
production_onehot_country[production_onehot_country.duplicated(subset=['object_id'])]

Unnamed: 0,Austria,Belgium,Canada,China,Denmark,France,Germany,Greece,India,Indonesia,...,Norway,Poland,Russia,Spain,Sri Lanka,Suriname,Switzerland,United Kingdom,United States,object_id
13,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,003a628b7e381d284697
15,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,003b386ec21ca47dc206
34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,00846349f5eedc496cc4
37,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0087ebe7951c787c5da5
106,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,017f4fe7d662027d99d3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16314,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ff4822c3c95b121223b5
16343,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ffa9764e5816bd9c3d8e
16367,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ffeb384305262f3500f5
16370,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ffee34705ea44e1a0f79


↓これで重複行を足し算できる

In [48]:
production_onehot_country_ = production_onehot_country.groupby('object_id').apply(lambda x: x.sum()).drop('object_id',axis=1).reset_index()

In [50]:
production_onehot_country_.to_csv(DATA_DIR / 'production_place_country.csv', index=False)