In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

from gensim.models import word2vec, KeyedVectors
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [2]:
# Directories holding the competition CSVs and the locally derived extracts.
DATA_DIR = Path('../../data')
EXTRACT_DIR = Path('../../extract_data')

sub = pd.read_csv(DATA_DIR / 'atmacup10__sample_submission.csv')
color = pd.read_csv(DATA_DIR / 'color.csv')
historical_person = pd.read_csv(DATA_DIR / 'historical_person.csv')
maker = pd.read_csv(DATA_DIR / 'maker.csv')
material = pd.read_csv(DATA_DIR / 'material.csv')
collection = pd.read_csv(DATA_DIR / 'object_collection.csv')
palette = pd.read_csv(DATA_DIR / 'palette.csv')
principal_maker_occupation = pd.read_csv(DATA_DIR / 'principal_maker_occupation.csv')
principal_maker = pd.read_csv(DATA_DIR / 'principal_maker.csv')

# The raw data/production_place.csv is not loaded; the renamed extract is used instead.
# Read the extract once and drop its first column (presumably a saved index - TODO confirm),
# then derive both views from the same frame instead of parsing the file twice.
production_place_renamed = pd.read_csv(EXTRACT_DIR / 'production_place_rename.csv').iloc[:, 1:]
production = production_place_renamed[['object_id', 'name']]
# Expose the country under the shared column name 'name' so it can be stacked
# with the other (object_id, name) tables.
production_country = production_place_renamed[['object_id', 'country_name']].rename(columns={'country_name': 'name'})

technique = pd.read_csv(DATA_DIR / 'technique.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
train = pd.read_csv(DATA_DIR / 'train.csv')

In [8]:
import yaml

# Load the material-name mapping used to normalise `material['name']`.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open('material.yml', encoding='utf-8') as file:
    material_dict = yaml.safe_load(file)

In [30]:
# Display the first two rows of the material table.
material.head(2)

Unnamed: 0,object_id,name
0,000405d9a5e3f49fc49d,paper
1,000405d9a5e3f49fc49d,board


In [29]:
def change_material(value, mapping=None):
    """Map a raw material name to the key of the group that lists it.

    Parameters
    ----------
    value : object
        Raw material name to look up.
    mapping : dict, optional
        Dictionary of ``group key -> collection of raw names``. Defaults to
        the notebook-level ``material_dict``.

    Returns
    -------
    object
        The first group key whose collection contains ``value``. If no group
        lists it, ``value`` is returned unchanged rather than ``None``, so
        unmapped names are kept and applying the function twice is harmless.
    """
    if mapping is None:
        mapping = material_dict
    for canonical_name, aliases in mapping.items():
        # A key with an empty value loads from YAML as None; skip it.
        if aliases and value in aliases:
            return canonical_name
    return value

# Normalise every material name through change_material.
# NOTE(review): this overwrites material['name'] in place, so re-running the
# cell feeds already-normalised names back through the mapping.
material['name'] = material['name'].apply(change_material)

In [31]:
# Display the first rows of the material table.
material.head()

Unnamed: 0,object_id,name
0,000405d9a5e3f49fc49d,paper
1,000405d9a5e3f49fc49d,board
2,001020bd00b149970f78,oil
3,001020bd00b149970f78,panel
4,0011d6be41612ec9eae3,oil


In [5]:
# Display the first rows of the (object_id, country) table.
production_country.head()

Unnamed: 0,object_id,name
0,0012765f7a97ccc3e9e9,Netherlands
1,00133be3ff222c9b74b0,Netherlands
2,00133be3ff222c9b74b0,Netherlands
3,0017be8caa87206532cb,Netherlands
4,001b2b8c9d3aa1534dfe,Suriname


In [32]:
# Replace missing country names with the literal token 'missing'.
production_country['name'] = production_country['name'].fillna('missing')

In [33]:
# Preview: collect the material names of each object_id into one list per object.
material.groupby("object_id")["name"].apply(list)

object_id
000405d9a5e3f49fc49d    [paper, board]
001020bd00b149970f78      [oil, panel]
0011d6be41612ec9eae3     [oil, canvas]
0012765f7a97ccc3e9e9           [paper]
00133be3ff222c9b74b0           [paper]
                             ...      
fff1d87d79953ddab2c6      [oil, panel]
fff4bbb55fd7702d294e           [paper]
fffbe07b997bec00e203    [paper, board]
fffd43b134ba7197d890           [paper]
ffff22ea12d7f99cff31    [paper, paper]
Name: name, Length: 23586, dtype: object

In [34]:
def _stack_tables(*tables):
    """Stack the given tables row-wise and renumber the rows from 0."""
    return pd.concat(list(tables), axis=0).reset_index(drop=True)


# Pairwise and three-way combinations of the material / collection / technique tables.
mat_col = _stack_tables(material, collection)
mat_tec = _stack_tables(material, technique)
col_tec = _stack_tables(collection, technique)
mat_col_tec = _stack_tables(material, collection, technique)

# The three-way combination extended with the production place or production country.
mat_col_tec_prod = _stack_tables(material, collection, technique, production)
mat_col_tec_prod_country = _stack_tables(material, collection, technique, production_country)

In [35]:
# Preview: one list of names per object_id for the largest combined table.
mat_col_tec_prod_country.groupby("object_id")["name"].apply(list)

object_id
000405d9a5e3f49fc49d                        [paper, board, albumen print]
001020bd00b149970f78                              [oil, panel, paintings]
0011d6be41612ec9eae3                             [oil, canvas, paintings]
0012765f7a97ccc3e9e9    [paper, salted paper print, albumen print, Net...
00133be3ff222c9b74b0    [paper, prints, etching, Netherlands, Netherla...
                                              ...                        
fff4bbb55fd7702d294e                               [paper, albumen print]
fffbe07b997bec00e203        [paper, board, albumen print, United Kingdom]
fffd1675758205748d7f                                      [albumen print]
fffd43b134ba7197d890               [paper, albumen print, United Kingdom]
ffff22ea12d7f99cff31        [paper, paper, albumen print, United Kingdom]
Name: name, Length: 23972, dtype: object

In [36]:
# Dimensionality of the word-vector representation for each table / table combination.
# Chosen roughly, based on the size of the original vocabulary.
model_size = {
    "material": 5,
    "collection": 3,
    "technique": 8,
    "production" : 20,
    "production_country" : 20,
    "material_collection": 10,
    "material_technique": 20,
    "collection_technique": 10,
    "material_collection_technique": 20,
    "material_collection_technique_production" : 25,
    "material_collection_technique_production_country" : 25,
}

# Value passed as Word2Vec's `iter` argument (number of training passes).
n_iter = 100

In [37]:
# Seed for Word2Vec's random number generator (1 is gensim's default, made explicit).
W2V_SEED = 1

# (feature name, table) pairs. Keeping each name next to its table prevents the
# two from drifting apart; the order matters because the export cells index
# w2v_dfs by position.
w2v_sources = [
    ("material", material),
    ("collection", collection),
    ("technique", technique),
    ("production", production),
    ("production_country", production_country),
    ("material_collection", mat_col),
    ("material_technique", mat_tec),
    ("collection_technique", col_tec),
    ("material_collection_technique", mat_col_tec),
    ("material_collection_technique_production", mat_col_tec_prod),
    ("material_collection_technique_production_country", mat_col_tec_prod_country),
]

w2v_dfs = []
for df_name, df in w2v_sources:
    # One "sentence" per object: the list of names attached to that object_id.
    df_group = df.groupby("object_id")["name"].apply(list).reset_index()
    # Train Word2Vec.
    # workers=1 together with a fixed seed keeps training deterministic; with
    # several worker threads, scheduling order makes the vectors differ per run.
    # NOTE(review): `size` / `iter` are the gensim 3.x argument names
    # (renamed `vector_size` / `epochs` in gensim 4).
    w2v_model = word2vec.Word2Vec(df_group["name"].values.tolist(),
                                  size=model_size[df_name],
                                  min_count=1,
                                  window=1,
                                  iter=n_iter,
                                  seed=W2V_SEED,
                                  workers=1)

    # Turn each word of a sentence into its vector and average them to get
    # one vector per sentence (i.e. per object).
    sentence_vectors = df_group["name"].progress_apply(
        lambda x: np.mean([w2v_model.wv[e] for e in x], axis=0))
    sentence_vectors = np.vstack(sentence_vectors.tolist())
    sentence_vector_df = pd.DataFrame(sentence_vectors,
                                      columns=[f"{df_name}_w2v_{i}"
                                               for i in range(model_size[df_name])])
    # Index the embedding rows by object_id.
    sentence_vector_df.index = df_group["object_id"]
    w2v_dfs.append(sentence_vector_df)

100%|██████████| 23586/23586 [00:00<00:00, 53900.11it/s]
100%|██████████| 14160/14160 [00:00<00:00, 58270.91it/s]
100%|██████████| 17329/17329 [00:00<00:00, 55175.77it/s]
100%|██████████| 15547/15547 [00:00<00:00, 57156.42it/s]
100%|██████████| 15547/15547 [00:00<00:00, 54446.97it/s]
100%|██████████| 23597/23597 [00:00<00:00, 46030.07it/s]
100%|██████████| 23950/23950 [00:00<00:00, 49676.52it/s]
100%|██████████| 21646/21646 [00:00<00:00, 51693.41it/s]
100%|██████████| 23953/23953 [00:00<00:00, 48315.81it/s]
100%|██████████| 23972/23972 [00:00<00:00, 51507.30it/s]
100%|██████████| 23972/23972 [00:00<00:00, 50266.62it/s]


In [38]:
# Number of embedding frames collected (one is appended per trained table).
len(w2v_dfs)

11

In [40]:
# Index 0 is the "material" embedding (first entry in the training order).
w2v_dfs[0].to_csv('../material_w2v_fix_material.csv')

In [42]:
#w2v_dfs[1].to_csv('../collection_w2v.csv')

In [44]:
#w2v_dfs[2].to_csv('../technique_w2v.csv')

In [46]:
#w2v_dfs[3].to_csv('../prodcution_w2v.csv')

In [48]:
#w2v_dfs[4].to_csv('../production_country_w2v.csv')

In [50]:
# Index 5 is the "material_collection" embedding.
w2v_dfs[5].to_csv('../material_collection_w2v_fix_material.csv')

In [52]:
# Index 6 is the "material_technique" embedding.
w2v_dfs[6].to_csv('../material_technique_w2v_fix_material.csv')

In [54]:
#w2v_dfs[7].to_csv('../collection_technique_w2v.csv')

In [56]:
# Index 8 is the "material_collection_technique" embedding.
w2v_dfs[8].to_csv('../material_collection_technique_w2v_fix_material.csv')

In [58]:
# Index 9 is the "material_collection_technique_production" embedding.
w2v_dfs[9].to_csv('../material_collection_technique_production_w2v_fix_material.csv')

In [60]:
# Index 10 is the "material_collection_technique_production_country" embedding.
w2v_dfs[10].to_csv('../material_collection_technique_production_country_w2v_fix_material.csv')