# EDA and correction of inconsistent notation in the metadata
# メタデータに対するEDAと表記揺れの修正
- Fixing data inconsistencies / データの表記揺れを修正する
- Japanese labels are converted to English / 日本語表記は英語に直す
- Finally, the metadata correction is wrapped in a function so that it can be reused in other notebooks. / 最終的には、メタデータの修正を関数化して、他のNotebookでも再利用できるようにする。
- If you find any mistakes, please point them out. / 間違いなどがあればご指摘頂きたいです。

In [1]:
# Notebook setup: data handling, plotting and profiling libraries.
import pandas as pd
import os 
# Work from the competition input directory so the CSV files below can be
# opened by their bare file names.
os.chdir("/kaggle/input/hah-data-science-challenge/")
import seaborn as sns
# Apply seaborn's "darkgrid" style to all plots in this notebook.
sns.set_style("darkgrid")
import matplotlib.pyplot as plt

# Used for the ProfileReport cells further down.
import pandas_profiling

In [1]:
# Load the train and test metadata tables; the relative file names resolve
# against the current working directory.
df_train, df_test = (
    pd.read_csv(csv_name, index_col=False)
    for csv_name in ("train.csv", "test.csv")
)

# Show (rows, columns) of each table, train first.
print(df_train.shape, df_test.shape, sep="\n")

In [1]:
# Preview the first rows of the training metadata as loaded (before any correction).
df_train.head()

In [1]:
# Preview the first rows of the test metadata as loaded (before any correction).
df_test.head()

# First, run pandas-profiling on the provided data as is
# まずは配布データをそのままPandas_profileする
- The notation is inconsistent in the "bolt", "plate", "recording method", and "microphone distance" columns.
- "ねじ", "プレート", "録音方法", "マイク距離"に表記ゆれがある

In [1]:
# Generate a pandas-profiling report for the uncorrected training metadata.
pandas_profiling.ProfileReport(df_train)

In [1]:
# Generate a pandas-profiling report for the uncorrected test metadata.
pandas_profiling.ProfileReport(df_test)

# Correct the contents of each column and fix the notation errors
# 各カラムの内容を修正し、表記揺れを直す

In [1]:
# Metadata columns whose labels are converted (Target is not converted).
tgt_col = ["ねじ", 'プレート', '録音方法', 'マイク距離']

# For each column, list the distinct raw labels in train, in test, and in both
# together, so every spelling variant can be covered by the conversion dicts.
for col in tgt_col:
    train_component = sorted(df_train[col].unique())
    test_component = sorted(df_test[col].unique())

    # Despite the "and" in its name, this is the union of both label sets.
    and_component = sorted(set(train_component) | set(test_component))

    print(col)
    print("train_component : ", train_component)
    print("test_component : ", test_component)
    print("and_component", and_component)
    print("")

In [1]:
# Lookup tables that normalize the raw metadata labels.

# Bolt and plate sizes use the same two raw labels.
bolt_dict = dict([('大', "big"), ('小', "small")])
plate_dict = dict(bolt_dict)

# Recording method: spelling variants collapse onto one canonical name.
record_dict = {
    # 'PC内臓', 'PC内蔵' and '内蔵マイク' are all treated as the same device
    'PC内臓': "pc_built_in", 'PC内蔵': "pc_built_in", '内蔵マイク': "pc_built_in",
    # USB1-USB4 are only lower-cased
    'USB1': "usb1", 'USB2': "usb2", 'USB3': "usb3", 'USB4': "usb4",
    # both smartphone labels share one name
    'スマホ': "smart_phone", 'スマホのボイスレコーダ': "smart_phone",
}

# Microphone distance: every textual variant becomes a float in metres
# ('10cm' -> 0.1, '1M' -> 1.0), grouped here by resulting value.
distance_dict = {
    '5cm': 0.05,
    '8cm': 0.08,
    '10cm': 0.1, '10㎝': 0.1,
    '20cm': 0.2, '20㎝': 0.2,
    '30cm': 0.3, '30㎝': 0.3, '30cn': 0.3,  # '30cn' is mapped like '30cm'
    '40cm': 0.4, '40㎝': 0.4,
    '50cm': 0.5, '50㎝': 0.5,
    '1M': 1.0, '１Ｍ': 1.0,  # half-width and full-width spelling
    '2M': 2.0, '2m': 2.0,
    '3m': 3.0,
}

# Original (Japanese) column name -> lookup table used to convert that column.
cvt_dict = {}
cvt_dict["ねじ"] = bolt_dict
cvt_dict['プレート'] = plate_dict
cvt_dict['録音方法'] = record_dict
cvt_dict['マイク距離'] = distance_dict

In [1]:
import warnings

# Replace the raw labels in both data frames with their canonical values.
for col in tgt_col:  # Target is not converted
    for frame_name, frame in (("df_train", df_train), ("df_test", df_test)):
        converted = frame[col].map(cvt_dict[col])
        # Series.map yields NaN for any label missing from the dict. Report
        # those labels instead of letting them vanish silently; values that
        # were already missing in the input are not reported.
        unmapped = frame.loc[frame[col].notna() & converted.isna(), col].unique()
        if len(unmapped) > 0:
            warnings.warn(
                f"{frame_name}[{col!r}]: labels without a mapping were "
                f"replaced by NaN: {list(unmapped)}"
            )
        frame[col] = converted

In [1]:
# Rename the columns to English. The names are assigned by position, so the
# lists must follow the column order of the CSV files.
# df_train Japanese column names: ['ID', 'ねじ', 'プレート', '録音方法', 'マイク距離', 'ファイル', 'Target']
col_train = ['id', 'bolt', 'plate', 'record', 'mic_dist', 'file', 'target']
# df_test receives the same names minus the trailing 'target' (six columns).
col_test = col_train[:-1]

df_train.columns, df_test.columns = col_train, col_test

In [1]:
# Preview the training metadata after label conversion and column renaming.
df_train.head()

In [1]:
# Preview the test metadata after label conversion and column renaming.
df_test.head()

# Run pandas-profiling on the corrected data
# 修正したデータに対してPandas_profileする

In [1]:
# Generate a pandas-profiling report for the corrected training metadata.
pandas_profiling.ProfileReport(df_train)

In [1]:
# Generate a pandas-profiling report for the corrected test metadata.
pandas_profiling.ProfileReport(df_test)

# Compare the distribution of training and test data
# 学習データとテストデータの分布を比較する

In [1]:
# 2 x 4 grid: top row = train, bottom row = test; one column per metadata field.
fig, axs = plt.subplots(2, 4, figsize=(25, 12))

splits = (("train", df_train), ("test", df_test))

# Categorical fields and the fixed bar order used for both rows, so the
# train and test panels can be compared bar by bar.
category_order = {
    'bolt': ["big", "small"],
    'plate': ["big", "small"],
    'record': ["smart_phone", "usb4", "pc_built_in", "usb1", "usb2", "usb3"],
}

for grid_col, (field, bar_order) in enumerate(category_order.items()):
    for grid_row, (split_name, frame) in enumerate(splits):
        panel = axs[grid_row, grid_col]
        sns.countplot(x=field, order=bar_order, data=frame, ax=panel)
        panel.set_title(f"{split_name}_{field}")

# The microphone distance is numeric, so it is drawn as a histogram with the
# same x-range in both rows.
for grid_row, (split_name, frame) in enumerate(splits):
    panel = axs[grid_row, 3]
    sns.histplot(x='mic_dist', data=frame, ax=panel)
    panel.set_title(f"{split_name}_mic_dist")
    panel.set_xlim([-0.1, 3.2])

plt.tight_layout()

# Make it a function so that it can be reused in other code.
# 関数にして、他のコードで再利用できるようにする。

In [1]:
def meta_define(data_dir="/kaggle/input/hah-data-science-challenge/"):
    """Load the competition metadata and normalize its inconsistent labels.

    Reads ``train.csv`` and ``test.csv`` from ``data_dir``, replaces the raw
    labels of the four metadata columns with canonical values, and renames
    the columns to English.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``train.csv`` and ``test.csv``. Defaults to
        the Kaggle input directory of the competition, so ``meta_define()``
        behaves as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)`` with columns
        ``id, bolt, plate, record, mic_dist, file`` (plus ``target`` for
        train). ``bolt``/``plate`` are "big"/"small", ``record`` is a
        canonical method name, and ``mic_dist`` is a float in metres
        ('10cm' -> 0.1, '1M' -> 1.0).

    Warns
    -----
    UserWarning
        If a column contains a label that has no entry in its conversion
        dict. Such labels become NaN in the result (``Series.map``
        behaviour); the warning lists them so they are not lost silently.

    Notes
    -----
    The process working directory is changed to ``data_dir`` and is not
    restored. This side effect is kept from the original implementation
    because existing callers may rely on it.
    """
    import os
    import warnings

    import pandas as pd

    os.chdir(data_dir)
    df_train = pd.read_csv("train.csv", index_col=False)
    df_test = pd.read_csv("test.csv", index_col=False)

    ##################################################
    # Dictionaries and variables used for the conversion.

    # Bolt and plate sizes use the same two raw labels.
    size_dict = {
        '大': "big",
        '小': "small",
    }

    # Recording method: spelling variants collapse onto one canonical name.
    record_dict = {
        'PC内臓': "pc_built_in",
        'PC内蔵': "pc_built_in",
        'USB1': "usb1",
        'USB2': "usb2",
        'USB3': "usb3",
        'USB4': "usb4",
        'スマホ': "smart_phone",
        'スマホのボイスレコーダ': "smart_phone",
        '内蔵マイク': "pc_built_in",
    }

    # Microphone distance: every textual variant becomes a float in metres.
    distance_dict = {
        '10cm': 0.1,
        '10㎝': 0.1,
        '1M': 1.0,
        '20cm': 0.2,
        '20㎝': 0.2,
        '2M': 2.0,
        '2m': 2.0,
        '30cm': 0.3,
        '30cn': 0.3,
        '30㎝': 0.3,
        '3m': 3.0,
        '40cm': 0.4,
        '40㎝': 0.4,
        '50cm': 0.5,
        '50㎝': 0.5,
        '5cm': 0.05,
        '8cm': 0.08,
        '１Ｍ': 1.0,
    }

    # Original (Japanese) column name -> lookup table for that column.
    # Target is not converted.
    cvt_dict = {
        "ねじ": size_dict,
        'プレート': size_dict,
        '録音方法': record_dict,
        'マイク距離': distance_dict,
    }

    # English column names, assigned by position.
    # df_train Japanese column names: ['ID', 'ねじ', 'プレート', '録音方法', 'マイク距離', 'ファイル', 'Target']
    col_train = ['id', 'bolt', 'plate', 'record', 'mic_dist', 'file', 'target']
    # df_test has the same columns without 'Target' (six names are assigned).
    col_test = ['id', 'bolt', 'plate', 'record', 'mic_dist', 'file']
    ##################################################

    for col, mapping in cvt_dict.items():
        for frame_name, frame in (("train", df_train), ("test", df_test)):
            raw = frame[col]
            converted = raw.map(mapping)
            # Labels that were present in the file but have no dict entry;
            # values already missing in the input are not reported.
            unmapped = raw[raw.notna() & converted.isna()].unique()
            if len(unmapped) > 0:
                warnings.warn(
                    f"{frame_name}.csv column {col!r}: labels without a "
                    f"mapping were replaced by NaN: {list(unmapped)}",
                    stacklevel=2,
                )
            frame[col] = converted

    df_train.columns = col_train
    df_test.columns = col_test

    return df_train, df_test

In [1]:
# Operation check: call the reusable loader and rebind the notebook's frames
# to the data frames it returns.
df_train, df_test = meta_define()

In [1]:
# A notebook cell renders only its last expression, so the training preview
# must be printed explicitly; as a bare expression it was silently discarded.
print(df_train.head())
df_test.head()