In [1]:
from pathlib import Path

import numpy as np
import pandas as pd; pd.set_option('display.max_columns', 100)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# csvをimportする

In [2]:
# Read ../data/train.csv (comma-separated, the pandas default) and preview the first rows.
train = pd.read_csv("../data/train.csv")
train.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [3]:
# Read ../data/test.csv (comma-separated, the pandas default) and preview the first rows.
test = pd.read_csv("../data/test.csv")
test.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [4]:
# Read ../data/training_extra.csv and preview the first rows.
# NOTE(review): extra_data is not referenced by the later cells visible here — confirm whether it is still needed.
extra_data = pd.read_csv("../data/training_extra.csv")
extra_data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,500000,Under Armour,Canvas,Small,10.0,Yes,Yes,Tote,Blue,23.882052,114.11068
1,500001,Puma,Polyester,Small,4.0,No,Yes,Backpack,Green,11.869095,129.74972
2,500002,Jansport,Polyester,Small,8.0,Yes,Yes,Tote,Red,8.092302,21.3737
3,500003,Nike,Nylon,Large,7.0,No,No,Messenger,Pink,7.719581,48.09209
4,500004,Nike,Leather,Large,9.0,No,Yes,Tote,Green,22.741826,77.32461


In [5]:
# Read ../data/sample_submission.csv and preview the first rows.
sample_submission = pd.read_csv("../data/sample_submission.csv")
sample_submission.head()

Unnamed: 0,id,Price
0,300000,81.411
1,300001,81.411
2,300002,81.411
3,300003,81.411
4,300004,81.411


# カテゴリデータを調べる

In [6]:
# Stack train on top of test into one frame with a fresh 0..n-1 index.
# test has no Price column, so Price is NaN for every test row after the concat.
df = pd.concat((train, test), axis=0, ignore_index=True)
df.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [7]:
# Sub-frame holding only the object / category dtype columns.
# Iterating over this DataFrame yields its column names.
cat_columns = df.select_dtypes(include=["object", "category"])

# Distinct non-missing values of each categorical column, computed once.
cat_values = {col: set(df[col].dropna()) for col in cat_columns}

# Print from the cached sets instead of rebuilding each set a second time.
for col, values in cat_values.items():
    print(f"{col}: {values}")

Brand: {'Nike', 'Jansport', 'Puma', 'Adidas', 'Under Armour'}
Material: {'Polyester', 'Canvas', 'Leather', 'Nylon'}
Size: {'Medium', 'Large', 'Small'}
Laptop Compartment: {'No', 'Yes'}
Waterproof: {'No', 'Yes'}
Style: {'Tote', 'Messenger', 'Backpack'}
Color: {'Red', 'Gray', 'Blue', 'Pink', 'Green', 'Black'}


# label encoding する

In [8]:
# Work on a copy so the raw combined frame `df` stays untouched.
df_encoded = df.copy()

# One LabelEncoder per categorical column, kept so the integer codes can be mapped back later.
label_encoders = {col: LabelEncoder() for col in cat_columns}

# Each encoder is fit on the combined train + test values of its column.
# NOTE(review): the NaN counts printed after this step are 0 for every encoded column,
# although the raw data has missing categorical values (e.g. an empty Brand in test),
# so missing values appear to receive their own integer label — confirm this is intended.
for col, encoder in label_encoders.items():
    df_encoded[col] = encoder.fit_transform(df[col])

df_encoded.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,1,1,1,7.0,1,0,2,0,11.611723,112.15875
1,1,1,0,2,10.0,1,1,1,3,27.078537,68.88056
2,2,4,1,2,2.0,1,0,1,5,16.64376,39.1732
3,3,2,2,2,8.0,1,0,1,3,12.93722,80.60793
4,4,0,0,1,1.0,1,1,1,3,17.749338,86.02312


# 欠損値を処理する

In [9]:
# Number of missing values in each column of the encoded frame.
nan_counts = df_encoded.isnull().sum(axis=0)
print(nan_counts)

id                           0
Brand                        0
Material                     0
Size                         0
Compartments                 0
Laptop Compartment           0
Waterproof                   0
Style                        0
Color                        0
Weight Capacity (kg)       215
Price                   200000
dtype: int64


In [10]:
# Fill missing 'Weight Capacity (kg)' values with the column mean.
weight_col = 'Weight Capacity (kg)'

# Rows with a known Price are training rows: test.csv has no Price column,
# so every test row has NaN Price after the concat.
is_train_row = df_encoded['Price'].notna()

# Learn the mean from training rows only, so test-set statistics do not leak into the fill value.
imputer = SimpleImputer(strategy='mean')
imputer.fit(df_encoded.loc[is_train_row, [weight_col]])

# transform returns an (n_rows, 1) array; flatten it before assigning to a single column.
df_encoded[weight_col] = imputer.transform(df_encoded[[weight_col]]).ravel()

# trainとtestに分離

In [11]:
# Split the processed frame back into its two parts using the id boundary:
# ids below 300000 go to train, ids from 300000 upward go to test.
# The test part keeps the Price column (all NaN, inherited from the concat).
is_train_id = df_encoded["id"].lt(300000)
is_test_id = df_encoded["id"].ge(300000)

train = df_encoded.loc[is_train_id]
test = df_encoded.loc[is_test_id]

# preprocess_resultsにcsvを出力

In [12]:
# Destination files for the preprocessed train / test frames.
path_train = "../preprocess_results/P1_train.csv"
path_test = "../preprocess_results/P1_test.csv"

# Create the output directory if it is missing, so to_csv does not fail
# at the very end of the notebook on a fresh checkout.
for out_path in (path_train, path_test):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: do not write the DataFrame index as an extra column.
train.to_csv(path_train, index=False)
test.to_csv(path_test, index=False)