In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv


In [26]:
import matplotlib.pyplot as plt
import seaborn as sns

# 不要と思われる警告を消す
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Machine-learning libraries
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [27]:
# Locations of the competition CSV files inside the Kaggle input directory.
train_csv_path = '/kaggle/input/playground-series-s5e6/train.csv'
test_csv_path = '/kaggle/input/playground-series-s5e6/test.csv'

full_train_data = pd.read_csv(train_csv_path)  # training data
full_test_data = pd.read_csv(test_csv_path)    # test data

In [28]:
# Target is the fertilizer label; features are every column except the row id and the target.
target_column = 'Fertilizer Name'
train_y = full_train_data[target_column]
train_X = full_train_data.drop(columns=['id', target_column])

In [29]:
# Preprocessing objects. They are fitted here on the training data and reused
# further down to transform the test features and to decode predicted class indices.
standard_scaler = StandardScaler()
ordinal_encoder = OrdinalEncoder()
label_encoder = LabelEncoder()

# NOTE(review): 'Temparature' is presumably the dataset's own column spelling —
# confirm against the CSV header before "correcting" it.
num_columns = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous'] # numeric columns
cat_columns = ['Soil Type', 'Crop Type'] # string (categorical) columns

# Fit on the untouched columns of full_train_data instead of on train_X / train_y.
# This cell overwrites train_X / train_y, so fitting on them would re-fit the
# transformers on already-encoded values whenever the cell is executed a second
# time (breaking the later test-set transform and label decoding). On a first
# run the result is identical because train_X was derived from full_train_data.
train_X[num_columns] = standard_scaler.fit_transform(full_train_data[num_columns])

train_X[cat_columns] = ordinal_encoder.fit_transform(full_train_data[cat_columns])

train_y = label_encoder.fit_transform(full_train_data['Fertilizer Name'])


In [30]:
# Baseline multi-class XGBoost classifier that outputs per-class probabilities.
model = XGBClassifier(
    learning_rate=0.3,
    max_depth=6,
    min_child_weight=1,
    subsample=1.0,
    colsample_bytree=1.0,

    # Number of boosting rounds. No early stopping (early_stopping_rounds together
    # with an eval_set) is configured in this notebook, so every round is trained;
    # raising this value lengthens training accordingly.
    n_estimators=100,
    objective='multi:softprob',
    # One class per distinct fertilizer label seen by the fitted label encoder.
    num_class=len(label_encoder.classes_),
    eval_metric='mlogloss',
    # `use_label_encoder` was dropped: it is a deprecated/removed XGBoost argument
    # and the labels are already integer-encoded by LabelEncoder.
    random_state=42,
)

In [31]:
# Train the classifier on the preprocessed training features and the encoded labels.
model.fit(train_X, train_y, verbose=True)

In [32]:
# Build the test feature frame and apply the transformers that were fitted on the training data.
test_X = full_test_data.drop(columns=["id"])
scaled_numeric = standard_scaler.transform(test_X[num_columns])
encoded_categorical = ordinal_encoder.transform(test_X[cat_columns])
test_X[num_columns] = scaled_numeric
test_X[cat_columns] = encoded_categorical

In [33]:
# Class-probability predictions for the test rows (one column per encoded class).
pred = model.predict_proba(X=test_X)

In [34]:
# Show the predicted probabilities as a table: one row per test id, one column per fertilizer label.
pd.DataFrame(data=pred, columns=label_encoder.classes_, index=full_test_data['id'])

Unnamed: 0_level_0,10-26-26,14-35-14,17-17-17,20-20,28-28,DAP,Urea
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
750000,0.141774,0.123666,0.135087,0.139252,0.155025,0.169942,0.135254
750001,0.143144,0.134417,0.197400,0.187870,0.130958,0.092323,0.113888
750002,0.171544,0.175387,0.125457,0.146960,0.147800,0.117299,0.115553
750003,0.126287,0.220434,0.156781,0.111295,0.095406,0.157714,0.132084
750004,0.172640,0.107475,0.121617,0.235396,0.154713,0.116526,0.091631
...,...,...,...,...,...,...,...
999995,0.134148,0.156006,0.170036,0.151203,0.161116,0.112277,0.115214
999996,0.175874,0.194336,0.139462,0.142197,0.134673,0.092084,0.121374
999997,0.121122,0.189902,0.138220,0.129592,0.115635,0.174349,0.131180
999998,0.150508,0.138858,0.174179,0.124341,0.154145,0.116678,0.141292


In [35]:
# Rank the class indices of every row by ascending probability once, then reuse
# the ranking: the last column is the most likely class, the one before it the
# second most likely, and so on. (The ranking was previously recomputed per pick.)
ranked_classes = np.argsort(pred)
first = label_encoder.inverse_transform(ranked_classes[:, -1])   # most probable label
second = label_encoder.inverse_transform(ranked_classes[:, -2])  # second most probable label
third = label_encoder.inverse_transform(ranked_classes[:, -3])   # third most probable label

In [36]:
# Load the sample submission, which serves as the template for the output file.
sample_submission_path = '/kaggle/input/playground-series-s5e6/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission  # submission file template

Unnamed: 0,id,Fertilizer Name
0,750000,14-35-14 10-26-26 Urea
1,750001,14-35-14 10-26-26 Urea
2,750002,14-35-14 10-26-26 Urea
3,750003,14-35-14 10-26-26 Urea
4,750004,14-35-14 10-26-26 Urea
...,...,...
249995,999995,14-35-14 10-26-26 Urea
249996,999996,14-35-14 10-26-26 Urea
249997,999997,14-35-14 10-26-26 Urea
249998,999998,14-35-14 10-26-26 Urea


In [37]:
# The predictions are written into the template by position, so the template rows
# must be in exactly the same id order as the test rows the predictions came from.
assert np.array_equal(submission['id'].to_numpy(), full_test_data['id'].to_numpy()), \
    "sample_submission ids are not aligned with test ids"

# Each row lists the three most probable fertilizers, best first, separated by spaces.
submission['Fertilizer Name'] = first + ' ' + second + ' ' + third

# NOTE(review): 'beseline' looks like a typo of 'baseline'; the name is kept as-is
# because renaming would change the output artifact this notebook produces.
submission.to_csv('beseline_submission.csv', index=False)
submission

Unnamed: 0,id,Fertilizer Name
0,750000,DAP 28-28 10-26-26
1,750001,17-17-17 20-20 10-26-26
2,750002,14-35-14 10-26-26 28-28
3,750003,14-35-14 DAP 17-17-17
4,750004,20-20 10-26-26 28-28
...,...,...
249995,999995,17-17-17 28-28 14-35-14
249996,999996,14-35-14 10-26-26 20-20
249997,999997,14-35-14 DAP 17-17-17
249998,999998,17-17-17 28-28 10-26-26
