In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
#必要なライブラリをインポート

In [3]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data.head()
#データの読み込み・確認

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
train_hp_dummies = pd.get_dummies(train_data['HomePlanet'], prefix='HomePlanet')
train_data = pd.concat([train_data, train_hp_dummies], axis=1)
#HomePlanetをダミー変数に変換
 
test_hp_dummies = pd.get_dummies(test_data['HomePlanet'], prefix='HomePlanet')
test_data = pd.concat([test_data, test_hp_dummies], axis=1)
#testデータも同様に変換


In [5]:
train_data[['Deck', 'Num', 'Side']] = train_data['Cabin'].str.split('/', expand=True)
#Cabinを３つの特徴量に変換

deck_dummies = pd.get_dummies(train_data['Deck'], prefix='Deck_Type')
train_data = pd.concat([train_data,deck_dummies], axis=1)
#Deckをダミー変数に変換

train_data['Side'] = train_data['Side'].map({'P': 0, 'S': 1})
#Sideをint(0,1)に変換


test_data[['Deck', 'Num', 'Side']] = test_data['Cabin'].str.split('/', expand=True)

tdeck_dummies = pd.get_dummies(test_data['Deck'], prefix='Deck_Type')
test_data = pd.concat([test_data,tdeck_dummies], axis=1)

test_data['Side'] = test_data['Side'].map({'P': 0, 'S': 1})
#testデータも同様に調整

In [6]:
train_data['CryoSleep'] = train_data['CryoSleep'].astype(bool)
train_data['VIP'] = train_data['VIP'].astype(bool)
#CryoSleepとVIPはbooleanに変換


test_data['CryoSleep'] = test_data['CryoSleep'].astype(bool).astype(int)
test_data['VIP'] = test_data['VIP'].astype(bool)
#testデータも同様に調整

In [7]:
train_data.head()
#一度、それぞれのデータを確認

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Num,Side,Deck_Type_A,Deck_Type_B,Deck_Type_C,Deck_Type_D,Deck_Type_E,Deck_Type_F,Deck_Type_G,Deck_Type_T
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0,0.0,False,True,False,False,False,False,False,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,0,1.0,False,False,False,False,False,True,False,False
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,0,1.0,True,False,False,False,False,False,False,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,0,1.0,True,False,False,False,False,False,False,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,1,1.0,False,False,False,False,False,True,False,False


In [8]:
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Num,Side,Deck_Type_A,Deck_Type_B,Deck_Type_C,Deck_Type_D,Deck_Type_E,Deck_Type_F,Deck_Type_G,Deck_Type_T
0,0013_01,Earth,1,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,...,3,1.0,False,False,False,False,False,False,True,False
1,0018_01,Earth,0,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,...,4,1.0,False,False,False,False,False,True,False,False
2,0019_01,Europa,1,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,...,0,1.0,False,False,True,False,False,False,False,False
3,0021_01,Europa,0,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,...,1,1.0,False,False,True,False,False,False,False,False
4,0023_01,Earth,0,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,...,5,1.0,False,False,False,False,False,True,False,False


In [9]:
y_train_val = train_data['Transported']
y_train_val.head()
#ターゲットにTransportedを設定


0    False
1     True
2    False
3    False
4     True
Name: Transported, dtype: bool

In [10]:
x_train_val= train_data.drop(['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name', 'Transported','Deck', 'Deck_Type_T'], axis=1)
#必要のない特徴量を落とし、x_train_valを設定

x_test= test_data.drop(['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name','Deck', 'Deck_Type_T'], axis=1)
#testデータも同様

In [11]:
x_train_val['Num'] = x_train_val['Num'].fillna(0).astype(int)
x_train_val['Num_Group'] = x_train_val['Num'] // 100
x_train_val = x_train_val.drop(['Num'], axis =1)
#Numは数字の範囲が大きいので、１００ごとのグループで表示

x_test['Num'] = x_test['Num'].fillna(0).astype(int)
x_test['Num_Group'] = x_test['Num'] // 100
x_test = x_test.drop(['Num'], axis =1)
#testデータも同様

In [12]:
x_train_val.head()
#調整が終わったので、最終的なx_trainを確認する

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Side,Deck_Type_A,Deck_Type_B,Deck_Type_C,Deck_Type_D,Deck_Type_E,Deck_Type_F,Deck_Type_G,Num_Group
0,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False,True,False,0.0,False,True,False,False,False,False,False,0
1,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True,False,False,1.0,False,False,False,False,False,True,False,0
2,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,True,False,1.0,True,False,False,False,False,False,False,0
3,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,True,False,1.0,True,False,False,False,False,False,False,0
4,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True,False,False,1.0,False,False,False,False,False,True,False,0


In [13]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.3, random_state=1)
#trainとvalにそれぞれ分ける

In [14]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
#lightGBMをインポート

In [15]:
lgb_model = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, random_state=42)
#任意のハイパーパラメータで、モデルを定義

In [16]:
lgb_model.fit(x_train, y_train)
#モデルを適用

[LightGBM] [Info] Number of positive: 3061, number of negative: 3024
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000852 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1399
[LightGBM] [Info] Number of data points in the train set: 6085, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503040 -> initscore=0.012161
[LightGBM] [Info] Start training from score 0.012161


In [17]:
y_pred = lgb_model.predict(x_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
#validationデータでのスコアを確認

Validation Accuracy: 0.8059815950920245


In [18]:
from sklearn.model_selection import GridSearchCV
#GridSearchをインポート

In [19]:
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}
#param_gridを定義

In [20]:
grid = GridSearchCV(lgb.LGBMClassifier(random_state=42), param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid.fit(x_train, y_train)
#GridSearchを実行

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Number of positive: 2041, number of negative: 2016
[LightGBM] [Info] Number of positive: 2040, number of negative: 2016
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001317 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1396
[LightGBM] [Info] Number of data points in the train set: 4057, number of used features: 20
[LightGBM] [Info] Total Bins 1398
[LightGBM] [Info] Number of data points in the train set: 4056, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503081 -> initscore=0.012325
[LightGBM] [Info] 

In [21]:
print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)
#一番スコアが良いパラメータを確認

Best parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
Best score: 0.8032863550833752


In [22]:
best_lgb = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=42)
best_lgb.fit(x_train, y_train)
#そのパラメータでmodelを定義・適用

[LightGBM] [Info] Number of positive: 3061, number of negative: 3024
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1399
[LightGBM] [Info] Number of data points in the train set: 6085, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503040 -> initscore=0.012161
[LightGBM] [Info] Start training from score 0.012161


In [23]:
predictions = best_lgb.predict(x_test)
#x_testから結果を予測

In [24]:
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': predictions
})
submission.head()
#submissionとして予測のデータを作成

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [25]:
submission.to_csv('submission_lgb.csv', index=False)
#提出用のcsvファイルを作成