In [None]:
#匯入套件
import numpy as np # 線性運算
import pandas as pd # 資料處理、資料的容器

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the training and test sets into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/spaceship-titanic/train.csv",
        "../input/spaceship-titanic/test.csv",
    )
)

In [None]:
# Show the first 20 rows of the training data.
train.head(n=20)

In [None]:
#匯入畫圖套件(將資料視覺化)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Count plots of HomePlanet, Destination and VIP, each split by Transported.
for feature in ("HomePlanet", "Destination", "VIP"):
    sns.catplot(data=train, x=feature, hue='Transported', kind='count')
# Count plot of the target itself; kept as the cell's last expression.
sns.catplot(data=train, x="Transported", kind='count')


In [None]:
# Split Cabin on "/" into three new columns: Dec, Num and Side.
cabin_parts = train['Cabin'].str.split('/', expand=True)
train[['Dec', 'Num', 'Side']] = cabin_parts
# Split PassengerId on "_" into two new columns: Group_No and People.
id_parts = train['PassengerId'].str.split('_', expand=True)
train[['Group_No', 'People']] = id_parts
# Convert People from string to float.
train['People'] = train['People'].astype('float64')

In [None]:
# Impute missing values in the training data.
# Each result is assigned back to the column instead of calling
# train[col].fillna(..., inplace=True): that form operates on a temporary
# Series (chained assignment), which pandas warns about and which leaves
# `train` unchanged when Copy-on-Write is enabled.

# These columns get their most frequent value (mode).
for col in ['HomePlanet','CryoSleep','Destination','VIP','Side','Dec']:
    train[col]=train[col].fillna(train[col].mode()[0])

# These columns get their mean.
for col in ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Age']:
    train[col]=train[col].fillna(train[col].mean())

In [None]:
# Create a new column: bucket Age into four labelled groups.
# pd.cut uses right-closed bins here, so an age outside (0, 99] (for example
# exactly 0) is not assigned to any bin and comes out as NaN.
category=pd.cut(train.Age,bins=[0.0,17,25,65,99],labels=['Child','Mini_Adult','Adult','Elderly'])
train.insert(5,'Group',category)
# Put the unbinned ages into 'Child'. Assign back rather than using
# fillna(inplace=True) on train['Group'], which acts on a temporary Series and
# does not update `train` when Copy-on-Write is enabled.
train['Group']=train['Group'].fillna('Child')
# Add up the individual spending columns into a single total.
train['Paid']=train['RoomService']+train['FoodCourt']+train['ShoppingMall']+train['Spa']+train['VRDeck']

In [None]:
# Encode the categorical columns (and the target) as numbers with an ordinal encoder.
from sklearn.preprocessing import OrdinalEncoder
ordinal=OrdinalEncoder()
columns=['HomePlanet','CryoSleep','VIP','Dec','Side','Destination','Transported']
for col in columns:
    # The same encoder object is refit for every column, so after the loop it
    # only holds the categories of the last column.
    train[col]=ordinal.fit_transform(train[col].values.reshape(-1,1))

# Turn the age-group labels into numeric codes 0..3.
# An explicit map assigned back to the column replaces the chained
# train['Group'].replace(..., inplace=True): that form acts on a temporary
# Series, and replace() on a categorical column is deprecated in recent pandas.
# The result is cast to float64 like the encoder output above, so the column
# is plain numeric for the later correlation and normalisation steps.
group_codes={'Child':0,'Mini_Adult':1,'Adult':2,'Elderly':3}
train['Group']=train['Group'].astype(object).map(group_codes).astype('float64')

In [None]:
# Drop the columns that are not used as model features.
dropped_columns = ['PassengerId', "Name", 'Cabin', 'Group_No', 'Age', 'Num']
train = train.drop(columns=dropped_columns)

In [None]:
#匯入套件
from sklearn.ensemble import RandomForestClassifier #模型
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split #分割資料
from sklearn.gaussian_process.kernels import RBF
import mlxtend
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.inspection import permutation_importance

In [None]:
# Draw the correlation matrix of the training columns as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 9))
correlations = train.corr()
ax = sns.heatmap(correlations, ax=ax, cmap='cool', annot=True, fmt=".2f")

print (ax)

In [None]:
# Min-max normalise every column: subtract the column minimum and divide by
# the column range (max - min).
column_min = train.min()
column_span = train.max() - column_min
ntrain = (train - column_min) / column_span

In [None]:
# Target: the Transported column; features: every other column of ntrain.
Y = ntrain["Transported"]
X = ntrain.drop(columns="Transported")

# Split into training and hold-out parts (30% held out, fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)

In [None]:
# Train the model: tune a random forest with a successive-halving grid search,
# then predict on the hold-out split.
# - 'auto' is no longer listed for max_features: for classifiers it was an
#   alias of 'sqrt', and scikit-learn 1.3 removed it, so it raises on current
#   versions.
# - warm_start is no longer searched: each candidate is fitted once on a fresh
#   clone, so the flag only doubled the number of candidates.
# - random_state is set on the forest so repeated runs give the same model.
random_param={'max_depth':[8,10,15],'min_samples_split':[5,10,20,50],'max_features':['sqrt','log2']}
shc=HalvingGridSearchCV(RandomForestClassifier(random_state=100),random_param,factor=3, random_state=100).fit (X_train,Y_train)
# best_estimator_ has already been refit on all of X_train by the search
# (refit=True is the default), so it is used directly without fitting again.
random_forest= shc.best_estimator_
y_pred=random_forest.predict(X_test)
# Report how well the tuned model does on the hold-out split.
print("Hold-out accuracy:",metrics.accuracy_score(Y_test,y_pred))

In [None]:
# Preprocess the test data with the same steps that were applied to the training data.

# Split Cabin on "/" into Dec, Num, Side and PassengerId on "_" into Group_No, People.
test[['Dec','Num','Side']]=test.Cabin.str.split('/',expand=True)
test[['Group_No','People']]=test.PassengerId.str.split('_',expand=True)

# Keep the passenger ids for the submission file before the column is dropped.
index=pd.DataFrame(test['PassengerId'])

# Impute missing values. Results are assigned back to the column instead of
# calling test[col].fillna(..., inplace=True), which acts on a temporary Series
# and leaves `test` unchanged when Copy-on-Write is enabled.
# These columns get their most frequent value (mode).
for col in ['HomePlanet','CryoSleep','Destination','VIP','Side','Dec']:
    test[col]=test[col].fillna(test[col].mode()[0])

# These columns get their mean.
for col in ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Age']:
    test[col]=test[col].fillna(test[col].mean())

# Encode the categorical columns as numbers.
# NOTE(review): the encoder is refit on the test data, so the codes only match
# the training codes if both sets contain the same categories in each column -
# confirm, or reuse encoders fitted on the training data.
from sklearn.preprocessing import OrdinalEncoder
ordinal=OrdinalEncoder()
columns=['HomePlanet','CryoSleep','VIP','Dec','Side','Destination']
for col in columns:
    test[col]=ordinal.fit_transform(test[col].values.reshape(-1,1))

# Bucket Age into the same four groups as the training data; ages that fall in
# no bin (pd.cut gives NaN for them) are put into 'Child'.
category=pd.cut(test.Age,bins=[0.0,17,25,65,99],labels=['Child','Mini_Adult','Adult','Elderly'])
test.insert(5,'Group',category)
test['Group']=test['Group'].fillna('Child')

# Turn the group labels into numeric codes 0..3 with an explicit map assigned
# back to the column (no chained replace(inplace=True) on a categorical column).
group_codes={'Child':0,'Mini_Adult':1,'Adult':2,'Elderly':3}
test['Group']=test['Group'].astype(object).map(group_codes).astype('float64')

# Total spending across the five amenity columns.
test['Paid']=test['RoomService']+test['FoodCourt']+test['ShoppingMall']+test['Spa']+test['VRDeck']

test=test.drop(['PassengerId',"Name",'Cabin','Group_No','Num','Age'],axis=1)
test.People=test.People.astype('float64')

# Normalise with the TRAINING minimum and range, not the test set's own.
# The model was fitted on features scaled by the training statistics, so the
# test features must be put on that same scale for its split thresholds to
# apply. `train` here is the fully preprocessed training frame; it is cast to
# float so the statistics are numeric whatever dtype its Group column has.
train_features=train[test.columns].astype('float64')
train_min=train_features.min()
train_span=train_features.max()-train_min
ntest=(test-train_min)/train_span

In [None]:
# Predict on the normalised test features and write the submission file.
# The predicted class labels are cast to int (0 or 1).
predicted=random_forest.predict(ntest).astype(int)
prediction=index.join(pd.DataFrame(predicted,columns=['Transported']))
# Convert 1/0 to the strings 'True'/'False'. The mapped result is assigned back
# to the column: the chained prediction['Transported'].replace(..., inplace=True)
# acts on a temporary Series and can leave 1/0 in the file under Copy-on-Write.
prediction['Transported']=prediction['Transported'].map({1:'True',0:'False'})
prediction.to_csv("submissionsPT2.csv",index=False)