In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training split and tag every row so it can be recovered after
# train and test are stacked together.
train = pd.read_csv('../input/spaceship-titanic/train.csv').assign(isTrain=1)

In [None]:
# Load the test split; isTrain=0 marks these rows as test rows.
test = pd.read_csv('../input/spaceship-titanic/test.csv').assign(isTrain=0)

In [None]:
# Stack train and test so feature engineering is applied to both at once.
# DataFrame.append was removed in pandas 2.0, so use pd.concat; ignore_index
# gives the combined frame unique row labels instead of repeating 0..n-1.
dataset = pd.concat([train, test], sort=False, ignore_index=True)

In [None]:
# Preview the first rows of the combined train/test frame.
dataset.head()

In [None]:
# Column dtypes and non-null counts of the combined frame.
dataset.info()

In [None]:
# Number of missing values per column.
dataset.isna().sum()

In [None]:
# PassengerId has the form "<group>_<member>"; keep both parts as columns.
id_parts = dataset['PassengerId'].str.split('_', expand=True)
dataset[['group', 'member']] = id_parts

In [None]:
# Distribution of CryoSleep. The variable is passed by keyword: in
# seaborn >= 0.12 the first positional argument of countplot is `data`, not `x`.
sns.countplot(x='CryoSleep', data=dataset)

In [None]:
# Cabin has the form "deck/num/side"; split it into three columns.
cabin_parts = dataset['Cabin'].str.split('/', expand=True)
dataset[['deck', 'num', 'side']] = cabin_parts
dataset.head()

In [None]:
# Distribution of cabin side. The variable is passed by keyword: in
# seaborn >= 0.12 the first positional argument of countplot is `data`, not `x`.
sns.countplot(x='side', data=dataset)

In [None]:
# Distribution of cabin deck. The variable is passed by keyword: in
# seaborn >= 0.12 the first positional argument of countplot is `data`, not `x`.
sns.countplot(x='deck', data=dataset)

In [None]:
# Share of transported vs. not-transported passengers in the training set.
# Wedges are labelled with the Transported value so the chart is readable.
transported_counts = train.Transported.value_counts()
plt.pie(
    transported_counts,
    labels=transported_counts.index.astype(str),
    autopct='%1.1f%%',
)
plt.title('Transported (training set)')
plt.show()

In [None]:
# Distribution of HomePlanet in the training set. The variable is passed by
# keyword: in seaborn >= 0.12 the first positional argument is `data`, not `x`.
sns.countplot(x='HomePlanet', data=train)

In [None]:
# Distribution of Destination. The variable is passed by keyword: in
# seaborn >= 0.12 the first positional argument of countplot is `data`, not `x`.
sns.countplot(x='Destination', data=dataset)

In [None]:
# Passenger counts per Age value, split by Transported.
sns.countplot(x='Age',hue='Transported',data=dataset)

In [None]:
# Passenger counts per VIP value, split by Transported.
sns.countplot(x='VIP',hue='Transported',data=dataset)

In [None]:
# Encode HomePlanet into the integer feature `Home`; missing planets are
# first replaced by the placeholder 'NoRecord', which gets its own code.
home = {
    'Europa': 0,
    'Earth': 1,
    'Mars': 2,
    'NoRecord': 3,
}
dataset['HomePlanet'] = dataset['HomePlanet'].fillna('NoRecord')
dataset['Home'] = dataset['HomePlanet'].map(home)

In [None]:
# Encode CryoSleep in place: False -> 0, True -> 1, missing -> 2.
sleep = {True: 1, False: 0, 'NoRecord': 2}
dataset['CryoSleep'] = dataset['CryoSleep'].fillna('NoRecord').map(sleep)


In [None]:
# Fill the Cabin-derived features (deck, num, side).
# Missing cabin numbers become the sentinel 2000 before the integer cast;
# missing deck / side get the placeholder letters 'H' / 'N'.
dataset['num'] = dataset['num'].fillna('2000').astype(int)
for cabin_col, placeholder in (('deck', 'H'), ('side', 'N')):
    dataset[cabin_col] = dataset[cabin_col].fillna(placeholder)



In [None]:
# Integer codes for deck and side; 'H' and 'N' are the missing-value placeholders.
deck = dict(zip('ABCDEFGTH', range(9)))
side = dict(zip('PSN', range(3)))
dataset['deck'] = dataset['deck'].map(deck)
dataset['side'] = dataset['side'].map(side)

In [None]:
# Inspect the first 20 Destination values.
dataset.Destination.head(20)

In [None]:
# Encode Destination in place as an integer; missing values get code 3.
destination = {
    'TRAPPIST-1e': 0,
    'PSO J318.5-22': 1,
    '55 Cancri e': 2,
    'NoRecord': 3,
}
dataset['Destination'] = dataset['Destination'].fillna('NoRecord').map(destination)

In [None]:
# Impute missing Age with the mean Age of the combined frame.
mean_age = dataset['Age'].mean()
dataset['Age'] = dataset['Age'].fillna(mean_age)


In [None]:
# Encode VIP in place: False -> 0, True -> 1, missing -> 2.
vip = {True: 1, False: 0, 'NoRecord': 2}
dataset['VIP'] = dataset['VIP'].fillna('NoRecord').map(vip)

In [None]:
# Spending columns: passengers in CryoSleep get their spending set to 0.
def fill_zero(column):
    """Set `column` to 0 on every row whose CryoSleep value equals 1.

    Rows with any other CryoSleep value keep their current entry (including
    NaN). Writes the result back into the module-level `dataset` and returns it.
    """
    in_cryosleep = (dataset['CryoSleep'] == 1).to_numpy()
    dataset[column] = np.where(in_cryosleep, 0, dataset[column].to_numpy())
    return dataset


for spend_col in ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']:
    dataset = fill_zero(spend_col)

In [None]:
# Spending columns: remaining missing values are filled with the column mean.
def fill_na(column):
    """Replace missing values in `column` with that column's current mean.

    Writes the result back into the module-level `dataset` and returns it.
    """
    dataset[column] = dataset[column].fillna(dataset[column].mean())
    return dataset


for spend_col in ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']:
    dataset = fill_na(spend_col)

In [None]:
# `luxury` is the row-wise total of the five spending columns.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
dataset['luxury'] = dataset[spend_cols].sum(axis=1)


In [None]:
# Name column: replace missing names with the placeholder 'NoRecord'.
dataset['Name'] = dataset['Name'].fillna('NoRecord')

In [None]:
# Extract the last name (second space-separated token of Name) for the
# family feature; names with fewer than two tokens get 'NoRecord'.
name_tokens = dataset['Name'].str.split(' ')
dataset['last_name'] = [
    tokens[1] if len(tokens) > 1 else 'NoRecord'
    for tokens in name_tokens
]


In [None]:
# Number of passengers sharing each travel group, attached to every row.
passengers_per_group = dataset.groupby(['group'])['PassengerId'].count()
dataset['group_size'] = dataset['group'].map(passengers_per_group.to_dict())

In [None]:
# Count how many passengers in the same group share a last name (a hint that
# they are one family) and join that count back onto every row. The join adds
# the count as `group_size_y` next to the existing `group_size_x`.
same_name_counts = dataset.groupby(['group', 'last_name']).group_size.count()
dataset = pd.merge(dataset, same_name_counts, on=['group', 'last_name'])


In [None]:
# Restore readable column names after the merge, then flag families:
# family = 1 when the group has more than one member and every member
# shares the same last name (group_size == same_name), else 0.
dataset = dataset.rename(
    columns={'group_size_x': 'group_size', 'group_size_y': 'same_name'}
)
is_family = (dataset['group_size'] > 1) & (dataset['group_size'] == dataset['same_name'])
dataset['family'] = is_family.astype('int64')

In [None]:
# Encode `group` as integer labels with scikit-learn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
dataset['group'] = encoder.fit_transform(dataset['group'])

In [None]:
# Re-check dtypes and non-null counts after feature engineering.
dataset.info()

In [None]:
# Cast the spending columns and the within-group member number to integers.
for int_col in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'member']:
    dataset[int_col] = dataset[int_col].astype(int)


In [None]:
# Split the engineered frame back into train and test using the isTrain flag.
# reset_index gives each part its own 0..n-1 labels (so positional fold
# indices line up with labels) and returns independent copies of the rows.
train = dataset.query('isTrain==1').reset_index(drop=True)
test = dataset.query('isTrain==0').reset_index(drop=True)

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold

In [None]:
# Dtypes and non-null counts of the training part after the split.
train.info()

In [None]:
# Model inputs: the full train/test frames (feature columns are selected
# later via `predictors`) and the target cast to integers.
x_train, x_test = train, test
y_train = train['Transported'].astype('int')

In [None]:
# Cast the target to int. Redundant when the previous cell has run, since
# y_train is already cast there.
y_train=y_train.astype('int')

In [None]:
# Feature columns fed to the model, in the order they are passed.
predictors = [
    'CryoSleep', 'Destination', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'group', 'member',
    'deck', 'num', 'side',
    'Home', 'luxury', 'group_size', 'family',
]

In [None]:
#X_train,X_valid,Y_train,Y_valid=train_test_split(x_train[predictors],y_train.astype('int'),train_size=0.8,random_state=0)

In [None]:
# Stratified K-fold cross-validation with LightGBM.
# For every seed and fold: train on the fold's training rows with early
# stopping on the validation rows, store out-of-fold predictions in `oof`,
# store test-set predictions as one column of `pred_y`, and accumulate the
# gain-based feature importance averaged over all trained models.
params = {
    'learning_rate': 0.05,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'verbose': 1,
    'seed': 2222,
    'n_jobs': 4,
}
fold_num = 5
seeds = [2222]
oof = np.zeros(len(x_train))
importance = 0
pred_y = pd.DataFrame()
train_features = x_train[predictors]
for seed in seeds:
    kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_features, y_train)):
        print('-----------', fold)
        # kf.split yields positional indices, so select rows with .iloc
        # rather than .loc (which would look them up as index labels).
        l_train = lgb.Dataset(train_features.iloc[train_idx],
                              y_train.iloc[train_idx])
        l_val = lgb.Dataset(train_features.iloc[val_idx],
                            y_train.iloc[val_idx])
        # Early stopping and periodic logging are passed as callbacks; the
        # early_stopping_rounds / verbose_eval keyword arguments were removed
        # from lgb.train in LightGBM 4.
        model = lgb.train(
            params,
            l_train,
            valid_sets=[l_val],
            num_boost_round=10000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=200),
            ],
        )

        # Each row is in the validation fold once per seed, so dividing by
        # len(seeds) averages the out-of-fold prediction over seeds.
        oof[val_idx] += model.predict(train_features.iloc[val_idx]) / len(seeds)
        pred_y['fold_%d_seed_%d' % (fold, seed)] = model.predict(x_test[predictors])
        # Average over every trained model (folds x seeds).
        importance += model.feature_importance(importance_type='gain') / (fold_num * len(seeds))
print('AUC... ',auc(y_train, oof))

In [None]:
#model = LogisticRegression()
#model = XGBClassifier(n_estimators=200,learning_rate=0.05,random_state=0)
#model = lgb.train(params, l_train, valid_sets=l_val,early_stopping_rounds=5,num_boost_round=10000)

In [None]:
#model.fit(x_train[predictors],y_train.astype(int))

#model.fit(X_train,Y_train.astype(int),eval_set=[(X_valid, Y_valid)],early_stopping_rounds=5)

In [None]:
# predictions = model.predict(x_test[predictors])
# predictions=pd.Series([1 if i >=0.5 else 0 for i in predictions])
# result = pd.DataFrame({'PassengerId':x_test['PassengerId'].values, 'Transported':predictions.astype(bool)})
# result.to_csv("./submission.csv", index=False)
# result.Transported.value_counts()

In [None]:
# Average the per-fold test predictions, threshold at 0.5, and build the
# submission frame with a boolean Transported column.
mean_prob = pred_y.mean(axis=1)
pred = (mean_prob >= 0.5).astype('int64')
submit = pd.DataFrame({
    'PassengerId': x_test['PassengerId'].values,
    'Transported': pred.astype(bool),
})


In [None]:
# Write the submission file (without the index column), then show how many
# rows were predicted True vs. False.
submit.to_csv("./submission.csv", index=False)
submit.Transported.value_counts()