In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_curve,roc_auc_score
import warnings
warnings.filterwarnings("ignore")  # ignore all warnings for the rest of the session
sns.set(style='darkgrid', context='notebook')  # seaborn theme used by every plot below

In [None]:
# Load the training CSV; the path is relative to the notebook's working directory.
df_train=pd.read_csv('../input/spaceship-titanic/train.csv')

In [None]:
# Preview the first rows of the raw training frame.
df_train.head()

## 将Cabin转换成3列

In [None]:
# Fill missing Cabin values with the most frequent Cabin in the training set,
# so the per-character extraction in the following cells never sees NaN.
most_common_train_cabin = df_train['Cabin'].mode()[0]
df_train['Cabin'] = df_train['Cabin'].fillna(most_common_train_cabin)

In [None]:
# The side is taken from the last character of the Cabin string.
df_train['Cabin Side'] = df_train['Cabin'].apply(lambda cabin: cabin[-1])
df_train['Cabin Side']

In [None]:
# The deck is taken from the first character of the Cabin string.
df_train['Cabin Deck'] = df_train['Cabin'].apply(lambda cabin: cabin[0])
df_train['Cabin Deck']

In [None]:
# "Cabin Num" is the single character at index 2 of the Cabin string.
# NOTE(review): if Cabin is formatted as deck/num/side, this keeps only the first
# digit of the cabin number — confirm whether the full number was intended. Any
# change must be mirrored in the test-set cell so both frames share a schema.
df_train['Cabin Num'] = df_train['Cabin'].apply(lambda cabin: cabin[2])
df_train['Cabin Num']

In [None]:
# Columns removed before modelling; the same list is reused for the test frame.
cols_to_drop = ['Name', 'PassengerId', 'Cabin']
df_train = df_train.drop(columns=cols_to_drop)

In [None]:
# Preview the training frame after the Cabin split and column drop.
df_train.head()

In [None]:
# (rows, columns) of the prepared training frame.
df_train.shape

In [None]:
# Column dtypes and non-null counts, to see which features still have missing values.
df_train.info()

In [None]:
# Summary statistics of the training frame.
df_train.describe()

## 数据可视化

In [None]:
# Class balance of the target, shown as percentage shares.
transported_counts = df_train['Transported'].value_counts()
transported_counts.plot.pie(autopct='%0.2f%%')

In [None]:
# Passenger counts per HomePlanet category.
sns.countplot(data=df_train, y='HomePlanet')

In [None]:
# Passenger counts per CryoSleep value.
sns.countplot(data=df_train, y='CryoSleep')

In [None]:
# Passenger counts per Destination category.
sns.countplot(data=df_train, y='Destination')

In [None]:
# Passenger counts per VIP value.
sns.countplot(data=df_train, y='VIP')

In [None]:
# Age distribution with a kernel-density overlay.
sns.histplot(data=df_train, x='Age', kde=True)

In [None]:
# Correlation is only defined for numeric/boolean columns. Select them explicitly:
# on pandas >= 2.0 DataFrame.corr() no longer drops object columns silently and
# raises when it meets strings such as HomePlanet or Destination.
numeric_features = df_train.select_dtypes(include=['number', 'bool'])
corr = numeric_features.corr()
plt.figure(figsize=(12,9))
sns.heatmap(corr,cmap = "icefire",annot=True,fmt=".2f")

In [None]:
# Load the test CSV; the path is relative to the notebook's working directory.
df_test=pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
# Fill missing Cabin values with the most frequent Cabin of the test set itself.
most_common_test_cabin = df_test['Cabin'].mode()[0]
df_test['Cabin'] = df_test['Cabin'].fillna(most_common_test_cabin)

In [None]:
# The side is taken from the last character of the Cabin string (same rule as for train).
df_test['Cabin Side'] = df_test['Cabin'].apply(lambda cabin: cabin[-1])
df_test['Cabin Side']

In [None]:
# The deck is taken from the first character of the Cabin string (same rule as for train).
df_test['Cabin Deck'] = df_test['Cabin'].apply(lambda cabin: cabin[0])
df_test['Cabin Deck']

In [None]:
# "Cabin Num" is the single character at index 2 of the Cabin string, matching
# the rule applied to the training frame.
# NOTE(review): if Cabin is formatted as deck/num/side, this keeps only the first
# digit of the cabin number — confirm, and change train and test together.
df_test['Cabin Num'] = df_test['Cabin'].apply(lambda cabin: cabin[2])
df_test['Cabin Num']

In [None]:
# Feature-only copy of the test frame; df_test itself keeps PassengerId for the submission files.
df_final_test = df_test.drop(columns=cols_to_drop)

In [None]:
# Preview the prepared test features.
df_final_test.head()

In [None]:
# (rows, columns) of the prepared test features.
df_final_test.shape

In [None]:
# Column dtypes and non-null counts of the test features.
df_final_test.info()

In [None]:
# Summary statistics of the test features.
df_final_test.describe()

## 模型数据处理

In [None]:
# Separate the target column from the feature matrix.
y = df_train['Transported']
x = df_train.drop(columns=['Transported'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Cast the target to 0/1 integers. This cell runs after the train/validation
# split, so y_train and y_val must be cast as well. Otherwise models fitted on
# the split would still use the original labels while models fitted on the full
# `y` use integers.
y = y.astype('int')
y_train = y_train.astype('int')
y_val = y_val.astype('int')

In [None]:
# Partition the feature columns by dtype: 'object' columns feed the categorical
# branch and 'float64' columns feed the numeric branch. Columns of any other
# dtype end up in neither list.
feature_dtypes = df_train[x_train.columns].dtypes
obj_cols = [name for name, dtype in feature_dtypes.items() if dtype == 'object']
num_cols = [name for name, dtype in feature_dtypes.items() if dtype == 'float64']

## Pipeline and ColumnTransformer

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  # missing-value imputation
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # scaling and categorical encoding
from sklearn.compose import ColumnTransformer  # routes column groups to their own pipeline

# Numeric branch: mean-impute missing values, then standardise.
numeric_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scaling', StandardScaler()),
]
num_preprocess = Pipeline(steps=numeric_steps)

# Categorical branch: fill with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit pass through at predict time.
categorical_steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
]
obj_preprocess = Pipeline(steps=categorical_steps)

# Apply each branch to its own column list.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_preprocess, num_cols),
        ('obj', obj_preprocess, obj_cols),
    ]
)

## 使用KNN进行预测

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline KNN classifier using the library's default hyperparameters.
knn = KNeighborsClassifier()

In [None]:
# Chain the shared preprocessing with the baseline KNN model.
knn_steps = [('preprocess', preprocessor), ('model', knn)]
knn_pipe = Pipeline(steps=knn_steps)

In [None]:
# Fit on the training split, then evaluate on the held-out validation split.
knn_pipe.fit(x_train, y_train)
y_predict = knn_pipe.predict(x_val)
accuracy = knn_pipe.score(x_val, y_val)

In [None]:
# Print the true labels next to the predictions, aligned on the validation index.
y_predict_df = pd.Series(y_predict, index=y_val.index, name='y_predict').to_frame()
y_val_predict_df = pd.concat([y_val, y_predict_df], axis=1)
print('真实值与预测值', '-'*30, '\n', y_val_predict_df)

In [None]:
# Report the validation accuracy of the baseline KNN pipeline.
print('The accuracy of the KNN is:',accuracy)

In [None]:
## Evaluate the model with a confusion matrix
# confusion_matrix expects (y_true, y_pred): rows are the true labels and columns
# the predicted labels, which is what the axis labels below state.
confusion_matrix_result = confusion_matrix(y_val, y_predict)
print('The confusion matrix result:\n',confusion_matrix_result)

# Visualise the result as a heatmap; fmt='d' prints integer counts rather than
# scientific notation.
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title("Confusion matrix")
plt.show()

In [None]:
# Evaluation report: per-class precision, recall and F1 on the validation split
from sklearn.metrics import classification_report
print(classification_report(y_val, y_predict))

In [None]:
# Plot the ROC Curve
from sklearn.metrics import roc_curve 
from sklearn.metrics import roc_auc_score 

# ROC/AUC need a continuous score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point, so use the predicted probability of the
# positive class (the second column of predict_proba).
y_score = knn_pipe.predict_proba(x_val)[:, 1]

fpr_test, tpr_test, thresholds_test = roc_curve(y_val, y_score)
auc_test = roc_auc_score(y_val, y_score)

plt.plot(fpr_test, tpr_test, 'g-',label ='AUC:%.3f'%auc_test)
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal for reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')  
plt.legend()
plt.show()

In [None]:
k_range = range(1, 51)
k_scores = []

# Sweep n_neighbors and record the accuracy on the single hold-out validation
# split. Loop-local names are used so the already fitted `knn` / `knn_pipe`
# from the earlier cells are not silently replaced by the last candidate.
for k in k_range:
    candidate_pipe = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', KNeighborsClassifier(n_neighbors=k)),
    ])
    candidate_pipe.fit(x_train, y_train)
    k_scores.append(candidate_pipe.score(x_val, y_val))

plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
# The score comes from one hold-out split, not from cross-validation.
plt.ylabel('Validation Accuracy')
plt.show()

#### 由图可知，最优k为8

## 模型优化

In [None]:
# Build the tuned model: KNN with k = 8 behind the shared preprocessing.
clf = KNeighborsClassifier(n_neighbors=8)
clf_pipe = Pipeline(steps=[('preprocess', preprocessor), ('model', clf)])
clf_pipe = clf_pipe.fit(x_train, y_train)

# Predict the validation split.
y_best_predict = clf_pipe.predict(x_val)

# Validation accuracy of the tuned model.
accuracy = clf_pipe.score(x_val, y_val)
print('The accuracy of the KNN is:', accuracy)

### 开始预测

In [None]:
# Predict the competition test set with the tuned KNN pipeline.
df_test_pred_knn=clf_pipe.predict(df_final_test) 

In [None]:
# Assemble the submission table: PassengerId from the raw test frame plus the predictions.
knn_submission = pd.DataFrame(
    {
        'PassengerId': df_test['PassengerId'],
        'Transported': df_test_pred_knn,
    }
)
knn_submission

In [None]:
# Normalise the predictions to real booleans so the column serialises as
# True/False. replace({0: ..., 1: ...}) only matches integer labels and yields
# strings; astype(bool) handles both boolean and 0/1 integer predictions and is
# safe to re-run.
knn_submission['Transported'] = knn_submission['Transported'].astype(bool)

In [None]:
# Distribution of predicted classes in the KNN submission.
knn_submission['Transported'].value_counts()

In [None]:
#knn_submission.to_csv('knn_submission.csv',index=False)

## 使用SVM进行预测

In [None]:
from sklearn.svm import SVC
# Support-vector classifier using the library's default hyperparameters.
svm_model = SVC()

In [None]:
# Chain the shared preprocessing with the SVM model.
svm_steps = [('preprocess', preprocessor), ('model', svm_model)]
svm_pipe = Pipeline(steps=svm_steps)

In [None]:
# Fit on the training split only, then evaluate on the validation split.
svm_pipe.fit(x_train, y_train)
y_predict_svm = svm_pipe.predict(x_val)
accuracy = svm_pipe.score(x_val, y_val)
print('The accuracy of the SVM is:', accuracy)

In [None]:
# Refit the SVM pipeline on all labelled rows (x, y) before predicting the test set.
# From here on svm_pipe has seen the validation rows, so the validation metrics
# below rely on y_predict_svm, which was computed before this refit.
svm_pipe.fit(x,y)

In [None]:
## Evaluate the model with a confusion matrix
# confusion_matrix expects (y_true, y_pred): rows are the true labels and columns
# the predicted labels, which is what the axis labels below state.
confusion_matrix_result = confusion_matrix(y_val, y_predict_svm)
print('The confusion matrix result:\n',confusion_matrix_result)

# Visualise the result as a heatmap; fmt='d' prints integer counts rather than
# scientific notation.
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title("Confusion matrix")
plt.show()

In [None]:
# ROC curve for the SVM on the validation split.
from sklearn.metrics import roc_curve, roc_auc_score

# NOTE(review): the scores here are the hard class predictions made before the
# full-data refit, so the curve has a single operating point. svm_pipe has since
# been refit on all rows, so calling decision_function on x_val at this point
# would score rows the model was trained on.
svm_scores = y_predict_svm
fpr_test, tpr_test, thresholds_test = roc_curve(y_val, svm_scores)
auc_test = roc_auc_score(y_val, svm_scores)

plt.plot(fpr_test, tpr_test, 'g-', label='AUC:%.3f' % auc_test)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Predict on df_final_test, the same prepared feature frame the KNN model
# predicts on, instead of the raw df_test that still carries the dropped
# Name/PassengerId/Cabin columns. This keeps both models' inputs consistent.
df_test_pred_svm = svm_pipe.predict(df_final_test)

In [None]:
# Assemble the submission table: PassengerId from the raw test frame plus the predictions.
svm_submission = pd.DataFrame(
    {
        'PassengerId': df_test['PassengerId'],
        'Transported': df_test_pred_svm,
    }
)
svm_submission

In [None]:
# Normalise the predictions to real booleans so the column serialises as
# True/False. replace({0: ..., 1: ...}) only matches integer labels and yields
# strings; astype(bool) handles both boolean and 0/1 integer predictions and is
# safe to re-run.
svm_submission['Transported'] = svm_submission['Transported'].astype(bool)

In [None]:
# Distribution of predicted classes in the SVM submission.
svm_submission['Transported'].value_counts()

In [None]:
# svm_submission.to_csv('svm_submission.csv',index=False)