In [2]:
import pandas as pd
# Read the training and test tables from the working directory.
train_df, test_df = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in ("train.csv", "test.csv")
)

In [3]:
# Number of missing values in each column of the training frame.
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Per-column medians of the training set, used to fill missing numeric values
# in both frames so that the test set is imputed with training statistics.
# numeric_only=True is required: the frame also holds string columns, and
# recent pandas versions raise a TypeError when asked for their median.
med = train_df.median(numeric_only=True)
train_df = train_df.fillna(med)
test_df = test_df.fillna(med)

In [5]:
# Most frequent "Embarked" value in the training set; it fills the gaps in
# that column for both the training and the test frame.
embarked_counts = train_df["Embarked"].value_counts()
most = embarked_counts.idxmax()
train_df = train_df.fillna({"Embarked": most})
test_df = test_df.fillna({"Embarked": most})

In [6]:
# Worked example: isolate the title inside a single name string.
n = "Braund, Mr. Owen Harris"
# Keep the text after the last comma, then what precedes the first period.
n = n.rsplit(",", 1)[-1].partition(".")[0]
# strip() removes the surrounding whitespace
n.strip()

'Mr'

In [7]:
def nameflow(n):
    """Return the title embedded in a name string.

    The title is the text after the last comma and before the first period
    that follows it, with surrounding whitespace removed
    (e.g. "Braund, Mr. Owen Harris" -> "Mr").
    """
    after_comma = n.rsplit(",", 1)[-1]
    title, _, _ = after_comma.partition(".")
    return title.strip()
# How often each extracted title occurs among the training names.
mid = train_df["Name"].map(nameflow).value_counts()
# Titles that occur more than 30 times are the ones kept as categories.
reserved = mid.loc[mid.gt(30)].index
reserved

Index(['Mr', 'Miss', 'Mrs', 'Master'], dtype='object')

In [8]:
# Helper meant to be used with Series.apply(nameflow2, args=(titles,))
def nameflow2(n, r):
    """Map a name string to its title, or to None for titles not in `r`.

    Parameters
    ----------
    n : str
        Name string; the title is the text after the last comma and before
        the first period that follows it.
    r : container of str
        Titles to keep. Anything supporting the `in` operator works.

    Returns
    -------
    str or None
        The stripped title when it is a member of `r`, otherwise None.
    """
    title = n.split(",")[-1].split(".")[0].strip()
    # Membership is tested against the argument `r`, not a module-level
    # variable, so the function honours whatever set of titles it is given.
    return title if title in r else None
# Replace every full name by its title, or by None when the title is not one
# of the values in `reserved`.
train_df["Name"] = train_df["Name"].apply(lambda full_name: nameflow2(full_name, reserved))
test_df["Name"] = test_df["Name"].apply(lambda full_name: nameflow2(full_name, reserved))

In [9]:
# one-hot encoding
# Passing dummy_na=True as well would add one extra indicator column per
# feature for missing (None/NaN) values.
encoded_cols = ["Name", "Sex", "Embarked"]
x_train_nodrop = pd.get_dummies(train_df, columns=encoded_cols)
x_test_nodrop = pd.get_dummies(test_df, columns=encoded_cols)

# The two frames are encoded independently, so a category that occurs in only
# one of them would leave train and test with different indicator columns.
# Give the test frame exactly the indicator columns produced for the training
# frame: missing ones are added as all-zero, unseen ones are dropped. When the
# columns already agree this is a no-op.
train_dummy_cols = [c for c in x_train_nodrop.columns if c not in train_df.columns]
test_passthrough_cols = [c for c in test_df.columns if c not in encoded_cols]
x_test_nodrop = x_test_nodrop.reindex(
    columns=test_passthrough_cols + train_dummy_cols, fill_value=0
)

In [10]:
# Split the encoded frames into model inputs, training target and test ids.
y_train = x_train_nodrop["Survived"]
testid = x_test_nodrop["PassengerId"]

# "PassengerId", "Cabin" and "Ticket" are left out of the feature matrices;
# the training matrix additionally drops the target column.
x_train = x_train_nodrop.drop(columns=["PassengerId", "Survived", "Cabin", "Ticket"])
x_test = x_test_nodrop.drop(columns=["PassengerId", "Cabin", "Ticket"])
x_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Name_Master,Name_Miss,Name_Mr,Name_Mrs,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,0,0,1,0,0,1,0,0,1
1,1,38.0,1,0,71.2833,0,0,0,1,1,0,1,0,0
2,3,26.0,0,0,7.9250,0,1,0,0,1,0,0,0,1
3,1,35.0,1,0,53.1000,0,0,0,1,1,0,0,0,1
4,3,35.0,0,0,8.0500,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,0,0,0,0,1,0,0,1
887,1,19.0,0,0,30.0000,0,1,0,0,1,0,0,0,1
888,3,28.0,1,2,23.4500,0,1,0,0,1,0,0,0,1
889,1,26.0,0,0,30.0000,0,0,1,0,0,1,1,0,0


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Random-forest settings to search: tree counts 20, 22, ..., 38 and
# maximum depths 8 and 9.
params = {
    "n_estimators":range(20, 40, 2),
    "max_depth":range(8,10)
}
# A fixed random_state makes every forest in the search deterministic, so the
# reported best parameters and score are the same on each run.
clf = RandomForestClassifier(random_state=42)
# 10-fold cross-validated search, evaluated with 4 parallel jobs.
cv = GridSearchCV(clf, params, cv=10, n_jobs=4)
cv.fit(x_train, y_train)
print(cv.best_params_)
print(cv.best_score_)

{'max_depth': 8, 'n_estimators': 32}
0.8373158551810237


In [12]:
# The API accepts an n_jobs argument so the work can run in parallel
from sklearn.model_selection import cross_val_score
import numpy as np
# Cross-validate one hand-picked configuration (40 trees, depth 8).
# random_state is fixed so the fold scores are the same on every run.
clf = RandomForestClassifier(n_estimators= 40, max_depth=8, random_state=42)
score = cross_val_score(clf, x_train, y_train, cv=10, n_jobs=4)
# Per-fold accuracy followed by its mean over the 10 folds.
print(score)
print(np.average(score))

[0.78888889 0.85393258 0.73033708 0.88764045 0.91011236 0.84269663
 0.80898876 0.7752809  0.8988764  0.86516854]
0.8361922596754058


In [13]:
# Best parameters
# NOTE(review): n_estimators=40 lies outside the grid searched earlier
# (range(20, 40, 2) stops at 38) - confirm it is the intended final setting.
# random_state is fixed so the fitted forest, and therefore the submission
# file written below, is identical on every run.
clf = RandomForestClassifier(n_estimators=40, max_depth=8, random_state=42)
clf.fit(x_train, y_train)
pre = clf.predict(x_test)
# Two-column table (passenger id, predicted label) written to rf.csv.
result_df = pd.DataFrame({
    "PassengerId":testid,
    "Survived":pre
})
result_df.to_csv("rf.csv", index=False, encoding="utf-8")
result_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [14]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training features only, then apply that same fitted
# scaling to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_scale = scaler.transform(x_train)
x_test_scale = scaler.transform(x_test)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
# Candidate neighbour counts: every integer from 5 to 49.
params = {"n_neighbors": range(5, 50)}
clf = KNeighborsClassifier()
# 10-fold cross-validated search on the scaled features, 4 parallel jobs.
cv = GridSearchCV(estimator=clf, param_grid=params, cv=10, n_jobs=4)
cv.fit(x_train_scale, y_train)
print(cv.best_params_, cv.best_score_, sep="\n")

{'n_neighbors': 22}
0.8193508114856428


In [16]:
# Predict the scaled test features with the best estimator found by the
# search, then write the (passenger id, predicted label) table to knn.csv.
pre = cv.best_estimator_.predict(x_test_scale)
result_df = pd.DataFrame({"PassengerId": testid}).assign(Survived=pre)
result_df.to_csv("knn.csv", index=False, encoding="utf-8")
result_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
