    Procedure : (for RandomForestRegressor)

    1. Read Data
         - sampling (train_set & test_set)
         - Numerical Data & Categorical Data & Labels
         - Filter and Capture the Data
         
    2. Numerical Data Processing
         - (Remove Duplicate Data)
         - Missing Value Imputation (MVI)
         - (Standard Scaler)
         
    3. Categorical Data Processing
         - Missing Value Imputation (MVI)
         - Encode Labels

    4. Training Model
         - for the imputer (Age、Embarked、Fare、Cabin)
         - for the label (Survived)
    5. 
    6. ...
    


In [159]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn import preprocessing, linear_model

from sklearn.model_selection import StratifiedShuffleSplit    # 分層抽樣
from sklearn.impute import SimpleImputer                      # 處理缺漏值

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder              # 分類屬性
from sklearn.preprocessing import OneHotEncoder               # 建立二元屬性
from sklearn.preprocessing import StandardScaler              # 標準化

from sklearn.linear_model import LinearRegression
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor            # 隨機森林

# from sklearn.model_selection import cross_val_score           # K-fold 交叉驗證 (回傳評估分數)
# from sklearn.model_selection import cross_val_predict         # K-fold 交叉驗證 (回傳各個測試的預測)
from sklearn.model_selection import StratifiedKFold           # K-fold 交叉驗證 (分層採樣)
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import confusion_matrix                  # 混淆矩陣
from sklearn.metrics import accuracy_score                    # 準確率[(TN+TP)/(TN+TP+FN+FP)]
from sklearn.metrics import classification_report             # precision & recall 報告
from sklearn.metrics import precision_score, recall_score     # precision & recall score
from sklearn.metrics import f1_score                          # precision & recall score 結合分數
from sklearn.metrics import precision_recall_curve            # precision & recall 取捨評估
from sklearn.metrics import roc_curve                         # ROC 曲線
from sklearn.metrics import roc_auc_score                     # ROC AUC (面積)

from sklearn.model_selection import GridSearchCV              # 網格搜尋
from sklearn.model_selection import RandomizedSearchCV        # 隨機搜尋
# 分析最佳模型與它們的誤差 → xxxCV.best_estimator_.feature_importances_

from sklearn.linear_model import SGDClassifier                # 二元分類器
from sklearn.ensemble import RandomForestClassifier           # 隨機森林分類器
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [160]:
tita_train = pd.read_csv(r'Data\train.csv')
tita_test = pd.read_csv(r'Data\test.csv')

train_miss = [m for m in tita_train.columns if tita_train[m].isnull().any(axis=0)]
print(pd.isnull(tita_train[train_miss]).sum(), '\n')

test_miss = [m for m in tita_test.columns if tita_test[m].isnull().any(axis=0)]
print(pd.isnull(tita_test[test_miss]).sum(), '\n')

titanic = tita_train.append(tita_test)
titanic.info()
titanic.describe()

Age         177
Cabin       687
Embarked      2
dtype: int64 

Age       86
Fare       1
Cabin    327
dtype: int64 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


### Data Processing

In [161]:
# (Data Conversion)

label = LabelEncoder()

# <Sex>
titanic['Sex_lab'] = label.fit_transform(titanic['Sex'])

# <Ticket>: 有無重複票根
ticket_count = titanic.Ticket.value_counts()
titanic['Ticket_IO'] = [1 if ticket_count.loc[t] >= 2 else 0 for t in titanic['Ticket']]

# <SibSp & Parch>: 有無家人
titanic['Family_IO'] = titanic.SibSp + titanic.Parch
titanic['Family_IO'] = [0 if p ==0 else 1 for p in titanic.Family_IO]

# <Name>
titanic['Title'] = titanic.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
titanic['Title'] = titanic['Title'].replace(['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 
                                             'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir'],
                                            ['Mr', 'Mr', 'Mrs', 'Mr', 'Dr', 'Mr', 'Mrs', 'Mr',
                                             'Master', 'Miss', 'Miss', 'Miss', 'Mr', 'Mrs', 'Mrs', 'Mr', 'Mr'])
titanic['Title'] = label.fit_transform(titanic['Title'])

tita_prepare = titanic[['Pclass', 'Title', 'Sex_lab','Family_IO', 'Ticket_IO']].copy()
tita_label = titanic['Survived'].copy()

'''
Feather  : ['Pclass', 'Title', 'Sex_lab','Family_IO', 'Ticket_IO']
Miss(any): ['Age_Bin', 'Fare_Bin', 'Cabin_n', 'Embarked_lab']

Label    : ['Survived']
'''

tita_prepare.head(5)
# tita_prepare.info()
# tita_prepare.describe()

Unnamed: 0,Pclass,Title,Sex_lab,Family_IO,Ticket_IO
0,3,4,1,1,0
1,1,5,0,1,1
2,3,3,0,0,0
3,1,5,0,1,1
4,3,4,1,0,0


In [162]:
# (Missing Value Imputation (MVI): Embarked, Fare)

# <Embarked>
titanic.Embarked.fillna('S')
tita_prepare['Embarked_lab'] = label.fit_transform(titanic['Embarked'])

# <Fare>
titanic.Fare.fillna(titanic.Fare.median())
titanic['Fare_Bin'] = pd.qcut(titanic.Fare, 5)
tita_prepare['Fare_Bin'] = label.fit_transform(titanic['Fare_Bin'])

# titanic[['Embarked_lab', 'Fare_Bin', 'Age_Bin', 'Cabin_n']].head()
tita_prepare.head()

Unnamed: 0,Pclass,Title,Sex_lab,Family_IO,Ticket_IO,Embarked_lab,Fare_Bin
0,3,4,1,1,0,2,0
1,1,5,0,1,1,0,4
2,3,3,0,0,0,2,1
3,1,5,0,1,1,2,4
4,3,4,1,0,0,2,1


In [163]:
# (Missing Value Imputation (MVI): Age)

tita_prepare['Age_Bin'] = pd.qcut(titanic.Age, 10)
mask = pd.isnull(tita_prepare.Age_Bin)

age_train = tita_prepare.dropna(subset=['Age_Bin'], how='any')
age_feature = age_train.drop('Age_Bin', axis=1)
age_label = label.fit_transform(age_train.Age_Bin)

# (training)
forest_clf_age = RandomForestClassifier(random_state=42)
forest_clf_age.fit(age_feature, age_label)
preds_age = forest_clf_age.predict(age_feature)

print(pd.crosstab(preds_age, age_label), '\n')
print('正確率 :', forest_clf_age.score(age_feature, age_label), '\n')

# (prepare the feature for the prediction)
age_preds = tita_prepare[tita_prepare['Age_Bin'].isnull()]
age_pred_feature = age_preds.drop('Age_Bin', axis=1)
preds_age = forest_clf_age.predict(age_pred_feature)

# (利用 model 預測遺失的 Age_Bin)
tita_prepare.Age_Bin = label.fit_transform(tita_prepare['Age_Bin'])
preds_age_nan = forest_clf_age.predict(tita_prepare[mask].drop('Age_Bin', axis=1))


# (將 Encoded 的 Nan 換成訓練出來的 labels)
n = 0
for s in range(len(tita_prepare)):
    if tita_prepare.Age_Bin.iloc[s] == 10:
        tita_prepare.Age_Bin.iloc[s] = preds_age_nan[n]
        n += 1

tita_prepare.head(8)


col_0   0   1   2   3   4   5   6   7   8   9
row_0                                        
0      94   7   4   1   0   0   0   0   0   0
1       5  32   8   7   3   6   2   3   1   1
2       2  38  71  41  29  25  32  17  14  11
3       1  11   9  28  15  12  11  10   6   7
4       1   3   2   1  12   0   4   5   1   1
5       1   2   4   2   3  21   7   5   1   2
6       1   8   3  14  13  19  38  12  13   5
7       2   5   2   3   4   5   6  23   7   3
8       0   5   2   4   5   0   4   3  26   8
9       0   7   3   9   9  10  17  18  31  57 

正確率 : 0.384321223709369 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,Pclass,Title,Sex_lab,Family_IO,Ticket_IO,Embarked_lab,Fare_Bin,Age_Bin
0,3,4,1,1,0,2,0,2
1,1,5,0,1,1,0,4,7
2,3,3,0,0,0,2,1,4
3,1,5,0,1,1,2,4,6
4,3,4,1,0,0,2,1,6
5,3,4,1,0,0,1,1,4
6,1,4,1,0,1,2,4,9
7,3,2,1,1,1,2,2,0


In [164]:
# (Missing Value Imputation (MVI): Cabin)

tita_prepare['Cabin_n'] = [e[0] if not pd.isnull(e) else e for e in titanic.Cabin]
mask2 = pd.isnull(tita_prepare.Cabin_n)

# (prepare the features and labels for training)
cabin_train = tita_prepare.dropna(subset=['Cabin_n'], how='any')
cabin_feature = cabin_train.drop('Cabin_n', axis=1)
cabin_label = label.fit_transform(cabin_train.Cabin_n)

# (training)
forest_clf_cab = RandomForestClassifier(random_state=42)
forest_clf_cab.fit(cabin_feature, cabin_label)
preds_cab = forest_clf_cab.predict(cabin_feature)

print(pd.crosstab(preds_cab, cabin_label), '\n')
print('正確率 :', forest_clf_cab.score(cabin_feature, cabin_label), '\n')

# (prepare the features for the prediction)
cab_preds = tita_prepare[tita_prepare['Cabin_n'].isnull()]
cab_pred_feature = cab_preds.drop('Cabin_n', axis=1)
preds_cab = forest_clf_cab.predict(cab_pred_feature)

# (利用 model 預測遺失的 Cabin_n)
tita_prepare.Cabin_n = label.fit_transform(tita_prepare['Cabin_n'])
preds_cab_nan = forest_clf_cab.predict(tita_prepare[mask2].drop('Cabin_n', axis=1))

# (將 Encoded 的 Nan 換成訓練出來的 labels)
n = 0
for s in range(len(tita_prepare)):
    if tita_prepare.Cabin_n.iloc[s] == 8:
        tita_prepare.Cabin_n.iloc[s] = preds_cab_nan[n]
        n += 1

tita_prepare.head(8)

col_0   0   1   2   3   4   5  6  7
row_0                              
0      14   2   3   0   0   0  0  0
1       1  49  16   5   2   0  0  0
2       6  10  70  16  10   0  0  0
3       0   2   1  23   2   0  0  0
4       1   2   4   2  27   0  0  1
5       0   0   0   0   0  21  0  0
6       0   0   0   0   0   0  5  0 

正確率 : 0.7084745762711865 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,Pclass,Title,Sex_lab,Family_IO,Ticket_IO,Embarked_lab,Fare_Bin,Age_Bin,Cabin_n
0,3,4,1,1,0,2,0,2,5
1,1,5,0,1,1,0,4,7,2
2,3,3,0,0,0,2,1,4,4
3,1,5,0,1,1,2,4,6,2
4,3,4,1,0,0,2,1,6,4
5,3,4,1,0,0,1,1,4,5
6,1,4,1,0,1,2,4,9,4
7,3,2,1,1,1,2,2,0,4


In [165]:
tita_prepare.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Pclass        1309 non-null   int64
 1   Title         1309 non-null   int32
 2   Sex_lab       1309 non-null   int32
 3   Family_IO     1309 non-null   int64
 4   Ticket_IO     1309 non-null   int64
 5   Embarked_lab  1309 non-null   int32
 6   Fare_Bin      1309 non-null   int32
 7   Age_Bin       1309 non-null   int32
 8   Cabin_n       1309 non-null   int32
dtypes: int32(6), int64(3)
memory usage: 71.6 KB


### Training

In [169]:
train_feature = tita_prepare.iloc[:891]
train_label = label.fit_transform(titanic.Survived.iloc[:891])

forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(train_feature, train_label)
preds = forest_clf.predict(train_feature)

print(pd.crosstab(preds, train_label), '\n')
print('正確率 :', forest_clf.score(train_feature, train_label), '\n')


col_0    0    1
row_0          
0      525   61
1       24  281 

正確率 : 0.9046015712682379 



In [170]:
# (利用 model 預測 Survived)

test_feature = tita_prepare.iloc[891:]
preds_survived = forest_clf.predict(test_feature)

submission = pd.DataFrame({'PassengerId':tita_test.PassengerId, 
                           'Survived':preds_survived})

submission.to_csv(r'Data\submission_v3.0_.csv', index=False)

submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# (分層抽樣)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(titanic, titanic['Sex']):
    strat_train_set = titanic.iloc[train_index]
    strat_test_set = titanic.iloc[test_index]