In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv(r"data\train.csv", header=0,delimiter=",", quoting=1)
# train, val = train_test_split(data)
test = pd.read_csv(r"data\test.csv", header=0,delimiter=",", quoting=1)

In [40]:
data.describe(include=["O"])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [14]:
survived = data[data["Survived"]==1]
not_survived = data[data["Survived"]==0]
print(len(survived),len(not_survived),len(data))

342 549 891


In [17]:
data.groupby("Pclass").Survived.value_counts()

Pclass  Survived
1       1           136
        0            80
2       0            97
        1            87
3       0           372
        1           119
Name: Survived, dtype: int64

In [26]:
data[["Sex","Survived"]].groupby(["Sex"],as_index=False).mean()


Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [28]:
pd.crosstab(data["Sex"],[data["Pclass"],data["Survived"]])

Pclass,1,1,2,2,3,3
Survived,0,1,0,1,0,1
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,3,91,6,70,72,72
male,77,45,91,17,300,47


In [31]:
data[["Embarked","Survived"]].groupby("Embarked").mean()

Unnamed: 0_level_0,Survived
Embarked,Unnamed: 1_level_1
C,0.553571
Q,0.38961
S,0.336957


In [38]:
data.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [35]:
data.groupby("Parch").Survived.mean()

Parch
0    0.343658
1    0.550847
2    0.500000
3    0.600000
4    0.000000
5    0.200000
6    0.000000
Name: Survived, dtype: float64

In [37]:
data.groupby("SibSp").Survived.value_counts()

SibSp  Survived
0      0           398
       1           210
1      1           112
       0            97
2      0            15
       1            13
3      0            12
       1             4
4      0            15
       1             3
5      0             5
8      0             7
Name: Survived, dtype: int64

In [42]:
data.SibSp.describe(include="all")

count    891.000000
mean       0.523008
std        1.102743
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: SibSp, dtype: float64

In [None]:
train_test_data = [data, test]
for dataset in train_test_data:
    dataset["Sex"] = dataset["Sex"].map({"female":True, "male":False})
    dataset["Embarked"] = dataset.Embarked.fillna("S")
    dataset["Embarked"] = dataset["Embarked"].map({"C":1,"Q":2,"S":3}).astype(int)
    

In [5]:
output = pd.DataFrame( data={"PassengerId":test["PassengerId"], "Survived":test["Sex"].apply(lambda x: 0 if x=="male" else 1)} )
output.to_csv("Output\\"+"gender.csv", index=False, quoting=3)

In [8]:
feats = ["Age","Sex","Parch","Pclass","SibSp","Embarked"]
d = {"male": False, "female": True, np.NaN:0, "C":1, "S":2, "Q":3}
train_y = train["Survived"]
train_x = train[feats].replace(d)
test_x = test[feats].replace(d)

In [19]:
x, val_x, y, val_y = train_test_split(train_x,train_y)
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
candidate_estimators = [5,10,25,50,100,250]
t_b = np.inf
b_e = 0
for est in candidate_estimators:
    m_n = 0
    m_score = np.inf
    scores = []
    for nodes in candidate_max_leaf_nodes:
        forest = RandomForestClassifier(max_leaf_nodes=nodes, n_estimators=est)
        forest.fit(x,y)
        score = forest.score(val_x,val_y)
        scores.append(score)
        if score<m_score:
            m_n = nodes
            m_score = score
    if m_score<t_b:
        t_b = m_score
        b_e = est
    print(est," estimators, ",m_n," nodes with score: ", m_score)
    print(scores)
print("Best config: ",b_e)

5  estimators,  5  nodes with score:  0.7757847533632287
[0.7757847533632287, 0.820627802690583, 0.8116591928251121, 0.820627802690583, 0.8026905829596412, 0.8026905829596412]
10  estimators,  100  nodes with score:  0.8026905829596412
[0.8161434977578476, 0.8251121076233184, 0.8251121076233184, 0.8026905829596412, 0.8116591928251121, 0.8161434977578476]
25  estimators,  250  nodes with score:  0.7892376681614349
[0.7982062780269058, 0.8071748878923767, 0.8251121076233184, 0.820627802690583, 0.7892376681614349, 0.8116591928251121]
50  estimators,  5  nodes with score:  0.8026905829596412
[0.8026905829596412, 0.8161434977578476, 0.8161434977578476, 0.820627802690583, 0.8161434977578476, 0.8116591928251121]
100  estimators,  5  nodes with score:  0.7937219730941704
[0.7937219730941704, 0.8251121076233184, 0.820627802690583, 0.8251121076233184, 0.820627802690583, 0.820627802690583]
250  estimators,  5  nodes with score:  0.8026905829596412
[0.8026905829596412, 0.8161434977578476, 0.829596

In [20]:
forest = RandomForestClassifier(max_leaf_nodes=5, n_estimators=5)
forest.fit(train_x,train_y)

RandomForestClassifier(max_leaf_nodes=5, n_estimators=5)

In [91]:
forest.fit(train_x,train_y)
pred_val = forest.predict(val_x)
pred_train = forest.predict(train_x)

In [92]:
from sklearn.metrics import log_loss
val_error = log_loss(val_y,pred_val)
train_error = log_loss(train_y,pred_train)
print("ValError:",val_error,"\nTrainError:",train_error)

ValError: 5.266026881654596 
TrainError: 5.429038784701824


In [21]:
result = forest.predict(test_x)

In [22]:
output = pd.DataFrame( data={"PassengerId":test["PassengerId"], "Survived":result} )
output.to_csv("Output\\"+"RF_5_5.csv", index=False, quoting=3)