In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [127]:
# Read the training set from a CSV file addressed by a relative path.
train_file_path = "train.csv"
train_data = pd.read_csv(filepath_or_buffer=train_file_path)

## Data Analysis

In [128]:
# Show the column labels of the loaded frame.
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [129]:
# Pull out the target column and preview its first five rows.
survived = train_data.Survived
survived.head(5)

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [130]:
# Class balance of the target: count the rows flagged 1 and 0 by summing the
# boolean masks, then report each as a share of all non-null target values.
alive = (survived == 1).sum()
dead = (survived == 0).sum()
print("Percentage of people alive: ", (alive / survived.count()) * 100)
print("Percentage of people dead: ", (dead / survived.count()) * 100)

Percentage of people alive:  38.38383838383838
Percentage of people dead:  61.61616161616161


In [131]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [132]:
# Remove the columns that are not used further on, one at a time and in place.
for unused_column in ("PassengerId", "Name", "Ticket", "Cabin"):
    train_data.drop(columns=unused_column, inplace=True)

In [133]:
# Replace the "Sex" labels with integer codes, then display the frame.
le = LabelEncoder()
train_data["Sex"] = le.fit_transform(train_data["Sex"])
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.9250,S
3,1,1,0,35.0,1,0,53.1000,S
4,0,3,1,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,S
887,1,1,0,19.0,0,0,30.0000,S
888,0,3,0,,1,2,23.4500,S
889,1,1,1,26.0,0,0,30.0000,C


In [134]:
# Fill any missing ports with the most frequent one *before* encoding.
# Otherwise LabelEncoder either gives NaN its own integer class (silently
# inventing a category) or raises on the mixed str/float column, depending on
# the scikit-learn version.
train_data["Embarked"] = train_data["Embarked"].fillna(
    train_data["Embarked"].mode().iloc[0]
)

# Replace the "Embarked" labels with integer codes, then display the frame.
le = LabelEncoder()
le.fit(train_data["Embarked"])
train_data["Embarked"] = le.transform(train_data["Embarked"])
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.9250,2
3,1,1,0,35.0,1,0,53.1000,2
4,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,2
887,1,1,0,19.0,0,0,30.0000,2
888,0,3,0,,1,2,23.4500,2
889,1,1,1,26.0,0,0,30.0000,0


In [135]:
# Column labels of the current frame as a plain Python list.
features = train_data.columns.tolist()
features

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [136]:
# Flag every column that holds at least one missing value and show only those.
null_cols = train_data.isna().any()
null_cols[null_cols]

Age    True
dtype: bool

In [148]:
# Impute missing ages with the column mean. The result is assigned back to the
# frame rather than calling fillna(inplace=True) on the selected column: that
# form is chained assignment, which pandas deprecates and which no longer
# updates the parent frame once Copy-on-Write is enabled.
train_data["Age"] = train_data["Age"].fillna(train_data["Age"].mean())

In [137]:
# Split the frame by survival outcome and print, for each listed column,
# how often every value occurs within each outcome group.
analysis = train_data.groupby(by="Survived")
for column in features:
    count = analysis[column].value_counts()
    print(count)

Survived  Survived
0         0           549
1         1           342
Name: Survived, dtype: int64
Survived  Pclass
0         3         372
          2          97
          1          80
1         1         136
          3         119
          2          87
Name: Pclass, dtype: int64
Survived  Sex
0         1      468
          0       81
1         0      233
          1      109
Name: Sex, dtype: int64
Survived  Age 
0         21.0    19
          28.0    18
          18.0    17
          25.0    17
          19.0    16
                  ..
1         43.0     1
          47.0     1
          53.0     1
          55.0     1
          80.0     1
Name: Age, Length: 142, dtype: int64
Survived  SibSp
0         0        398
          1         97
          2         15
          4         15
          3         12
          8          7
          5          5
1         0        210
          1        112
          2         13
          3          4
          4          3
Name: SibSp, dt

In [146]:
# Columns fed to the model as predictors.
selected_features = [
    "Pclass",
    "Sex",
    "Age",
    "Embarked",
]

In [141]:
# Predictor matrix (the selected columns) and target vector (survival flag).
X = train_data.loc[:, selected_features]
y = train_data["Survived"]

In [143]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=0,
)

In [145]:
# Fit a random forest on the training split (seeded so the result is
# repeatable). RandomForestClassifier comes from the notebook's top import
# cell rather than being imported here.
model1 = RandomForestClassifier(random_state=0)
model1.fit(train_X, train_y)

# Score the fitted model on the held-out split.
y_pred = model1.predict(test_X)
print("RandomForestClassifier Accuracy: ", accuracy_score(test_y, y_pred))

RandomForestClassifier Accuracy:  0.8097014925373134
