In [1]:
import pandas as pd

# Load the passenger records from "titanic_survival.csv" (path is relative to
# the notebook's working directory) into a DataFrame used by every later cell.
titanic_survival = pd.read_csv("titanic_survival.csv")

In [2]:
# Keep the "age" column around as its own Series; later cells reuse `age`.
age = titanic_survival["age"]

# .loc slices by index label and includes BOTH endpoints, so this shows the
# entries labelled 10 through 25.
ages_10_to_25 = age.loc[10:25]
print(ages_10_to_25)

10    47.0
11    18.0
12    24.0
13    26.0
14    80.0
15     NaN
16    24.0
17    50.0
18    32.0
19    36.0
20    37.0
21    47.0
22    26.0
23    42.0
24    29.0
25    25.0
Name: age, dtype: float64


In [3]:
# Boolean mask: True wherever the age value is missing.
age_is_null = age.isnull()

# Keep only the missing entries, then count how many there are.
age_null = age.loc[age_is_null]
age_null_count = age_null.shape[0]
print(age_null_count)

264


In [4]:
# Show the entries of the age column that are missing, keeping their original
# index labels.
print(age_null)

15     NaN
37     NaN
40     NaN
46     NaN
59     NaN
69     NaN
70     NaN
74     NaN
80     NaN
106    NaN
107    NaN
108    NaN
118    NaN
121    NaN
125    NaN
134    NaN
147    NaN
152    NaN
157    NaN
166    NaN
176    NaN
179    NaN
184    NaN
196    NaN
204    NaN
219    NaN
223    NaN
235    NaN
237    NaN
241    NaN
        ..
1212   NaN
1213   NaN
1214   NaN
1215   NaN
1216   NaN
1219   NaN
1221   NaN
1241   NaN
1242   NaN
1243   NaN
1245   NaN
1246   NaN
1247   NaN
1249   NaN
1250   NaN
1253   NaN
1255   NaN
1262   NaN
1268   NaN
1282   NaN
1283   NaN
1284   NaN
1291   NaN
1292   NaN
1293   NaN
1297   NaN
1302   NaN
1303   NaN
1305   NaN
1309   NaN
Name: age, Length: 264, dtype: float64


In [5]:
# Mean age over the passengers whose age is actually recorded.
age_is_null = titanic_survival["age"].isnull()

# Invert the mask with `~` rather than comparing against False, and select
# rows and column in a single .loc call instead of chained indexing.
good_ages = titanic_survival.loc[~age_is_null, "age"]

# Series.mean() is vectorised and yields NaN (instead of raising
# ZeroDivisionError) if no ages are left. It may differ from a Python-level
# sum()/len() in the last few floating-point digits.
mean_age = good_ages.mean()
print(mean_age)

29.8811345124283


In [6]:
# Series.mean() skips missing values by default, so no manual filtering of
# the fare column is needed here.
fare_column = titanic_survival["fare"]
mean_fare = fare_column.mean()
print(mean_fare)

33.29547928134572


In [7]:
passenger_classes = [1, 2, 3]

# Map each passenger class to the mean fare of the rows whose "pclass" equals
# that class.
fares_by_class = {
    pclass: titanic_survival.loc[titanic_survival["pclass"] == pclass, "fare"].mean()
    for pclass in passenger_classes
}

print(fares_by_class)

{1: 87.50899164086687, 2: 21.1791963898917, 3: 13.302888700564957}


In [8]:
# One row per passenger class, holding the mean of "age" for that class.
# "mean" is pivot_table's default aggregation; it is spelled out for clarity.
passenger_age = titanic_survival.pivot_table(
    values="age",
    index="pclass",
    aggfunc="mean",
)
print(passenger_age)

              age
pclass           
1.0     39.159918
2.0     29.506705
3.0     24.816367


In [9]:
# One row per passenger class, holding the mean of "survived" for that class.
# "mean" is pivot_table's default aggregation; it is spelled out for clarity.
passenger_survival = titanic_survival.pivot_table(
    values="survived",
    index="pclass",
    aggfunc="mean",
)
print(passenger_survival)

        survived
pclass          
1.0     0.619195
2.0     0.429603
3.0     0.255289


In [10]:
import numpy as np

# Per embarkation port: total of "fare" and total of "survived".
# The aggregation is named with the string "sum" rather than np.sum: recent
# pandas releases (2.1+) emit a FutureWarning for NumPy callables here and
# plan to change how they are applied, whereas "sum" is stable.
port_stats = titanic_survival.pivot_table(
    index="embarked",
    values=["fare", "survived"],
    aggfunc="sum",
)
print(port_stats)

                fare  survived
embarked                      
C         16830.7922     150.0
Q          1526.3085      44.0
S         25033.3862     304.0


In [11]:
# Drop every row that is missing a value in either of these columns. dropna
# works on rows by default and returns a new frame, so titanic_survival itself
# is left unchanged and the surviving rows keep their original index labels.
required_columns = ["age", "sex"]
new_titanic_survival = titanic_survival.dropna(subset=required_columns)
new_titanic_survival

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1.0,1.0,"Anderson, Mr. Harry",male,48.0000,0.0,0.0,19952,26.5500,E12,S,3,,"New York, NY"
6,1.0,1.0,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1.0,0.0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1.0,0.0,"Andrews, Mr. Thomas Jr",male,39.0000,0.0,0.0,112050,0.0000,A36,S,,,"Belfast, NI"
8,1.0,1.0,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2.0,0.0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1.0,0.0,"Artagaveytia, Mr. Ramon",male,71.0000,0.0,0.0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


In [12]:
# (rows, columns) of the frame that remains after dropping incomplete rows.
print(new_titanic_survival.shape)

(1046, 14)


In [13]:
# Position-based selection: the first ten rows, then the row at position 4
# (the fifth row), regardless of what the index labels are.
first_ten_rows = new_titanic_survival.head(10)
row_position_fifth = new_titanic_survival.iloc[4]

# Label-based selection: the row whose index label is 25.
row_index_25 = new_titanic_survival.loc[25]

In [14]:
# Label-based scalar lookups: (index label, column name).
row_index_25_survived = new_titanic_survival.loc[25, "survived"]
row_index_1100_age = new_titanic_survival.loc[1100, "age"]

# Position-based 2-D slice: first five rows, first three columns.
five_rows_three_cols = new_titanic_survival.iloc[:5, :3]

In [15]:
# Value of the "survived" column for the row labelled 25.
print(row_index_25_survived)

0.0


In [16]:
# First five rows and first three columns, selected by position.
print(five_rows_three_cols)

   pclass  survived                                             name
0     1.0       1.0                    Allen, Miss. Elisabeth Walton
1     1.0       1.0                   Allison, Master. Hudson Trevor
2     1.0       0.0                     Allison, Miss. Helen Loraine
3     1.0       0.0             Allison, Mr. Hudson Joshua Creighton
4     1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)


In [17]:
# Renumber the rows 0..n-1. drop=True discards the old index labels instead of
# keeping them as an extra column.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)

# Preview: first five rows, first three columns (by position).
titanic_reindexed.iloc[:5, :3]

Unnamed: 0,pclass,survived,name
0,1.0,1.0,"Allen, Miss. Elisabeth Walton"
1,1.0,1.0,"Allison, Master. Hudson Trevor"
2,1.0,0.0,"Allison, Miss. Helen Loraine"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)"


In [18]:
def null_count(column):
    """Return how many entries of a one-dimensional column are missing.

    Parameters
    ----------
    column : pandas.Series or other 1-D array-like (list, NumPy array)
        The values to inspect. Entries that ``pd.isnull`` treats as missing
        (for example ``None`` and ``NaN``) are counted.

    Returns
    -------
    int
        Number of missing entries; 0 for an empty column.
    """
    # Summing the boolean mask counts the True entries directly, without
    # building a filtered copy of the column just to measure its length.
    return int(pd.isnull(column).sum())

In [19]:
# apply() with axis=0 (its default) hands each column to null_count in turn,
# producing one missing-value count per column.
column_null_count = titanic_survival.apply(null_count, axis=0)
print(column_null_count)

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64


In [20]:
def generate_age_label(row):
    """Classify one passenger record by the value stored under ``"age"``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must support ``row["age"]``.

    Returns
    -------
    str
        ``"Unknown"`` when the age is missing, ``"minor"`` when it is below
        18, otherwise ``"adult"``.
    """
    passenger_age = row["age"]

    # Handle missing ages first: NaN compares False against any number, so it
    # would otherwise fall through to the "adult" label.
    if pd.isnull(passenger_age):
        return "Unknown"

    return "minor" if passenger_age < 18 else "adult"

In [21]:
# axis="columns" (same as axis=1) makes apply() call the function once per
# row, giving one label per passenger.
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
age_labels

0         adult
1         minor
2         minor
3         adult
4         adult
5         adult
6         adult
7         adult
8         adult
9         adult
10        adult
11        adult
12        adult
13        adult
14        adult
15      Unknown
16        adult
17        adult
18        adult
19        adult
20        adult
21        adult
22        adult
23        adult
24        adult
25        adult
26        adult
27        adult
28        adult
29        adult
         ...   
1280      adult
1281      adult
1282    Unknown
1283    Unknown
1284    Unknown
1285      adult
1286      adult
1287      adult
1288      adult
1289      adult
1290      adult
1291    Unknown
1292    Unknown
1293    Unknown
1294      adult
1295      adult
1296      adult
1297    Unknown
1298      adult
1299      adult
1300      minor
1301      adult
1302    Unknown
1303    Unknown
1304      minor
1305    Unknown
1306      adult
1307      adult
1308      adult
1309    Unknown
Length: 1310, dtype: object

In [22]:
# Store the per-passenger labels as a new column so they can be used as a
# grouping key. Note that this adds a column to titanic_survival itself.
titanic_survival["age_labels"] = age_labels

# Mean of "survived" within each age label ("mean" is the default aggregation,
# spelled out here for clarity).
age_group_survival = titanic_survival.pivot_table(
    values="survived",
    index="age_labels",
    aggfunc="mean",
)

print(age_group_survival)

            survived
age_labels          
Unknown     0.277567
adult       0.387892
minor       0.525974
