In [4]:
# to make the .py script runnable
#!/usr/bin/env python

In [5]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('ggplot')

In [6]:
import os

In [7]:
pd.DataFrame?

# 2.4 DataFrames

Dataframes can be considered as a group of Series, sharing an index. This makes it a 2D structure, not unlike an R dataframe or a funky Excel sheet. They are the basic structure in which we will store data.

Each column of a DataFrame is itself a Series. Each column can therefore only contain objects of one fixed type (e.g. a column of integers). The name of the Series is the name of the column. As such, all functionality of Pandas Series will also work on the columns of Pandas DataFrames.

### 2.4.1 Loading a DataFrame and looking around

We will load and take a look at a DataFrame that contains (real!) info on the passengers of the Titanic.

In [8]:
df_titanic = pd.read_csv('data/titanic.csv')

In [9]:
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


You can clearly see the 2D structure, with column names at the top and an index (shared by all columns) to the left. 

The titanic data entails 12 recorded variables. 
- PassengerID: the passengernumber
- survival: Survival(0 = No; 1 = Yes)            
- pclass: Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
- name: Name
- sex: Sex
- age: Age
- sibsp: Number of Siblings/Spouses Aboard
- parch: Number of Parents/Children Aboard
- ticket: Ticket Number
- fare: Passenger Fare
- cabin: Cabin
- embarked:  Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)



This DataFrame has a manageable size, but often you want to take a peek and summarize without printing the whole object.

In [10]:
df_titanic.head(5) #the first 5 rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
df_titanic.tail(5) #the last 5 rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [12]:
df_titanic.sample(5) # 5 random rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
379,380,0,3,"Gustafsson, Mr. Karl Gideon",male,19.0,0,0,347069,7.775,,S
435,436,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0,B96 B98,S
855,856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.35,,S
676,677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S
319,320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corn...",female,40.0,1,1,16966,134.5,E34,C


In [13]:
df_titanic.shape # the dimensions, 891 rows, 12 columns

(891, 12)

In [14]:
df_titanic.size # number of cells = number of rows * number of columns

10692

In [15]:
df_titanic.columns # the name of the columns. Note that this is an 'index' not a list

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [16]:
df_titanic.index # the used shared index, i.e.,the rownames

RangeIndex(start=0, stop=891, step=1)

In [17]:
df_titanic.info() # a summary of the above (dimension, columnsnames, size...). 
#Note that each column contains objects of one type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [18]:
df_titanic.describe() # some summary descriptives. 
#Note that these are only calculated for columns with objects of a numerical type

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## 2.4.2 Columns and index

In [19]:
# a column is a Series
print('The type of a column is: '+str(type(df_titanic.Age)))
# It is also an attribute of the DataFrame
df_titanic.Age

The type of a column is: <class 'pandas.core.series.Series'>


0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
5       NaN
6      54.0
7       2.0
8      27.0
9      14.0
10      4.0
11     58.0
12     20.0
13     39.0
14     14.0
15     55.0
16      2.0
17      NaN
18     31.0
19      NaN
20     35.0
21     34.0
22     15.0
23     28.0
24      8.0
25     38.0
26      NaN
27     19.0
28      NaN
29      NaN
       ... 
861    21.0
862    48.0
863     NaN
864    24.0
865    42.0
866    27.0
867    31.0
868     NaN
869     4.0
870    26.0
871    47.0
872    33.0
873    47.0
874    28.0
875    15.0
876    20.0
877    19.0
878     NaN
879    56.0
880    25.0
881    33.0
882    22.0
883    28.0
884    25.0
885    39.0
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

The column names and the index are both of the 'Index' type. An Index is immutable (you cannot change its elements in place, much like a tuple). To change a column name (or an index label), you need to assign a complete new list of column names (or index labels). For the index, you can also pick an existing column.

In [20]:
print(df_titanic.columns[6])
new_col_names=list(df_titanic.columns)
new_col_names[6]='SiblingSpouses'
df_titanic.columns=new_col_names
df_titanic.columns

SibSp


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age',
       'SiblingSpouses', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [21]:
df_titanic.set_index('Name') # Returns a new DataFrame indexed by 'Name'. Without inplace=True (or re-assignment) df_titanic itself is left unchanged.

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SiblingSpouses,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1000,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S
"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q
"McCarthy, Mr. Timothy J",7,0,1,male,54.0,0,0,17463,51.8625,E46,S
"Palsson, Master. Gosta Leonard",8,0,3,male,2.0,3,1,349909,21.0750,,S
"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",9,1,3,female,27.0,0,2,347742,11.1333,,S
"Nasser, Mrs. Nicholas (Adele Achem)",10,1,2,female,14.0,1,0,237736,30.0708,,C


The default index (0 to 890) carries the same information as the 'PassengerId' column (1 to 891), so we can use the 'PassengerId' column as the index instead.

In [22]:
df_titanic.set_index('PassengerId',inplace=True) # The inplace = True is needed to permanently change the index.

In [23]:
df_titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SiblingSpouses,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 2.4.3 Subsetting

There are numerous ways of subsetting a DataFrame. The easiest one is - I guess - the following:

In [24]:
df_titanic[:4]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SiblingSpouses,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [25]:
df_titanic['Ticket']

PassengerId
1             A/5 21171
2              PC 17599
3      STON/O2. 3101282
4                113803
5                373450
6                330877
7                 17463
8                349909
9                347742
10               237736
11              PP 9549
12               113783
13            A/5. 2151
14               347082
15               350406
16               248706
17               382652
18               244373
19               345763
20                 2649
21               239865
22               248698
23               330923
24               113788
25               349909
26               347077
27                 2631
28                19950
29               330959
30               349216
             ...       
862               28134
863               17466
864            CA. 2343
865              233866
866              236852
867       SC/PARIS 2149
868            PC 17590
869              345777
870              347742
871              349248
872 

In [26]:
df_titanic['Ticket'][5]

'373450'

But we advise not to use this, as it can give unpredictable results by confusing the index and the position.

The clearest subsetting system is the use of (as with Series)

- *.loc[ ]* for using 'named' (index/columns/conditional/...) subsetting

- *.iloc[ ]* for using positional (i for integer) subsetting

There are other ways (e.g., .query), but we will stick with *loc* and *iloc* for consistency's sake.

### 2.4.3.1 .iloc - positional indexing

In [27]:
df_titanic.iloc[5,2] #the row at position 5 and the column at position 2 (positions start at 0, so this is the sixth row and third column)

'Moran, Mr. James'

In [28]:
print(type(df_titanic.iloc[5])) #the row at position 5 (positions start at 0, so this is the sixth row). It is a Series
df_titanic.iloc[5]

<class 'pandas.core.series.Series'>


Survived                         0
Pclass                           3
Name              Moran, Mr. James
Sex                           male
Age                            NaN
SiblingSpouses                   0
Parch                            0
Ticket                      330877
Fare                        8.4583
Cabin                          NaN
Embarked                         Q
Name: 6, dtype: object

In [29]:
df_titanic.iloc[3:9,0:5]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0
5,0,3,"Allen, Mr. William Henry",male,35.0
6,0,3,"Moran, Mr. James",male,
7,0,1,"McCarthy, Mr. Timothy J",male,54.0
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0


In [30]:
df_titanic.iloc[[3,6,9],[1,2,7]]

Unnamed: 0_level_0,Pclass,Name,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803
7,1,"McCarthy, Mr. Timothy J",17463
10,2,"Nasser, Mrs. Nicholas (Adele Achem)",237736


In [31]:
df_titanic.iloc[:,5] #select one column, not specifying the rows. It becomes a Series

PassengerId
1      1
2      1
3      0
4      1
5      0
6      0
7      0
8      3
9      0
10     1
11     1
12     0
13     0
14     1
15     0
16     0
17     4
18     0
19     1
20     0
21     0
22     0
23     0
24     0
25     3
26     1
27     0
28     3
29     0
30     0
      ..
862    1
863    0
864    8
865    0
866    0
867    1
868    0
869    0
870    1
871    0
872    1
873    0
874    0
875    1
876    0
877    0
878    0
879    0
880    0
881    0
882    0
883    0
884    0
885    0
886    0
887    0
888    0
889    1
890    0
891    0
Name: SiblingSpouses, Length: 891, dtype: int64

In [32]:
df_titanic.iloc[:,[5]].head(4) #select one column, by giving a list of one, not specifying the rows. 
# It is still a DataFrame, with DataFrame methods.

Unnamed: 0_level_0,SiblingSpouses
PassengerId,Unnamed: 1_level_1
1,1
2,1
3,0
4,1


As you can see a single integer gives a single column as a Series, while a list of integers (even a list with only one element) returns a DataFrame. The same logic applies to rows.

### 2.4.3.2 .loc - named indexing

In [33]:
df_titanic.loc[[1,2,5],:'Age'] #Note that this refers to the labels 1, 2 and 5 in the PassengerId index, not to positions.
#A label slice includes its end point, so :'Age' selects the five columns up to and including 'Age'.
#To get the same result with .iloc, the code would be .iloc[[0,1,4],:5]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0
5,0,3,"Allen, Mr. William Henry",male,35.0


In [34]:
df_titanic.loc[4,:] # select the row with index 4 and all columns

Survived                                                     1
Pclass                                                       1
Name              Futrelle, Mrs. Jacques Heath (Lily May Peel)
Sex                                                     female
Age                                                         35
SiblingSpouses                                               1
Parch                                                        0
Ticket                                                  113803
Fare                                                      53.1
Cabin                                                     C123
Embarked                                                     S
Name: 4, dtype: object

In [35]:
df_titanic.loc[:,'Parch']# select the Parch columns and all rows

PassengerId
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      1
9      2
10     0
11     1
12     0
13     0
14     5
15     0
16     0
17     1
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     1
26     5
27     0
28     2
29     0
30     0
      ..
862    0
863    0
864    2
865    0
866    0
867    0
868    0
869    0
870    1
871    0
872    1
873    0
874    0
875    0
876    0
877    0
878    0
879    0
880    1
881    1
882    0
883    0
884    0
885    0
886    5
887    0
888    0
889    2
890    0
891    0
Name: Parch, Length: 891, dtype: int64

In [36]:
df_titanic.loc[[24,98,75],['Sex',"Age",'Parch']]

Unnamed: 0_level_0,Sex,Age,Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24,male,28.0,0
98,male,23.0,1
75,male,32.0,0


### 2.4.3.3 Conditional subsetting

Conditional subsetting also uses the 'loc' command.

In [37]:
df_titanic.loc[:,'Survived']==1

PassengerId
1      False
2       True
3       True
4       True
5      False
6      False
7      False
8      False
9       True
10      True
11      True
12      True
13     False
14     False
15     False
16      True
17     False
18      True
19     False
20      True
21     False
22      True
23      True
24      True
25     False
26      True
27     False
28     False
29      True
30     False
       ...  
862    False
863     True
864    False
865    False
866     True
867     True
868    False
869    False
870     True
871    False
872     True
873    False
874    False
875     True
876     True
877    False
878    False
879    False
880     True
881     True
882    False
883    False
884    False
885    False
886    False
887    False
888     True
889    False
890     True
891    False
Name: Survived, Length: 891, dtype: bool

In [38]:
df_titanic.loc[df_titanic.loc[:,'Survived']==1,:] #All the surviving passengers

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SiblingSpouses,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.1000,C123,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.00,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.00,1,0,237736,30.0708,,C
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.00,1,1,PP 9549,16.7000,G6,S
12,1,1,"Bonnell, Miss. Elizabeth",female,58.00,0,0,113783,26.5500,C103,S
16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.00,0,0,248706,16.0000,,S
18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C


In [39]:
# Boolean mask: passengers with a Parch value (number of parents/children aboard) of 1 OR (|) 2
parch_is_1_or_2 = (df_titanic.loc[:, 'Parch'] == 1) | (df_titanic.loc[:, 'Parch'] == 2)
# Boolean mask: passengers that survived
did_survive = df_titanic.loc[:, 'Survived'] == 1

# The name and sex of all the passengers for which both masks hold (AND, &)
df_titanic.loc[parch_is_1_or_2 & did_survive, ['Name', "Sex"]]

Unnamed: 0_level_0,Name,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female
11,"Sandstrom, Miss. Marguerite Rut",female
44,"Laroche, Miss. Simonne Marie Anne Andree",female
59,"West, Miss. Constance Mirium",female
66,"Moubarek, Master. Gerios",male
69,"Andersson, Miss. Erna Alexandra",female
79,"Caldwell, Master. Alden Gates",male
89,"Fortune, Miss. Mabel Helen",female
98,"Greenfield, Mr. William Bertram",male
99,"Doling, Mrs. John T (Ada Julia Bone)",female


As you can see, you just need a Series/list/array of booleans to do subsetting.

In [40]:
['Miss' in x for x in df_titanic.loc[:,'Name']]

[False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,

In [41]:
df_miss=df_titanic.loc[['Miss' in x for x in df_titanic.loc[:,'Name']],:]
df_miss.head(10) # the first 10 passengers with 'Miss' in their name

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SiblingSpouses,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
23,1,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q
25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.075,,S
29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q
39,0,3,"Vander Planke, Miss. Augusta Maria",female,18.0,2,0,345764,18.0,,S
40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C


Again, there are multiple other syntaxes for this procedure; we stick with only .loc and .iloc here for consistency reasons.

## 2.4.4 Attributes, methods and functions

As all columns and rows are actually Series, all attributes, methods and functions of Series also work on the columns and rows of a DataFrame. The type of the values in a Series is fixed (the values actually form an array), so type-specific attributes and methods can be applied. An example in human lingo: a DataFrame knows that a certain column is numerical, so numerical stuff can be calculated from that column.

There are numerous functionalities contained in the DataFrame class, so the following are only a taste.

In [None]:
df_titanic.shape #the dimensions of the DataFrame

In [None]:
df_miss.loc[:,"Age"].mean() #the mean age of all passengers with 'Miss' in their name

In [None]:
miss_names_list=list(df_miss.loc[:,"Name"]) # a list of all names that contain 'Miss'
# .isin() builds the boolean mask in one vectorised call (True where the name is in the list); ~ inverts it
df_not_miss=df_titanic.loc[~df_titanic.loc[:,"Name"].isin(miss_names_list)] # all passengers with their name not in the miss list
df_not_miss.head(10) #the first ten passengers without 'Miss' in their name

In [None]:
df_not_miss.loc[:,"Age"].mean()-df_miss.loc[:,"Age"].mean() 
#people without miss in their name are on average 10 years older!

In [None]:
df_titanic.loc[:,"Pclass"].unique() #the possible values of Pclass

In [None]:
df_titanic.loc[:,"Pclass"].value_counts() # the number of people in a certain class

In [None]:
df_titanic.loc[:,"Pclass"].value_counts()/len(df_titanic.index) # the  percentage of people in a certain class

In [None]:
df_miss.loc[:,"Pclass"].value_counts()/len(df_miss.index) # the percentage of misses in a certain class

In [None]:
df_titanic.sort_values('Age') # the dataframe sorted by age. You can lock this position with an inplace = True

In [None]:
# Use ~ (not unary minus) to invert a boolean Series: -bool_series raises a TypeError with current numpy/pandas.
df_titanic.loc[~df_titanic.loc[:,'Cabin'].isnull(),:] # all passengers where the Cabin cell is NOT (~) empty

The 'groupby' method groups the rows according to the values of a specified column; a chosen function is then computed per group, so the groups can be compared.

In [None]:
df_titanic.groupby('Embarked')['Age'].mean() #compares the mean ages of all different embarkments

In [None]:
df_titanic.groupby('Survived')['Sex'].value_counts() #counts the number of males and females for survivors and non-survivors

In [None]:
df_miss.to_csv('all_the_misses') #save dataframe with the misses to csv!

## 2.4.5 Creating a DataFrame

When creating a DataFrame from scratch, you usually start from:
- A 2-dimensional array (i.e. a matrix). You can specify the index and columns.
- Dicts with lists of equal length as values.

### 2.4.5.1 from a 2D array

In [42]:
array_2d=np.arange(20, 32).reshape(3, 4)
array_2d

array([[20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [43]:
pd.DataFrame(array_2d) #without specified index and columns

Unnamed: 0,0,1,2,3
0,20,21,22,23
1,24,25,26,27
2,28,29,30,31


In [44]:
pd.DataFrame(array_2d,index=list('abc'),columns=['col_1','col_2','col_3','col_4'])

Unnamed: 0,col_1,col_2,col_3,col_4
a,20,21,22,23
b,24,25,26,27
c,28,29,30,31


### 2.4.5.2 from a dict

In [45]:
import numpy as np
import pandas as pd

In [46]:
dict_countries = {'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
        'population': [11.3, 64.3, 81.3, 16.9, 64.9],
        'area': [30510, 671308, 357050, 41526, 244820],
        'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']}

df_countries = pd.DataFrame(dict_countries, index=list('pqrst'))

df_countries

Unnamed: 0,country,population,area,capital
p,Belgium,11.3,30510,Brussels
q,France,64.3,671308,Paris
r,Germany,81.3,357050,Berlin
s,Netherlands,16.9,41526,Amsterdam
t,United Kingdom,64.9,244820,London


### 2.4.5.3 adding/dropping rows and columns

In [47]:
df_countries.loc['u',:]=["Wakanda",7,150000,'Birnin Zan']
df_countries

Unnamed: 0,country,population,area,capital
p,Belgium,11.3,30510.0,Brussels
q,France,64.3,671308.0,Paris
r,Germany,81.3,357050.0,Berlin
s,Netherlands,16.9,41526.0,Amsterdam
t,United Kingdom,64.9,244820.0,London
u,Wakanda,7.0,150000.0,Birnin Zan


In [48]:
df_countries.loc[:,'head of state']=['King','President','President','King','Queen','King/Black Panther']
df_countries

Unnamed: 0,country,population,area,capital,head of state
p,Belgium,11.3,30510.0,Brussels,King
q,France,64.3,671308.0,Paris,President
r,Germany,81.3,357050.0,Berlin,President
s,Netherlands,16.9,41526.0,Amsterdam,King
t,United Kingdom,64.9,244820.0,London,Queen
u,Wakanda,7.0,150000.0,Birnin Zan,King/Black Panther


In [49]:
df_countries.loc['u','name of regent']="T'Challa"
df_countries

Unnamed: 0,country,population,area,capital,head of state,name of regent
p,Belgium,11.3,30510.0,Brussels,King,
q,France,64.3,671308.0,Paris,President,
r,Germany,81.3,357050.0,Berlin,President,
s,Netherlands,16.9,41526.0,Amsterdam,King,
t,United Kingdom,64.9,244820.0,London,Queen,
u,Wakanda,7.0,150000.0,Birnin Zan,King/Black Panther,T'Challa


In [50]:
df_countries.drop('u') #by default the 'drop' function looks in the index. 
#Without inplace=True, the drop is not permanent: a new DataFrame is returned and df_countries keeps row 'u'

Unnamed: 0,country,population,area,capital,head of state,name of regent
p,Belgium,11.3,30510.0,Brussels,King,
q,France,64.3,671308.0,Paris,President,
r,Germany,81.3,357050.0,Berlin,President,
s,Netherlands,16.9,41526.0,Amsterdam,King,
t,United Kingdom,64.9,244820.0,London,Queen,


In [51]:
df_countries.drop('name of regent',axis=1,inplace=True) #axis = 1  refers to the columns, axis = 0 would refer to the index
# With inplace=True, the drop is permanent
df_countries

Unnamed: 0,country,population,area,capital,head of state
p,Belgium,11.3,30510.0,Brussels,King
q,France,64.3,671308.0,Paris,President
r,Germany,81.3,357050.0,Berlin,President
s,Netherlands,16.9,41526.0,Amsterdam,King
t,United Kingdom,64.9,244820.0,London,Queen
u,Wakanda,7.0,150000.0,Birnin Zan,King/Black Panther


## Try!

Let's continue with the countries DataFrame!

- How many columns are there, how many rows?
- What are the names of the columns?
- What are the names of the rows?
- What type of data does each column contain?
- Do the previous tasks in one command
- Give the population of Germany, supposing you do not know its index `r`
- Produce a dataframe of the capital and area of all countries with population more than 50.

## Solution

In [52]:
df_countries.shape

(6, 5)

In [53]:
# Check row names
df_countries.index

Index(['p', 'q', 'r', 's', 't', 'u'], dtype='object')

In [54]:
# Check column names
df_countries.columns

Index(['country', 'population', 'area', 'capital', 'head of state'], dtype='object')

In [55]:
# To check the data types of the different columns:
df_countries.dtypes

country           object
population       float64
area             float64
capital           object
head of state     object
dtype: object

In [56]:
# Overview of the data
df_countries.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, p to u
Data columns (total 5 columns):
country          6 non-null object
population       6 non-null float64
area             6 non-null float64
capital          6 non-null object
head of state    6 non-null object
dtypes: float64(2), object(3)
memory usage: 448.0+ bytes


In [57]:
df_countries

Unnamed: 0,country,population,area,capital,head of state
p,Belgium,11.3,30510.0,Brussels,King
q,France,64.3,671308.0,Paris,President
r,Germany,81.3,357050.0,Berlin,President
s,Netherlands,16.9,41526.0,Amsterdam,King
t,United Kingdom,64.9,244820.0,London,Queen
u,Wakanda,7.0,150000.0,Birnin Zan,King/Black Panther


In [58]:
#population of Germany 1
df_countries.loc[df_countries.loc[:,'country'] == 'Germany', 'population']

r    81.3
Name: population, dtype: float64

In [59]:
#population of Germany 2
df_countries.set_index('country').loc['Germany', 'population']

81.3

In [18]:
#capital and area of countries with pop over 50
df_countries.loc[df_countries.loc[:,'population']>50, ['capital', 'area']]

Unnamed: 0,capital,area
q,Paris,671308.0
r,Berlin,357050.0
t,London,244820.0


## 2.4.6 Apply functions to element/rows or columns of a DataFrame

Using 

- **`s.map()`**, apply a func to each element of a Series
- **`df.applymap()`** apply a func to each element of a DF
- **`df.apply()`** apply a func to rows/columns of a DF

***Remember: with `apply`, `axis = 0` (the default) applies the function to each column, while `axis = 1` applies it to each row.***

### 2.4.6.1 applymap

Applies a function to all cells of DataFrame.

In [None]:
arr_3 = np.random.randn(56).reshape(7, 8).round(2)

In [None]:
df_3 = pd.DataFrame(arr_3)

In [None]:
df_3

In [None]:
df_3.applymap(lambda x: 'neg' if x < 0 else 'pos')
# a function that checks if value is less than 0, then says 'neg' for negative or 'pos' for positive

In [None]:
# Named Function
def addTen(x):
    """Return ``x`` increased by 10."""
    increased = x + 10
    return increased

In [None]:
addTen(1)

In [None]:
# A named function can be handed to applymap directly; it is called once per cell.
df_3.applymap(addTen)

### 2.4.6.2 apply 

Using `.apply(func, axis=)` applies a function to each column (`axis=0`) or to each row (`axis=1`). Note that the function can return a single value (apply then returns a Series) or a Series (apply then returns a DataFrame). Its input, obviously, has to be a Series, since rows and columns are Series.

In [None]:
df_titanic.sample(5) # a reminder how df_titanic looks

In [60]:
df_titanic.loc[:,["Survived","Pclass","Age"]].apply(np.max,axis=0) #applying a numpy function

Survived     1.0
Pclass       3.0
Age         80.0
dtype: float64

In [None]:
def spread(ser):
    """Return the difference between the largest and the smallest value of ``ser``."""
    highest = np.max(ser)
    lowest = np.min(ser)
    return highest - lowest

In [None]:
df_titanic.loc[:,["Survived","Pclass","Age"]].apply(spread,axis=0) # applying a home brewed function

In [None]:
df_titanic.loc[:,["Survived","Pclass","Age"]].apply(lambda x:np.max(x)-np.min(x),axis=0) 
#applying a local lambda function

In [None]:
def summary(ser):
    """Return the maximum, minimum and spread of ``ser`` as a Series.

    The result is indexed by 'max', 'min' and 'spread'; the spread is computed
    by the ``spread`` function defined earlier in this notebook.
    """
    highest = np.max(ser)
    lowest = np.min(ser)
    values = [highest, lowest, spread(ser)]
    return pd.Series(values, index=('max', 'min', 'spread'))

In [None]:
df_titanic.loc[:,["Survived","Pclass","Age"]].apply(summary,axis=0) 
# a home brewed function returning a Series.

## Try! 

Continuing from the previous example, now create grades of three distinct courses for our ten students (invent three coursenames). Store this in a dataframe. As the fourth student got caught cheating on course 2, deduct 5 points from his grade. Produce a series containing the mean grade for each student.

In [61]:
# The ten students; the same labels index the grades of every course.
students = ['Tony','Steve','Thor','Bruce','Natasha','Clint','Pietro','Wanda','James','Vision']

# One random integer grade from 0 to 20 per student and per course
# (np.random.randint excludes the upper bound, hence 21).
grades1 = np.random.randint(0, 21, len(students))
grades1_ser = pd.Series(grades1, index=students, name='math')
grades2 = np.random.randint(0, 21, len(students))
grades2_ser = pd.Series(grades2, index=students, name='geography')
grades3 = np.random.randint(0, 21, len(students))
grades3_ser = pd.Series(grades3, index=students, name='biology')


In [62]:
# Combine the three course Series into one DataFrame:
# one column per course (named after the Series), one row per student.
course_series = (grades1_ser, grades2_ser, grades3_ser)
grades_df = pd.DataFrame({ser.name: ser for ser in course_series})

In [63]:
grades_df

Unnamed: 0,math,geography,biology
Tony,4,11,2
Steve,18,18,17
Thor,2,3,2
Bruce,10,8,4
Natasha,16,5,1
Clint,7,8,3
Pietro,5,16,13
Wanda,18,17,14
James,5,7,18
Vision,14,12,17


In [64]:
# The fourth student (row position 3) loses 5 points on the second course (column position 1).
grades_df.iloc[3, 1] -= 5
grades_df

Unnamed: 0,math,geography,biology
Tony,4,11,2
Steve,18,18,17
Thor,2,3,2
Bruce,10,3,4
Natasha,16,5,1
Clint,7,8,3
Pietro,5,16,13
Wanda,18,17,14
James,5,7,18
Vision,14,12,17


In [65]:
# Mean grade per student, expressed as a percentage of the maximum grade (20) and stored as text.
# The course columns are selected by label: positional access such as row[0] on a
# label-indexed Series is deprecated in recent pandas versions.
course_columns = ['math', 'geography', 'biology']
grades_df['TotalPCT'] = grades_df.loc[:, course_columns].apply(
    lambda row: str(round(float(row.sum()) / 3 / 20 * 100.0, 2)) + '%',  # axis=1: called once per row (student)
    axis=1)

In [66]:
grades_df

Unnamed: 0,math,geography,biology,TotalPCT
Tony,4,11,2,28.33%
Steve,18,18,17,88.33%
Thor,2,3,2,11.67%
Bruce,10,3,4,28.33%
Natasha,16,5,1,36.67%
Clint,7,8,3,30.0%
Pietro,5,16,13,56.67%
Wanda,18,17,14,81.67%
James,5,7,18,50.0%
Vision,14,12,17,71.67%
