In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Numpy

## ndarray object: an n-dimensional array of homogeneous data types

### 1. NumPy arrays have a fixed size. Modifying the size means creating a new array. 
### 2. NumPy arrays must be of the same data type

### Make numpy array from Python List

In [44]:
x = [2, 3, 1, 0]
print(x, type(x))
lst = np.array(x)
print(lst, type(lst))

[2, 3, 1, 0] <class 'list'>
[2 3 1 0] <class 'numpy.ndarray'>


### Make numpy array from Python Tuple

In [45]:
x = (2, 3, 1, 0)
print(x, type(x))
lst = np.array(x)
print(lst, type(lst))

(2, 3, 1, 0) <class 'tuple'>
[2 3 1 0] <class 'numpy.ndarray'>


#### Data type casting from float to int

##### Truncated

In [35]:
np.int_([1,2,4.9])

array([1, 2, 4])

##### Rounded

In [34]:
np.int_(np.round([1,2,4.9]))

array([1, 2, 5])

#### Data type casting from int to float

In [36]:
np.float_([1,2,3])

array([1., 2., 3.])

### Create numpy array from range x to y(exclusive) by z

In [39]:
np.arange(10, 20)

array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

In [49]:
np.arange(10, 20, 2)

array([10, 12, 14, 16, 18])

### Create numpy array of zeros of shape (m,n)

In [47]:
np.zeros((2, 3))

array([[0., 0., 0.],
       [0., 0., 0.]])

### Create numpy array of ones of shape (m,n)

In [48]:
np.ones((2, 3))

array([[1., 1., 1.],
       [1., 1., 1.]])

### Create numpy array with n number of elements equally spaced between a and b (inclusive)

In [51]:
np.linspace(1., 4., 6)

array([1. , 1.6, 2.2, 2.8, 3.4, 4. ])

### Create numpy array of shape (m,n) with random values between [0,1)

In [52]:
np.random.random((2,3))

array([[0.83676739, 0.94107149, 0.66188436],
       [0.04676811, 0.07186612, 0.16518405]])

### Create numpy array of shape (m,n) filled with value x

In [60]:
np.full((2,3), 8)

array([[8, 8, 8],
       [8, 8, 8]])

### Create an identity Matrix of shape (n, n)

In [61]:
np.eye(5)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

### Reshape a numpy array to be (m,n)

In [53]:
np.arange(9).reshape(3,3)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

### Display shape of numpy array

In [69]:
lst.reshape(2,2).shape

(2, 2)

### Display total number of elements in array

In [62]:
lst.size

4

### Convert numpy array to Python list (row-wise)

In [70]:
lst

array([0, 1, 2, 3])

In [63]:
lst.tolist()

[2, 3, 1, 0]

### Sort a numpy array

In [67]:
lst.sort
lst

array([0, 1, 2, 3])

### Transpose a numpy array

In [72]:
arr = np.random.random((2,3))
arr

array([[0.68131404, 0.48619011, 0.21336938],
       [0.7897404 , 0.97105652, 0.99595145]])

In [73]:
arr.T

array([[0.68131404, 0.7897404 ],
       [0.48619011, 0.97105652],
       [0.21336938, 0.99595145]])

# Pandas Series

### Essentially a numpy array that can store any heterogeneous data types.
#### 1. Contains index label and value

In [188]:
ser = pd.Series(['blue', 7, 'yellow'], index=['HANNAH', 2, 4])
ser

HANNAH      blue
2              7
4         yellow
dtype: object

### Display Series as numpy array

In [189]:
ser.values

array(['blue', 7, 'yellow'], dtype=object)

# Pandas Dataframe

## Creating a Dataframe from scratch

### Create Dataframe from dictionary:

In [15]:
df = pd.DataFrame({'id': [1, 2, 3, 4, 5],
                   'name': ['Alice', 'Bob', None, 'Chris', None]})
df

Unnamed: 0,id,name
0,1,Alice
1,2,Bob
2,3,
3,4,Chris
4,5,


### Create Dataframe from Numpy Array

#### Explicit

In [16]:
data = np.array([[5.8, 2.8], [6.0, 2.2]])
data

array([[5.8, 2.8],
       [6. , 2.2]])

In [17]:
dataset = pd.DataFrame({'Column1': data[:, 0], 'Column2': data[:, 1]})
dataset

Unnamed: 0,Column1,Column2
0,5.8,2.8
1,6.0,2.2


#### Implicit

In [24]:
data = np.array([[5.8, 2.8, 7.9, 82], [6.0, 2.2, 43, 54], [8.9, 2.2, 43, 54]])
data

array([[ 5.8,  2.8,  7.9, 82. ],
       [ 6. ,  2.2, 43. , 54. ],
       [ 8.9,  2.2, 43. , 54. ]])

In [25]:
pd.DataFrame(data=data[1:,1:],    # values
             index=data[1:,0],    # 1st column as index
             columns=data[0,1:])  # 1st row as the column names

Unnamed: 0,2.8,7.9,82.0
6.0,2.2,43.0,54.0
8.9,2.2,43.0,54.0


## Reading Data from file

In [80]:
df = pd.read_csv("titanic.csv")

### Additional functions include:
#### pd.read_csv()
#### pd.read_excel()
#### pd.read_json()
#### pd.read_sql()

#### To save a dataframe locally as a csv file:
##### df.to_csv('name.csv')

## Preview the Dataframe

### First 5 rows

In [81]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Last 5 rows

In [82]:
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


### List of column names

In [83]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

### Name, range, and step size of index

In [84]:
df.index

RangeIndex(start=0, stop=891, step=1)

### Data types of each column

In [85]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### Shape of dataframe (m x n) [rows by columns]

In [86]:
df.shape

(891, 12)

### Describe boxplot stats for numerical values in dataframe

In [87]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Ask if values are Null or not

In [88]:
df.isna()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


### Select 1 Column (as Pandas Series)

In [101]:
df['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

### Select multiple Columns (as Dataframe)

In [103]:
df[['Age', 'Sex']]

Unnamed: 0,Age,Sex
0,22.0,male
1,38.0,female
2,26.0,female
3,35.0,female
4,35.0,male
...,...,...
886,27.0,male
887,19.0,female
888,,female
889,26.0,male


### Frequency for each unique entry in column

In [104]:
df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

### Select only those passengers that are male

In [114]:
df[df['Sex'] == 'male']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Find average age of passengers who survived and didn't

In [192]:
df.groupby('Survived')['Age'].mean()

Survived
0    30.626179
1    28.343690
Name: Age, dtype: float64

### Drop a column from the Dataframe

In [125]:
df2 = df.copy()
df2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [131]:
df2.drop(['PassengerId'], axis = 1)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Set the index of the dataframe to a Pandas Series

In [154]:
df2.set_index(['Survived'], inplace=True)
df2

Unnamed: 0_level_0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3,"Braund, Mr. Owen Harris",22,1,0,A/5 21171,7.2500,ffill,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,PC 17599,71.2833,C85,C
1,3,"Heikkinen, Miss. Laina",26,0,0,STON/O2. 3101282,7.9250,ffill,S
1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,113803,53.1000,C123,S
0,3,"Allen, Mr. William Henry",35,0,0,373450,8.0500,ffill,S
...,...,...,...,...,...,...,...,...,...
0,2,"Montvila, Rev. Juozas",27,0,0,211536,13.0000,ffill,S
1,1,"Graham, Miss. Margaret Edith",19,0,0,112053,30.0000,B42,S
0,3,"Johnston, Miss. Catherine Helen ""Carrie""",ffill,1,2,W./C. 6607,23.4500,ffill,S
1,1,"Behr, Mr. Karl Howell",26,0,0,111369,30.0000,C148,C


### Reset the index of the Dataframe

In [155]:
df2.reset_index()

Unnamed: 0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",22,1,0,A/5 21171,7.2500,ffill,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",26,0,0,STON/O2. 3101282,7.9250,ffill,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",35,0,0,373450,8.0500,ffill,S
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",27,0,0,211536,13.0000,ffill,S
887,1,1,"Graham, Miss. Margaret Edith",19,0,0,112053,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",ffill,1,2,W./C. 6607,23.4500,ffill,S
889,1,1,"Behr, Mr. Karl Howell",26,0,0,111369,30.0000,C148,C


### Drop all rows with missing data

In [134]:
df2.dropna()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


### Drop all columns with missing data

In [136]:
df2.dropna(axis=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.2500
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.9250
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1000
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.0500
...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,0,0,211536,13.0000
887,888,1,1,"Graham, Miss. Margaret Edith",female,0,0,112053,30.0000
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,1,2,W./C. 6607,23.4500
889,890,1,1,"Behr, Mr. Karl Howell",male,0,0,111369,30.0000


### Forward Fill missing data. (Take last observation as the missing observation)

In [140]:
df2.fillna('ffill', inplace=True)
df2

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,ffill,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,ffill,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,ffill,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13.0000,ffill,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,ffill,1,2,W./C. 6607,23.4500,ffill,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30.0000,C148,C


### Count number of nan values in each column

In [143]:
df2.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

### Rename a Column or multiple columns

In [162]:
df2.rename(columns={'Ticket':'Tick', 'Pclass': 'Passenger Class'})

Unnamed: 0_level_0,Passenger Class,Name,Age,SibSp,Parch,Tick,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3,"Braund, Mr. Owen Harris",22,1,0,A/5 21171,7.2500,ffill,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,PC 17599,71.2833,C85,C
1,3,"Heikkinen, Miss. Laina",26,0,0,STON/O2. 3101282,7.9250,ffill,S
1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,113803,53.1000,C123,S
0,3,"Allen, Mr. William Henry",35,0,0,373450,8.0500,ffill,S
...,...,...,...,...,...,...,...,...,...
0,2,"Montvila, Rev. Juozas",27,0,0,211536,13.0000,ffill,S
1,1,"Graham, Miss. Margaret Edith",19,0,0,112053,30.0000,B42,S
0,3,"Johnston, Miss. Catherine Helen ""Carrie""",ffill,1,2,W./C. 6607,23.4500,ffill,S
1,1,"Behr, Mr. Karl Howell",26,0,0,111369,30.0000,C148,C


### Apply a function to each element in a Series

In [165]:
df2['Parch'].apply(lambda x: x + 5)

Survived
0    5
1    5
1    5
1    5
0    5
    ..
0    5
1    5
0    7
1    5
0    5
Name: Parch, Length: 891, dtype: int64

### Slice the Dataframe by index (get rows 0 up to 6)

In [179]:
df2.head()

Unnamed: 0_level_0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3,"Braund, Mr. Owen Harris",22,1,0,A/5 21171,7.25,ffill,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,PC 17599,71.2833,C85,C
1,3,"Heikkinen, Miss. Laina",26,0,0,STON/O2. 3101282,7.925,ffill,S
1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,113803,53.1,C123,S
0,3,"Allen, Mr. William Henry",35,0,0,373450,8.05,ffill,S


In [180]:
df2.iloc[0:6]

Unnamed: 0_level_0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3,"Braund, Mr. Owen Harris",22,1,0,A/5 21171,7.25,ffill,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,PC 17599,71.2833,C85,C
1,3,"Heikkinen, Miss. Laina",26,0,0,STON/O2. 3101282,7.925,ffill,S
1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,113803,53.1,C123,S
0,3,"Allen, Mr. William Henry",35,0,0,373450,8.05,ffill,S
0,3,"Moran, Mr. James",ffill,0,0,330877,8.4583,ffill,Q


### Slice the Dataframe by index value (get all rows where the index (Survived) is 0)

In [181]:
df2.loc[0]

Unnamed: 0_level_0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3,"Braund, Mr. Owen Harris",22,1,0,A/5 21171,7.2500,ffill,S
0,3,"Allen, Mr. William Henry",35,0,0,373450,8.0500,ffill,S
0,3,"Moran, Mr. James",ffill,0,0,330877,8.4583,ffill,Q
0,1,"McCarthy, Mr. Timothy J",54,0,0,17463,51.8625,E46,S
0,3,"Palsson, Master. Gosta Leonard",2,3,1,349909,21.0750,ffill,S
...,...,...,...,...,...,...,...,...,...
0,3,"Sutehall, Mr. Henry Jr",25,0,0,SOTON/OQ 392076,7.0500,ffill,S
0,3,"Rice, Mrs. William (Margaret Norton)",39,0,5,382652,29.1250,ffill,Q
0,2,"Montvila, Rev. Juozas",27,0,0,211536,13.0000,ffill,S
0,3,"Johnston, Miss. Catherine Helen ""Carrie""",ffill,1,2,W./C. 6607,23.4500,ffill,S


### Check a condition on a column

In [184]:
df2['Fare'] > 8

Survived
0    False
1     True
1    False
1     True
0     True
     ...  
0     True
1     True
0     True
1     True
0    False
Name: Fare, Length: 891, dtype: bool

### Subset the dataframe based on a condition

In [185]:
df2[df2['Fare'] > 8]

Unnamed: 0_level_0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,PC 17599,71.2833,C85,C
1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,113803,53.1000,C123,S
0,3,"Allen, Mr. William Henry",35,0,0,373450,8.0500,ffill,S
0,3,"Moran, Mr. James",ffill,0,0,330877,8.4583,ffill,Q
0,1,"McCarthy, Mr. Timothy J",54,0,0,17463,51.8625,E46,S
...,...,...,...,...,...,...,...,...,...
0,3,"Rice, Mrs. William (Margaret Norton)",39,0,5,382652,29.1250,ffill,Q
0,2,"Montvila, Rev. Juozas",27,0,0,211536,13.0000,ffill,S
1,1,"Graham, Miss. Margaret Edith",19,0,0,112053,30.0000,B42,S
0,3,"Johnston, Miss. Catherine Helen ""Carrie""",ffill,1,2,W./C. 6607,23.4500,ffill,S


### If you ever need information about a function that you don't know do this:

In [167]:
pd.concat?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mobjs[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mIterable[0m[0;34m[[0m[0mUnion[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'DataFrame'[0m[0;34m)[0m[0;34m,[0m [0mForwardRef[0m[0;34m([0m[0;34m'Series'[0m[0;34m)[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mMapping[0m[0;34m[[0m[0mUnion[0m[0;34m[[0m[0mHashable[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m[0;34m,[0m [0mUnion[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'DataFrame'[0m[0;34m)[0m[0;34m,[0m [0mForwardRef[0m[0;34m([0m[0;34m'Series'[0m[0;34m)[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mjoin[0m[0;34m=[0m[0;34m'outer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mignore_index[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0