In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# This magic command makes charts render inline, directly below the cell that produces them
%matplotlib inline
# Default size for every figure in this notebook, given as [width, height] in inches
plt.rcParams['figure.figsize'] = [10,6]

In [2]:
# Load the mock treatment-start records from CSV into a dataframe.
# The path is relative, so the file must be present in the notebook's working directory.
tx = pd.read_csv('./mock_treatment_starts_2016.csv')

In [3]:
# Evaluating the bare name of a dataframe as the last line of a cell displays the dataframe.
# If there are too many rows, Jupyter shows only the top few and bottom few rows,
# with a "..." in between to indicate that rows were left out.
tx

Unnamed: 0,PatientID,TreatmentStart,Drug,Dosage
0,PT1,1/14/16,Cisplatin,200
1,PT20,1/2/16,Cisplatin,140
2,PT2,1/10/16,Cisplatin,180
3,PT3,1/24/16,Cisplatin,140
4,PT4,2/14/16,Cisplatin,200
5,PT19,2/10/16,Cisplatin,180
6,PT5,2/6/16,Cisplatin,190
7,PT6,3/1/16,Cisplatin,180
8,PT7,3/1/16,Cisplatin,210
9,PT8,3/19/16,Cisplatin,180


In [4]:
# The head() function shows the first n rows in a dataframe; without an argument, n defaults to 5.
tx.head()

Unnamed: 0,PatientID,TreatmentStart,Drug,Dosage
0,PT1,1/14/16,Cisplatin,200
1,PT20,1/2/16,Cisplatin,140
2,PT2,1/10/16,Cisplatin,180
3,PT3,1/24/16,Cisplatin,140
4,PT4,2/14/16,Cisplatin,200


In [5]:
# You can also use the sample() function to get n random rows in the dataframe.
# Passing random_state fixes the seed, so the same rows are drawn on every run and
# the notebook stays reproducible after "Restart Kernel & Run All".
# NOTE: sample() only works in newer versions of pandas (0.16.1 and upwards)
tx.sample(5, random_state=42)

Unnamed: 0,PatientID,TreatmentStart,Drug,Dosage
13,PT16,4/9/16,Cisplatin,160
2,PT2,1/10/16,Cisplatin,180
20,PT18,6/3/16,Nivolumab,240
1,PT20,1/2/16,Cisplatin,140
16,PT14,5/3/16,Nivolumab,240


In [6]:
# The built-in len() function gives us the number of rows in the dataframe
len(tx)

23

In [7]:
# The dtypes property of a dataframe shows the datatype of every column in the dataframe.
# "object" is pandas' catch-all dtype, typically used for strings. Note that TreatmentStart
# is reported as object, i.e. its dates have not been parsed into datetimes.
tx.dtypes

PatientID         object
TreatmentStart    object
Drug              object
Dosage             int64
dtype: object

In [8]:
# A single column can be accessed as an attribute of the dataframe (dot notation).
# The result is a pandas Series; head() limits the display to the first 5 values.
tx.PatientID.head()

0     PT1
1    PT20
2     PT2
3     PT3
4     PT4
Name: PatientID, dtype: object

In [9]:
# Check the type to confirm that selecting a single column returns a Series, not a DataFrame
type(tx.PatientID)

pandas.core.series.Series

In [10]:
# The alternative, square-bracket notation for accessing a column in a dataframe.
# Both give the same result here; some people prefer the . notation, others the [] notation.
# Only [] works when the column name is not a valid Python identifier (e.g. it contains
# a space) or clashes with an existing dataframe attribute or method.
tx['PatientID'].head()

0     PT1
1    PT20
2     PT2
3     PT3
4     PT4
Name: PatientID, dtype: object

In [12]:
# And this is how you access two (or more) columns of a dataframe at once.
# Note that this will return a dataframe again, not a series (because a series holds only one column).
# Also note the double square brackets - you're passing a *list* of column names as the argument
tx[['PatientID', 'Dosage']]

Unnamed: 0,PatientID,Dosage
0,PT1,200
1,PT20,140
2,PT2,180
3,PT3,140
4,PT4,200
5,PT19,180
6,PT5,190
7,PT6,180
8,PT7,210
9,PT8,180


In [13]:
# Access the treatment record(s) for a specific patient ID.
# The comparison builds a boolean mask (one True/False per row); .loc keeps the rows where it is True.
tx.loc[tx['PatientID'] == 'PT5']

Unnamed: 0,PatientID,TreatmentStart,Drug,Dosage
6,PT5,2/6/16,Cisplatin,190


In [14]:
# A boolean mask can also be passed straight to [] - a shorter notation that is equivalent to using .loc.
# Several conditions can be combined with & (and) or | (or); each condition must be wrapped
# in parentheses because & and | bind more tightly than ==.
tx[(tx['PatientID'] == 'PT20') & (tx['Drug'] == 'Cisplatin')]

Unnamed: 0,PatientID,TreatmentStart,Drug,Dosage
1,PT20,1/2/16,Cisplatin,140
