In [2]:
import pandas as pd

pd.__version__

'1.1.5'

# Create data frames manually

In [16]:
# Build a frame from a dict that maps each column name to its list of values
df1 = pd.DataFrame(
    data={
        'Col1': [1, 2, 3],
        'Col2': [4, 5, 6],
        'Col3': [7, 8, 9],
    },
    index=[1, 2, 3],                   # row labels (need not be integers)
    columns=['Col3', 'Col2', 'Col1'],  # picks the dict keys in this display order
)
df1

Unnamed: 0,Col3,Col2,Col1
1,7,4,1
2,8,5,2
3,9,6,3


## Read from file

In [18]:
# The file is tab-separated, so override read_csv's default comma separator
df = pd.read_csv("./data/gapminder.tsv", sep='\t')

In [3]:
# Peek at the first five rows
df.head(5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [4]:
# read_csv hands back a DataFrame
type(df)

pandas.core.frame.DataFrame

In [7]:
# Dimensions as a (n_rows, n_cols) tuple; shape is an attribute, not a method
df.shape

(1704, 6)

In [8]:
# Column labels; this is a pandas Index, not a Series
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [9]:
# Row labels; a RangeIndex here because the file was read without an index column
df.index

RangeIndex(start=0, stop=1704, step=1)

In [10]:
# Returns the underlying data as a NumPy array. The columns have mixed dtypes,
# so the result is an object array. to_numpy() is the accessor the pandas
# documentation recommends over the older .values attribute.
df.to_numpy()

array([['Afghanistan', 'Asia', 1952, 28.801, 8425333, 779.4453145],
       ['Afghanistan', 'Asia', 1957, 30.331999999999997, 9240934,
        820.8530296],
       ['Afghanistan', 'Asia', 1962, 31.997, 10267083, 853.1007099999999],
       ...,
       ['Zimbabwe', 'Africa', 1997, 46.809, 11404948, 792.4499602999999],
       ['Zimbabwe', 'Africa', 2002, 39.989000000000004, 11926563,
        672.0386227000001],
       ['Zimbabwe', 'Africa', 2007, 43.486999999999995, 12311143,
        469.70929810000007]], dtype=object)

In [12]:
# DataFrame schema: the dtype of each column (string columns show up as object)
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [13]:
# Data profile: index range, per-column non-null counts and dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max);
# only the numeric columns are included by default
df.describe()

Unnamed: 0,year,lifeExp,pop,gdpPercap
count,1704.0,1704.0,1704.0,1704.0
mean,1979.5,59.474439,29601210.0,7215.327081
std,17.26533,12.917107,106157900.0,9857.454543
min,1952.0,23.599,60011.0,241.165877
25%,1965.75,48.198,2793664.0,1202.060309
50%,1979.5,60.7125,7023596.0,3531.846989
75%,1993.25,70.8455,19585220.0,9325.462346
max,2007.0,82.603,1318683000.0,113523.1329


In [14]:
# Sub-setting with a single column label returns a Series
df['country']

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [6]:
# Sub-setting with a list of labels (even a one-item list) returns a DataFrame
df[['country']].head()

Unnamed: 0,country
0,Afghanistan
1,Afghanistan
2,Afghanistan
3,Afghanistan
4,Afghanistan


# Use loc over iloc

In [28]:
# Sub-setting rows by index label
df.loc[[0, 1, 2, 3, 4, 5, 6, 7]] # or .loc[0:7]; label slices include the end label

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439
7,Afghanistan,Asia,1987,40.822,13867957,852.395945


In [29]:
# Sub-setting rows by integer position
df.iloc[[0, 1, 2, 3, 4, 5, 6, 7]] # or .iloc[0:8]; positional slices exclude the stop

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439
7,Afghanistan,Asia,1987,40.822,13867957,852.395945


In [35]:
# Subset rows and cols by label; the :5 label slice includes row 5 (six rows)
df.loc[:5,['country', 'continent', 'year']] # positional equivalent: iloc[:6, [0, 1, 2]]

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972
5,Afghanistan,Asia,1977


In [37]:
# Subset rows and cols by position; both slices exclude their stop value
df.iloc[:7,0:3] # same as iloc[[0, 1, 2, 3, 4, 5, 6], [0, 1, 2]]

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972
5,Afghanistan,Asia,1977
6,Afghanistan,Asia,1982


In [9]:
# Subset rows with a boolean mask; the bare ':' keeps every column
df.loc[df.country == 'Zimbabwe', :]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1692,Zimbabwe,Africa,1952,48.451,3080907,406.884115
1693,Zimbabwe,Africa,1957,50.469,3646340,518.764268
1694,Zimbabwe,Africa,1962,52.358,4277736,527.272182
1695,Zimbabwe,Africa,1967,53.995,4995432,569.795071
1696,Zimbabwe,Africa,1972,55.635,5861135,799.362176
1697,Zimbabwe,Africa,1977,57.674,6642107,685.587682
1698,Zimbabwe,Africa,1982,60.363,7636524,788.855041
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996


# Filtering

In [4]:
# Load the scientists table (comma-separated, so read_csv's defaults apply)
scientists = pd.read_csv('./data/scientists.csv')
scientists.head(5)

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist


In [8]:
# The list selector ['Age'] makes `ages` a one-column DataFrame, so ages.mean()
# returns a one-element Series (Series.mean() would return a scalar instead).
ages = scientists.loc[scientists.Age > 50, ['Age']] # pass 'Age' (no list) to get a Series
# Combine conditions with the bitwise operators (&, |, ~), not and/or/not.
# Masking a DataFrame with a boolean DataFrame keeps its shape and puts NaN in
# every cell where the mask is False, rather than dropping rows.
ages[(ages > ages.mean()) & ~(ages > 75)]

Unnamed: 0,Age
1,
2,
3,
4,
7,


In [11]:
# Boolean-mask filtering: keep the scientists older than the mean age,
# showing only the Name and Age columns
older_than_mean = scientists['Age'] > scientists['Age'].mean()
scientists.loc[older_than_mean, ['Name', 'Age']]

# without the column list, every column comes back:
# scientists[scientists.Age > scientists.Age.mean()]

Unnamed: 0,Name,Age
1,William Gosset,61
2,Florence Nightingale,90
3,Marie Curie,66
7,Johann Gauss,77


## Data conversion

In [20]:
# Parse the Born strings (e.g. '1920-07-25') into datetime64 values;
# the explicit format matches the year-month-day layout of the column
born_datetime = pd.to_datetime(scientists.Born, format='%Y-%m-%d')
born_datetime

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]

# Split column names that encode multiple values into separate columns

In [12]:
# Wide-format Ebola counts: one Cases_<country> / Deaths_<country> column per country
ebola = pd.read_csv('./data/country_timeseries.csv')
ebola.head(5)

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,


In [13]:
# Reshape wide -> long: Date and Day stay as identifiers, and every other
# column becomes a (val_col, count) pair on its own row
ebola_melt = pd.melt(
    ebola,
    id_vars=['Date', 'Day'],
    var_name='val_col',
    value_name='count',
)

# Spot-check a single date across the melted columns
ebola_melt[ebola_melt['Date'] == '12/31/2014']

Unnamed: 0,Date,Day,val_col,count
4,12/31/2014,284,Cases_Guinea,2730.0
126,12/31/2014,284,Cases_Liberia,8115.0
248,12/31/2014,284,Cases_SierraLeone,9633.0
370,12/31/2014,284,Cases_Nigeria,
492,12/31/2014,284,Cases_Senegal,
614,12/31/2014,284,Cases_UnitedStates,
736,12/31/2014,284,Cases_Spain,
858,12/31/2014,284,Cases_Mali,
980,12/31/2014,284,Deaths_Guinea,1739.0
1102,12/31/2014,284,Deaths_Liberia,3471.0


In [14]:
# Single assignment: split e.g. 'Cases_Guinea' on '_' and assign one piece per
# statement. .str.get(0) and .str[1] are two spellings of the same element access.
ebola_melt['cases'] = ebola_melt.val_col.str.split('_').str.get(0)
ebola_melt['country'] = ebola_melt.val_col.str.split('_').str[1]
# Fixed random_state so the same five rows are shown on every re-run
ebola_melt.sample(5, random_state=42)

Unnamed: 0,Date,Day,val_col,count,cases,country
1090,3/31/2014,9,Deaths_Guinea,80.0,Deaths,Guinea
1116,11/22/2014,245,Deaths_Liberia,3016.0,Deaths,Liberia
621,12/14/2014,267,Cases_UnitedStates,,Cases,UnitedStates
1758,9/19/2014,181,Deaths_Spain,,Deaths,Spain
1727,11/18/2014,241,Deaths_Spain,0.0,Deaths,Spain


In [15]:
# Multiple assignment: with expand=True, str.split returns a DataFrame with one
# column per piece, which can be assigned to several target columns at once
ebola_melt[['cases', 'country']] = ebola_melt.val_col.str.split('_', expand=True)
# Fixed random_state so the same five rows are shown on every re-run
ebola_melt.sample(5, random_state=42)

Unnamed: 0,Date,Day,val_col,count,cases,country
60,8/20/2014,151,Cases_Guinea,607.0,Cases,Guinea
1289,7/30/2014,129,Deaths_SierraLeone,252.0,Deaths,SierraLeone
65,8/9/2014,140,Cases_Guinea,506.0,Cases,Guinea
1487,11/10/2014,233,Deaths_Senegal,,Deaths,Senegal
369,1/2/2015,286,Cases_Nigeria,,Cases,Nigeria


# Group calculations

In [40]:
# Get a random sample of five rows; random_state pins the draw so that
# re-running the notebook shows the same rows
df.sample(5, random_state=42)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1556,Trinidad and Tobago,Americas,1992,69.862,1183669,7370.990932
19,Albania,Europe,1987,72.0,3075321,3738.932735
852,Kuwait,Asia,1952,55.565,160000,108382.3529
1245,Portugal,Europe,1997,75.97,10156415,17641.03156
640,Haiti,Americas,1972,48.042,4698301,1654.456946


In [50]:
# groupby('year') builds the groups; selecting ['lifeExp'] and calling mean()
# returns a Series of per-year means, indexed by year
df.groupby('year')['lifeExp'].mean() # or .lifeExp.mean()

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [60]:
# Grouping by several keys and aggregating returns a DataFrame whose index is
# a MultiIndex built from the group keys (year, continent).
# The aggregate is applied to each selected column for each group.
# Several columns must be selected with a list (note the double brackets):
# passing bare keys, as in [...]['lifeExp', 'gdpPercap'], is deprecated.
grouped = df.groupby(['year', 'continent'])[['lifeExp', 'gdpPercap']].mean()

In [None]:
# The group keys form a two-level MultiIndex (year, continent)
grouped.index

In [59]:
# First ten rows; one row per (year, continent) pair
grouped.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,lifeExp,gdpPercap
year,continent,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,Africa,39.1355,1252.572466
1952,Americas,53.27984,4079.062552
1952,Asia,46.314394,5195.484004
1952,Europe,64.4085,5661.057435
1952,Oceania,69.255,10298.08565
1957,Africa,41.266346,1385.236062
1957,Americas,55.96028,4616.043733
1957,Asia,49.318544,5787.73294
1957,Europe,66.703067,6963.012816
1957,Oceania,70.295,11598.522455


In [62]:
# Flatten the multi-index: reset_index() turns the year and continent levels
# back into ordinary columns and restores a default integer index
flat_df = grouped.reset_index()
flat_df.head()

Unnamed: 0,year,continent,lifeExp,gdpPercap
0,1952,Africa,39.1355,1252.572466
1,1952,Americas,53.27984,4079.062552
2,1952,Asia,46.314394,5195.484004
3,1952,Europe,64.4085,5661.057435
4,1952,Oceania,69.255,10298.08565
