# 1. Pandas DataFrame Basics

### Load Your First Data Set

In [4]:
import pandas as pd
# load the tab-separated Gapminder file into a DataFrame
df = pd.read_csv(
    './data/gapminder.tsv',
    sep='\t',
)

In [5]:
# confirm what kind of object read_csv handed back
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [6]:
# get the number of rows and columns
# shape is an attribute (no parentheses) holding a (rows, columns) tuple
print(df.shape)

(1704, 6)


In [9]:
# get the column names
# columns is an attribute (no parentheses); it comes back as an Index object
print(df.columns)

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')


In [10]:
# get the dtype of each column
# the text columns (country, continent) are reported with the 'object' dtype
print(df.dtypes)

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object


In [11]:
# get more information about our data
# info() writes its report to the output itself and returns None, so it is
# called without print() to avoid a stray "None" line after the report
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB
None


### Look at Columns, Rows, and Cells

In [12]:
# show the first 5 observations
# head() with no argument returns 5 rows; pass n= to ask for a different number
print(df.head())

       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106


#### Select and Subset Columns by Name

In [13]:
# just get the country column and save it to its own variable
# note: a single column name in single brackets gives a Series,
# despite the _df suffix on the variable name
country_df = df['country']

In [14]:
# show the first 5 observations of the country column
print(country_df.head())

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object


In [15]:
# show the last 5 observations of the country column
print(country_df.tail())

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object


In [16]:
# keep several columns at once by passing a list of column names
# (here: country, continent, and year)
subset = df[[
    'country',
    'continent',
    'year',
]]
print(subset)

          country continent  year
0     Afghanistan      Asia  1952
1     Afghanistan      Asia  1957
2     Afghanistan      Asia  1962
3     Afghanistan      Asia  1967
4     Afghanistan      Asia  1972
...           ...       ...   ...
1699     Zimbabwe    Africa  1987
1700     Zimbabwe    Africa  1992
1701     Zimbabwe    Africa  1997
1702     Zimbabwe    Africa  2002
1703     Zimbabwe    Africa  2007

[1704 rows x 3 columns]


Single Value Returns DataFrame or Series

In [18]:
# a single column name in single square brackets returns a Series
country_df = df['country']
print(type(country_df))

<class 'pandas.core.series.Series'>


In [20]:
# a one-item list of column names returns a DataFrame
# (the "_list" in the variable name refers to the list used for selection;
# the result itself is a DataFrame, not a Python list)
country_df_list = df[['country']] # note the double square brackets
print(type(country_df_list))

<class 'pandas.core.frame.DataFrame'>


Using Dot Notation to Pull a Column of Values

In [21]:
# using square bracket notation to pull the column out as a Series
print(df['country'])

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object


In [22]:
# using dot notation
# caution: attribute access only works when the column name is a valid Python
# identifier that does not clash with an existing DataFrame attribute or method;
# in this data set df.pop is the DataFrame.pop method, not the 'pop' column,
# so that column has to be read as df['pop']
print(df.country)

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object


#### Subset Rows

Subset Rows by Index Label - `.loc[]`

In [23]:
# get the first row
# python counts from 0
# .loc matches on the index label; this DataFrame has the default labels 0 to 1703
print(df.loc[0])

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object


In [24]:
# get the 100th row
# python counts from 0, so the 100th row carries the label 99
print(df.loc[99])

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap    721.186086
Name: 99, dtype: object


In [27]:
# get the last row
# .loc looks rows up by index label, and no row is labelled -1, so this
# raises a KeyError; catch it and show the message so the error is visible
# while the notebook still runs from top to bottom
try:
    print(df.loc[-1])
except KeyError as err:
    print(f'KeyError: {err}')

In [28]:
# get the last row (correctly)

# the length of the DataFrame is its number of rows
number_of_rows = len(df)

# labels start at 0, so the final label is one less than the row count
last_row_index = number_of_rows - 1

# look the row up by that label
print(df.loc[last_row_index])

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object


In [29]:
# there are many ways of doing what you want
# tail(n=1) also gives the last row, this time as a one-row DataFrame
print(df.tail(n=1))

       country continent  year  lifeExp       pop   gdpPercap
1703  Zimbabwe    Africa  2007   43.487  12311143  469.709298


In [30]:
# get the first row of data in different ways
subset_loc = df.loc[0]
subset_head = df.head(n=1)

# .loc with a single label returns a Series
print(type(subset_loc))

# head(n=1) returns a one-row DataFrame
print(type(subset_head))

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
