# Exercises on pandas Basics

## 1. Getting Started
We first import `pandas` and load a table into a DataFrame.

In [1]:
import pandas as pd

countries = pd.read_csv('large_countries_2015.csv', index_col=0)

In [2]:
%matplotlib inline

## 2. Working with DataFrames
To view the contents of a data frame, type its name:

In [3]:
countries

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
Japan,126573500.0,1.45,Asia
Mexico,127017200.0,2.13,North America
Nigeria,182202000.0,5.89,Africa
Pakistan,188924900.0,3.04,Asia
Philippines,100699400.0,2.98,Asia


## 3. Examining DataFrames
Match the Python commands with the descriptions below. 

*In Jupyter, you can move the descriptions up/down with the arrow buttons.*

In [4]:
countries.head(3)

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia


In [5]:
countries.tail(3)

Unnamed: 0,population,fertility,continent
Philippines,100699395.0,2.98,Asia
Russia,143456918.0,1.61,Europe
United States,321773631.0,1.97,North America


In [6]:
countries.describe()

Unnamed: 0,population,fertility
count,12.0,12.0
mean,375346200.0,2.4375
std,456519400.0,1.200781
min,100699400.0,1.45
25%,139347000.0,1.7375
50%,185563400.0,2.125
75%,273616300.0,2.5675
max,1376049000.0,5.89


In [7]:
countries['population'].mean()

375346161.6666667

In [8]:
countries['continent'].value_counts()

Asia             7
North America    2
South America    1
Africa           1
Europe           1
Name: continent, dtype: int64

In [9]:
countries.shape

(12, 3)

In [10]:
countries['continent'].unique()

array(['Asia', 'South America', 'North America', 'Africa', 'Europe'],
      dtype=object)

In [11]:
countries['population'] // 1000000

Bangladesh        160.0
Brazil            207.0
China            1376.0
India            1311.0
Indonesia         257.0
Japan             126.0
Mexico            127.0
Nigeria           182.0
Pakistan          188.0
Philippines       100.0
Russia            143.0
United States     321.0
Name: population, dtype: float64

#### Number of rows and columns

In [12]:
countries.shape

(12, 3)

#### Show the last 3 lines

In [13]:
countries.tail(3)

Unnamed: 0,population,fertility,continent
Philippines,100699395.0,2.98,Asia
Russia,143456918.0,1.61,Europe
United States,321773631.0,1.97,North America


#### Summarize categorical data

In [15]:
countries.describe()

Unnamed: 0,population,fertility
count,12.0,12.0
mean,375346200.0,2.4375
std,456519400.0,1.200781
min,100699400.0,1.45
25%,139347000.0,1.7375
50%,185563400.0,2.125
75%,273616300.0,2.5675
max,1376049000.0,5.89


#### Mean of a column

In [19]:
countries["fertility"].mean()

2.4374999999999996

#### Summarize all numerical columns

#### Show the first 3 lines

#### Apply a calculation to each value in a column

#### Extract distinct values

## 4. Selecting rows and columns
Match the Python commands with the descriptions below. 

In [None]:
countries.columns

In [None]:
countries.index

In [None]:
countries['continent']

In [None]:
countries[['population', 'continent']]

In [None]:
countries.loc['India']

In [None]:
countries.iloc[3:7]

In [None]:
countries[countries['population'] > 200000000]

In [None]:
countries.values

#### Extract raw data as a NumPy array

#### Select rows by slicing the index

#### Filter rows by a condition

#### Display column labels

#### Select multiple columns

#### Display row index

#### Select row by an index value

#### Select one column

## 5. Summarizing Data
Match the Python commands with the descriptions below. 

In [None]:
countries['fertility'].cumsum()

In [None]:
countries.groupby('continent')['population'].sum()

In [None]:
countries.sort_values(by=['continent', 'fertility'])

In [None]:
def get_initial(s):
    return s[0]

countries['initial'] = countries['continent'].apply(get_initial)
countries

In [None]:
countries.stack()

In [None]:
countries.transpose()

In [None]:
countries['fertility'].hist()

In [None]:
countries.plot('population', 'fertility', style='ro')

#### Draw a scatterplot

#### Move columns to a new index level

#### Create a new column using a function

#### Draw a histogram

#### Cumulatively apply a sum over a column

#### Swap rows and columns

#### Calculate sum of one column grouped by a second one

#### Sort values

## License
(c) 2017 Kristian Rother
Distributed under the conditions of the MIT License.