# Exercises on pandas Basics

## 1. Getting Started
We first import `pandas` and load a table into a DataFrame.

In [1]:
import pandas as pd

countries = pd.read_csv('large_countries_2015.csv', index_col=0)

In [2]:
%matplotlib inline

## 2. Working with DataFrames
To view the contents of a data frame, type its name:

In [3]:
countries

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
Japan,126573500.0,1.45,Asia
Mexico,127017200.0,2.13,North America
Nigeria,182202000.0,5.89,Africa
Pakistan,188924900.0,3.04,Asia
Philippines,100699400.0,2.98,Asia


## 3. Examining DataFrames
Match the Python commands with the descriptions below. 

*In Jupyter, you can move the descriptions up/down with the arrow buttons.*

In [4]:
countries.head(3)

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia


In [5]:
countries.tail(3)

Unnamed: 0,population,fertility,continent
Philippines,100699395.0,2.98,Asia
Russia,143456918.0,1.61,Europe
United States,321773631.0,1.97,North America


In [6]:
countries.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12 entries, Bangladesh to United States
Data columns (total 3 columns):
population    12 non-null float64
fertility     12 non-null float64
continent     12 non-null object
dtypes: float64(2), object(1)
memory usage: 384.0+ bytes


In [7]:
countries.describe()

Unnamed: 0,population,fertility
count,12.0,12.0
mean,375346200.0,2.4375
std,456519400.0,1.200781
min,100699400.0,1.45
25%,139347000.0,1.7375
50%,185563400.0,2.125
75%,273616300.0,2.5675
max,1376049000.0,5.89


In [8]:
countries.index

Index(['Bangladesh', 'Brazil', 'China', 'India', 'Indonesia', 'Japan',
       'Mexico', 'Nigeria', 'Pakistan', 'Philippines', 'Russia',
       'United States'],
      dtype='object')

In [9]:
countries.columns

Index(['population', 'fertility', 'continent'], dtype='object')

In [10]:
countries=countries.reset_index()
countries["country"]=countries["index"]
countries

Unnamed: 0,index,population,fertility,continent,country
0,Bangladesh,160995600.0,2.12,Asia,Bangladesh
1,Brazil,207847500.0,1.78,South America,Brazil
2,China,1376049000.0,1.57,Asia,China
3,India,1311051000.0,2.43,Asia,India
4,Indonesia,257563800.0,2.28,Asia,Indonesia
5,Japan,126573500.0,1.45,Asia,Japan
6,Mexico,127017200.0,2.13,North America,Mexico
7,Nigeria,182202000.0,5.89,Africa,Nigeria
8,Pakistan,188924900.0,3.04,Asia,Pakistan
9,Philippines,100699400.0,2.98,Asia,Philippines


In [25]:
#del countries['index']
countries = countries.drop('index', axis=1)

In [26]:
countries

Unnamed: 0,population,fertility,continent,country
0,160995600.0,2.12,Asia,Bangladesh
1,207847500.0,1.78,South America,Brazil
2,1376049000.0,1.57,Asia,China
3,1311051000.0,2.43,Asia,India
4,257563800.0,2.28,Asia,Indonesia
5,126573500.0,1.45,Asia,Japan
6,127017200.0,2.13,North America,Mexico
7,182202000.0,5.89,Africa,Nigeria
8,188924900.0,3.04,Asia,Pakistan
9,100699400.0,2.98,Asia,Philippines


In [11]:
countries['population'].mean()

375346161.6666667

In [12]:
countries['continent'].value_counts()

Asia             7
North America    2
Africa           1
South America    1
Europe           1
Name: continent, dtype: int64

In [13]:
countries.shape
#rows first=0
#then columns=1

(12, 5)

In [14]:
countries['continent'].unique()

array(['Asia', 'South America', 'North America', 'Africa', 'Europe'],
      dtype=object)

In [15]:
countries['population'] // 1000000

0      160.0
1      207.0
2     1376.0
3     1311.0
4      257.0
5      126.0
6      127.0
7      182.0
8      188.0
9      100.0
10     143.0
11     321.0
Name: population, dtype: float64

#### Number of rows and columns

#### Show the last 3 lines

In [27]:
countries

Unnamed: 0,population,fertility,continent,country
0,160995600.0,2.12,Asia,Bangladesh
1,207847500.0,1.78,South America,Brazil
2,1376049000.0,1.57,Asia,China
3,1311051000.0,2.43,Asia,India
4,257563800.0,2.28,Asia,Indonesia
5,126573500.0,1.45,Asia,Japan
6,127017200.0,2.13,North America,Mexico
7,182202000.0,5.89,Africa,Nigeria
8,188924900.0,3.04,Asia,Pakistan
9,100699400.0,2.98,Asia,Philippines


In [17]:
#selecting columns
countries["population"] #series(basicly is a list)
countries[["population","fertility"]] #dataframe

Unnamed: 0,population,fertility
0,160995600.0,2.12
1,207847500.0,1.78
2,1376049000.0,1.57
3,1311051000.0,2.43
4,257563800.0,2.28
5,126573500.0,1.45
6,127017200.0,2.13
7,182202000.0,5.89
8,188924900.0,3.04
9,100699400.0,2.98


In [18]:
#selecting rows by name
countries.loc[0]
#selecting rows by which element they are in the list
countries.iloc[:3]

Unnamed: 0,index,population,fertility,continent,country
0,Bangladesh,160995600.0,2.12,Asia,Bangladesh
1,Brazil,207847500.0,1.78,South America,Brazil
2,China,1376049000.0,1.57,Asia,China


In [28]:
#ask a qustion about the dataframe and only get true values back
#does the country column contain more than 5 characters for a given row?
countries[countries["country"].str.len()>5]

Unnamed: 0,population,fertility,continent,country
0,160995642.0,2.12,Asia,Bangladesh
1,207847528.0,1.78,South America,Brazil
4,257563815.0,2.28,Asia,Indonesia
6,127017224.0,2.13,North America,Mexico
7,182201962.0,5.89,Africa,Nigeria
8,188924874.0,3.04,Asia,Pakistan
9,100699395.0,2.98,Asia,Philippines
10,143456918.0,1.61,Europe,Russia
11,321773631.0,1.97,North America,United States


#### Summarize categorical data

#### Mean of a column

#### Summarize all numerical columns

#### Show the first 3 lines

#### Apply a calculation to each value in a column

#### Extract distinct values

## 4. Selecting rows and columns
Match the Python commands with the descriptions below. 

In [20]:
countries.columns

Index(['index', 'population', 'fertility', 'continent', 'country'], dtype='object')

In [21]:
countries.index

RangeIndex(start=0, stop=12, step=1)

In [22]:
countries['continent']

0              Asia
1     South America
2              Asia
3              Asia
4              Asia
5              Asia
6     North America
7            Africa
8              Asia
9              Asia
10           Europe
11    North America
Name: continent, dtype: object

In [23]:
countries[['population', 'continent']]

Unnamed: 0,population,continent
0,160995600.0,Asia
1,207847500.0,South America
2,1376049000.0,Asia
3,1311051000.0,Asia
4,257563800.0,Asia
5,126573500.0,Asia
6,127017200.0,North America
7,182202000.0,Africa
8,188924900.0,Asia
9,100699400.0,Asia


In [29]:
countries.loc['India']

KeyError: 'the label [India] is not in the [index]'

In [None]:
countries.iloc[3:7]

In [None]:
countries[countries['population'] > 200000000]

In [None]:
countries.values

#### Extract raw data as a NumPy array

#### Select rows by slicing the index

#### Filter rows by a condition

#### Display column labels

#### Select multiple columns

#### Display row index

#### Select row by an index value

#### Select one column

## 5. Summarizing Data
Match the Python commands with the descriptions below. 

In [None]:
countries['fertility'].cumsum()

In [None]:
countries.groupby('continent')['population'].sum()

In [None]:
countries.sort_values(by=['continent', 'fertility'])

In [None]:
def get_initial(s):
    return s[0]

countries['initial'] = countries['continent'].apply(get_initial)
countries

In [None]:
countries.stack()

In [None]:
countries.transpose()

In [None]:
countries['fertility'].hist()

In [None]:
countries.plot('population', 'fertility', style='ro')

#### Draw a scatterplot

#### Move columns to a new index level

#### Create a new column using a function

#### Draw a histogram

#### Cumulatively apply a sum over a column

#### Swap rows and columns

#### Calculate sum of one column grouped by a second one

#### Sort values

## License
(c) 2017 Kristian Rother
Distributed under the conditions of the MIT License.