In [None]:
import numpy as np
import pandas as pd

In [None]:
# Build the example frame row by row: each tuple holds one record, with
# values in the same order as the `columns` list below. Rows are listed in
# the order in which the country labels are assigned to the index later on.
df = pd.DataFrame(
    [
        (35.467, 1785387, 9984670, 0.913, 'America'),    # Canada
        (63.951, 2833687, 640679, 0.888, 'Europe'),      # France
        (80.94, 3874437, 357114, 0.916, 'Europe'),       # Germany
        (60.665, 2167744, 301336, 0.873, 'Europe'),      # Italy
        (127.061, 4602367, 377930, 0.891, 'Asia'),       # Japan
        (64.511, 2950039, 242495, 0.907, 'Europe'),      # United Kingdom
        (318.523, 17348075, 9525067, 0.915, 'America'),  # United States
    ],
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

In [None]:
df

In [None]:
# Swap the default positional index for country labels; the order must match
# the row order used when `df` was built.
df.index = pd.Index([
    'Canada', 'France', 'Germany', 'Italy',
    'Japan', 'United Kingdom', 'United States',
])

In [None]:
df

In [None]:
df.columns

In [None]:
df.index

In [None]:
df.info()

In [None]:
df.size

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
df.dtypes.value_counts()

## Indexing, Selection and Slicing

In [None]:
df

In [None]:
df.loc["France"]

In [None]:
# Positional counterpart of df.loc["France"]: France is the second row, so it
# sits at integer position 1. (An empty subscript, `df.iloc[]`, is a SyntaxError.)
df.iloc[1]

In [None]:
df['Population']

In [None]:
df['Population'].to_frame()

Multiple columns can also be selected similarly to `numpy` and `Series`:

In [None]:
df[['Population', 'GDP']]

In this case, the result is another `DataFrame`. Slicing works differently: it acts at the "row level", which can be counterintuitive:

In [None]:
df[1:3]

Row level selection works better with `loc` and `iloc` **which are recommended** over regular "direct slicing" (`df[:]`).

`loc` selects rows matching the given index:

In [None]:
df.loc['Italy']

In [None]:
df.loc['France': 'Italy']

As a second "argument", you can pass the column(s) you'd like to select:

In [None]:
df.loc['France': 'Italy', 'Population']

In [None]:
df.loc['France': 'Italy', ['Population', 'GDP']]

`iloc` works with the (numeric) "position" of the index:

In [None]:
df

In [None]:
df.iloc[0]

In [None]:
df.iloc[-1]

In [None]:
df.iloc[[0, 1, -1]]

In [None]:
df.iloc[1:3]

In [None]:
df.iloc[1:3, 3]

In [None]:
df.iloc[1:3, [0, 3]]

In [None]:
df.iloc[1:3, 1:3]

RECOMMENDED: Always use `loc` and `iloc` to reduce ambiguity, especially with `DataFrame`s that have numeric indexes.

## Conditional selection (boolean arrays)

In [None]:
df

In [None]:
(df['Population'] > 70 ).value_counts()

In [None]:
df.loc[df['Population'] > 70]

In [None]:
df.loc[df['Population'] > 70, 'Population']

In [None]:
df.loc[df['Population'] > 70, ['Population', 'GDP']]

## Dropping

In [None]:
df.drop('Canada')

In [None]:
df.drop(['Canada', 'Japan'])

In [None]:
df.drop(columns=['Population', 'HDI'])

In [None]:
df.drop(['Italy', 'Canada'], axis=0)

In [None]:
df.drop(['Population', 'HDI'], axis=1)

In [None]:
df.drop(['Population', 'HDI'], axis=1)

In [None]:
df.drop(['Population', 'HDI'], axis='columns')

In [None]:
df.drop(['Canada', 'Germany'], axis='rows')

## Operations

In [None]:
df[['Population', 'GDP']]

In [None]:
df[['Population', 'GDP']] / 100

**Operations with Series** work at a column level: the `Series` index is matched against the column names, and each value is then broadcast down the rows of its column (which can be counterintuitive).

In [None]:
# Per-column adjustment: the keys become the Series index and are meant to
# line up with the 'GDP' and 'HDI' column names of the frame.
crisis = pd.Series({'GDP': -1_000_000, 'HDI': -0.3})
crisis

In [None]:
df[['GDP', 'HDI']]

In [None]:
df[['GDP', 'HDI']] + crisis

## Modifying DataFrames

### Adding a new column

In [None]:
# Languages for three of the countries only, keyed by country label. The
# Series name matches the column it is later assigned to.
langs = pd.Series(
    {'France': 'French', 'Germany': 'German', 'Italy': 'Italian'},
    name='Language',
)

In [None]:
langs

In [None]:
df['Language'] = langs

In [None]:
df

### Replacing values per column

In [None]:
df['Language'] = 'English'

In [None]:
df

### Renaming Columns


In [None]:
# Rename selected column and row labels. The result is only displayed: it is
# not assigned back, so `df` itself keeps its original labels.
df.rename(
    columns={
        'HDI': 'Human Development Index',
        # `df` has no column with this label; presumably included to show that
        # mapping keys without a match are skipped (rename defaults to
        # errors='ignore') rather than raising.
        'Anual Popcorn Consumption': 'APC'
    }, index={
        'United States': 'USA',
        'United Kingdom': 'UK',
        # Likewise, `df` has no row labelled 'Argentina'.
        'Argentina': 'AR'
    })

In [None]:
df.rename(index=str.upper)

### Dropping columns

In [None]:
# Reassign instead of mutating with inplace=True, and tolerate the column
# already being gone so that re-running this cell does not raise KeyError.
df = df.drop(columns='Language', errors='ignore')

### Adding values

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# The new row is built as a one-row frame labelled 'China'; columns it does
# not provide are filled with NaN. The result is a new frame that is only
# displayed here, so `df` is left unchanged.
pd.concat([
    df,
    pd.DataFrame([{'Population': 3, 'GDP': 5}], index=['China']),
])

In [None]:
df

You can also add a row directly by assigning values to a new index label with `loc`:

In [None]:
# Setting a label that does not exist yet appends a new row; columns missing
# from the Series are left as NaN. Population is expressed on the same scale
# as the other rows (e.g. 318.523 for the United States), hence 1_400.0
# rather than 1_400_000_000.
df.loc['China'] = pd.Series({'Population': 1_400.0, 'Continent': 'Asia'})

In [None]:
df

We can use `drop` to just remove a row by index:

In [None]:
# Reassign instead of mutating with inplace=True, and tolerate the row
# already being gone so that re-running this cell does not raise KeyError.
df = df.drop(index='China', errors='ignore')

In [None]:
df

### More radical index changes

In [None]:
df.reset_index()

In [None]:
df.set_index('Population')

In [None]:
df

## Creating columns from other columns

In [None]:
df[['Population', 'GDP']]

In [None]:
df['GDP'] / df['Population']

In [None]:
df['GDP Per Capita'] = df['GDP'] / df['Population']

In [None]:
df

## Statistical info

In [None]:
# A cell renders only its last expression, so a bare `df.head()` followed by
# `df.tail()` would silently discard the head. Display both explicitly.
display(df.head(), df.tail())

In [None]:
df.describe()

In [None]:
population = df['Population']

In [None]:
population.min(), population.max()

In [None]:
population.sum()

In [None]:
population.sum() / len(population)

In [None]:
population.mean()

In [None]:
population.std()

In [None]:
population.median()

In [None]:
population.describe()

In [None]:
population.quantile(.25)

In [None]:
population.quantile([.2, .4, .6, .8, 1])