# Summary of EDA

This notebook lists the pandas and plotting methods that were introduced in the exploratory data analysis (EDA) programming exercises.

### Import the libraries

In [None]:
import numpy as np
import pandas as pd

### Data frames


In [None]:
# Small example frame: one integer, one string and one boolean column.
df = pd.DataFrame(
    {
        'A': [1, 2, 3, 4],
        'B': ['one', 'two', 'three', 'four'],
        'C': [False, True, False, True],
    }
)
df

In [None]:
# Dimensions of the frame as a (number of rows, number of columns) tuple.
df.shape

In [None]:
# Row labels of the frame.
df.index

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Data type of each column, as a Series indexed by column label.
df.dtypes

In [None]:
# Print a concise summary: index range, columns, non-null counts, dtypes and memory usage.
df.info()

### `Index` and `Series` objects

In [None]:
# Parallel lists: names[i] goes with years[i].
names = [
    'Peter',
    'Anna',
    'Tom',
    'John',
    'Simone',
]
years = [1998, 2002, 1946, 1973, 1962]

In [None]:
# Build a named Series whose values are the years and whose index labels are the names.
se_years = pd.Series(
    data=years,
    index=names,
    name='Years of birth',
)
se_years

### Types

In [None]:
# The class of the whole table: pandas DataFrame.
type(df) 

In [None]:
# A single column selected with [] is a pandas Series.
type(df['C']) 

In [None]:
# Element dtype of column 'C' only, looked up in the per-column dtypes Series.
df.dtypes['C']

In [None]:
# The class of the object built with pd.Series: pandas Series.
type(se_years)

### Getting data from a `DataFrame`

In [None]:
# Select one column by label; the result is a Series.
df['B']

In [None]:
# Select the row whose index label is 2; the result is a Series indexed by column label.
df.loc[2]

In [None]:
# Single value via two lookups: first column 'B', then label 2 within that Series.
# Fine for reading; for assignment use a single df.loc[row, column] instead.
df['B'][2]

In [None]:
# Single value in one lookup: row label 2, column label 'B'.
df.loc[2, 'B']

In [None]:
# Label-based row slice; with .loc both endpoints (1 and 2) are included.
df.loc[1:2]

In [None]:
# An open slice selects all rows.
df.loc[:]

In [None]:
# The first 2 rows of the frame.
df.head(2)

In [None]:
# All rows, columns from 'A' through 'B' (label slices include both endpoints).
df.loc[:, 'A':'B']

In [None]:
# Rows labelled 1 through 2 and columns 'A' through 'B', both slices inclusive.
df.loc[1:2, 'A':'B']

In [None]:
# A list of labels selects exactly those rows (1 and 3), in the listed order.
df.loc[[1, 3]]

In [None]:
# A list of column labels inside [] selects those columns as a DataFrame.
df[['A', 'C']]

In [None]:
# Same selection as df[['A', 'C']], written with .loc: all rows, columns 'A' and 'C'.
df.loc[:, ['A', 'C']]

In [None]:
# A one-element list keeps the result a one-column DataFrame (df['B'] would give a Series).
df[['B']]

### Modifying a `DataFrame`


In [None]:
# Overwrite a single cell (row label 0, column 'B'); this modifies df itself.
df.loc[0, 'B'] = 'ACE'
df

In [None]:
# Take rows 1 through 2 as an explicit copy, so the assignments below
# change df_slice only and leave df untouched.
df_slice = df.loc[1:2].copy()
df_slice.loc[1, 'A'] = 8
df_slice.loc[1, 'B'] = "EIGHT"
df_slice

### Setting the index of a `DataFrame`


In [None]:
# New frame that uses the values of column 'B' as row labels; df itself is not changed.
df_index = df.set_index('B')

In [None]:
# Label slice on the string index: rows from 'two' through 'four' (inclusive), in stored order.
df_index.loc['two':'four']

In [None]:
# Overwrite one cell, addressing the row by its string label.
df_index.loc['three', 'C'] = True
df_index

### Reading data from a file into a `DataFrame`

In [None]:
# Load the country table from a CSV file located relative to this notebook.
#
# By default read_csv treats the literal text 'NA' (among other markers) as a
# missing value. Later cells filter on two-letter continent codes ('EU'), so a
# code 'NA' is presumably present in this file as well (TODO confirm against the
# data) and would silently become NaN and vanish from groupby results.
# Only empty cells are treated as missing here, so such codes survive as text.
country_data = pd.read_csv(
    '../datasets/country.csv',
    keep_default_na=False,
    na_values=[''],
)
country_data.head()

### Summary statistics


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
country_data.describe()

In [None]:
# Summary of every column; non-numeric columns get count, unique, top and freq instead.
country_data.describe(include='all')

In [None]:
# Column totals for the two selected columns only.
country_data[['area', 'population']].sum()

In [None]:
# Number of non-missing values in each column.
country_data.count()

In [None]:
# Column totals, restricted to numeric columns. Without numeric_only=True,
# pandas >= 2.0 also tries to "sum" the text columns, which concatenates the
# strings or raises TypeError instead of skipping them.
country_data.sum(numeric_only=True)

In [None]:
# Column means, restricted to numeric columns. pandas >= 2.0 no longer drops
# text columns silently and raises TypeError without numeric_only=True.
country_data.mean(numeric_only=True)

In [None]:
# Column standard deviations, restricted to numeric columns. pandas >= 2.0 no
# longer drops text columns silently and raises TypeError without numeric_only=True.
country_data.std(numeric_only=True)

In [None]:
# Column minima, restricted to numeric columns. A text column that contains
# missing values cannot be reduced (str vs. float comparison) and pandas >= 2.0
# raises TypeError for it instead of skipping the column.
country_data.min(numeric_only=True)

In [None]:
# Column maxima, restricted to numeric columns. A text column that contains
# missing values cannot be reduced (str vs. float comparison) and pandas >= 2.0
# raises TypeError for it instead of skipping the column.
country_data.max(numeric_only=True)

In [None]:
# Column medians, restricted to numeric columns. pandas >= 2.0 no longer drops
# text columns silently and raises TypeError without numeric_only=True.
country_data.median(numeric_only=True)

In [None]:
# First quartile (25th percentile) of each numeric column. Since pandas 2.0
# quantile() defaults to numeric_only=False and raises TypeError on text columns.
country_data.quantile(0.25, numeric_only=True)

### Sorting a `DataFrame`


In [None]:
# Rows ordered by 'area', smallest first. Returns a new frame; country_data is unchanged.
# The trailing semicolon suppresses the cell output.
country_data.sort_values(by='area');

In [None]:
# Rows ordered by 'area', largest first.
country_data.sort_values(by='area', ascending=False);

In [None]:
# Sort on several keys: first by 'continent', then by 'name' within equal continents.
country_data.sort_values(by=['continent', 'name']);

In [None]:
# Rows ordered by 'population', largest first. reset_index() renumbers the rows
# 0..n-1 in the new order and keeps the previous row labels in a column named 'index'.
country_data_sorted_by_pop = country_data.sort_values(by='population', ascending=False).reset_index()

### Computing values from a `DataFrame`

In [None]:
# Element-wise division of two columns; the resulting Series is not stored anywhere.
country_data['population'] / country_data['area'];

In [None]:
# Store the element-wise quotient as a new column 'density'; this modifies country_data.
country_data['density'] = country_data['population'] / country_data['area']

In [None]:
# Boolean Series (one entry per row): True where 'continent' equals 'EU'.
countries_in_EU = country_data['continent'] == 'EU'

### Selecting rows by boolean masks

In [None]:
# Keep only the rows for which the boolean mask is True.
country_data[countries_in_EU];

In [None]:
# The mask can be written inline: rows whose 'tld' equals '.nl'.
country_data[country_data['tld'] == '.nl'];

In [None]:
# Rows whose 'tld' value is missing.
country_data[country_data['tld'].isnull()];

In [None]:
# Combine two conditions element-wise with &. The parentheses are required
# because & binds more tightly than the comparison operators.
country_data[(country_data['population'] < 1000000) & (country_data['area'] > 100000)];

### Grouping rows in a `DataFrame`

In [None]:
# Group the rows by the value in 'continent'; nothing is computed until an aggregation is requested.
grouped = country_data.groupby('continent')

In [None]:
# Mapping from each group key to the row labels that belong to that group.
grouped.groups;

In [None]:
# Summary statistics of 'area', computed separately for each group.
grouped['area'].describe()

In [None]:
# Per-group totals of the two selected columns.
grouped[['population', 'area']].sum()

### Deleting rows or columns from a `DataFrame`

In [None]:
# Rebuild the small example frame from scratch, so the cells that follow
# start from a known state.
df = pd.DataFrame(
    {
        'A': [1, 2, 3, 4],
        'B': ['one', 'two', 'three', 'four'],
        'C': [False, True, False, True],
    }
)
df

In [None]:
# New frame without the row labelled 1; df itself is not changed.
df.drop(1)

In [None]:
# axis=1 makes drop() look for the label among the columns: new frame without column 'C'.
df.drop('C', axis=1)

In [None]:
# Drop the rows selected by a mask: take the row labels of the masked rows and pass them to drop().
country_data.drop(country_data[countries_in_EU].index);

### Persisting the modifications


In [None]:
# drop() returns a new frame; assigning it to a name is what keeps the result.
df_drop = df.drop('C', axis=1)
df_drop

In [None]:
# Overwrite one cell of df_drop (row label 1, column 'B').
df_drop.loc[1, 'B'] = 'EIGHT'
df_drop

In [None]:
# NOTE: this cell is not re-runnable as is. Once labels 1 and 3 are gone, running
# it a second time raises KeyError; re-create df first.
df.drop([1, 3], inplace=True)  # drop rows with index labels 1 and 3, modifying df in place
df

### Plotting a `DataFrame`

In [None]:
# next command ensures that plots appear inside the notebook
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()  # set Seaborn defaults
plt.rcParams['figure.figsize'] = 10, 5  # default hor./vert. size of plots, in inches
plt.rcParams['lines.markeredgewidth'] = 1  # to fix issue with seaborn box plots; needed after import seaborn

### Line plot


In [None]:
# Default plot kind is a line: values of 'population' against the row index.
country_data['population'].plot();

In [None]:
# Same line plot on the frame sorted by population and renumbered, so the curve follows the sort order.
country_data_sorted_by_pop['population'].plot();

### Area plot


In [None]:
# Area plot: like the line plot, with the region under the line filled.
country_data['population'].plot(kind='area');

In [None]:
# Area plot of 'population' on the frame sorted by population and renumbered.
country_data_sorted_by_pop['population'].plot(kind='area');

### Dot plot


In [None]:
# Dot plot: draw a circle marker per value and no connecting line (empty linestyle).
country_data['population'].plot(marker="o", linestyle='');

### Histogram


In [None]:
# Histogram of 'population' for the rows selected by the countries_in_EU mask.
# Rows and column are picked in a single .loc step; the list keeps it a one-column DataFrame.
country_data.loc[countries_in_EU, ['population']].plot(kind='hist');

In [None]:
# Histogram with 20 bins for the rows selected by the countries_in_EU mask;
# density=True is passed through to the histogram call unchanged.
country_data.loc[countries_in_EU, ['population']].plot(kind='hist', bins=20, density=True);

### Density plot


In [None]:
# Density plot of 'population' for the rows selected by the countries_in_EU mask.
country_data.loc[countries_in_EU, ['population']].plot(kind='density');

### Box plot


In [None]:
# Box plot of 'population' for the rows selected by the countries_in_EU mask.
country_data.loc[countries_in_EU, ['population']].plot(kind='box');

In [None]:
# One box of 'population' per distinct value of 'continent', side by side.
country_data.boxplot(column='population', by='continent');

### Bar plot


In [None]:
# Bar chart of the number of rows per continent. size() counts the rows of each
# group directly, instead of selecting the grouping column back out of the
# groupby and counting its non-missing values.
grouped.size().plot(kind='bar');

In [None]:
# Bar chart of the summed 'population' of each group.
grouped['population'].sum().plot(kind='bar');

### Scatter plot


In [None]:
# Scatter plot with one point per row: 'area' on the x-axis, 'population' on the y-axis; c="b" draws the points in blue.
country_data.plot(kind='scatter', x='area', y='population', c="b");