In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Notebook-wide display settings: show at most 8 rows of any frame,
# and draw figures at 9x6 inches by default.
pd.options.display.max_rows = 8
plt.rc('figure', figsize=(9, 6))

![pandas](http://pandas.pydata.org/_static/pandas_logo.png "Pandas Logo")


- Started by Wes McKinney with a first release in 2011.
- Based on NumPy, it is the most used library for all things data.
- Motivated by the toolbox in R for manipulating data easily.
- A lot of names in Pandas come from the R world.
- It is open source (BSD license).

https://pandas.pydata.org/

# Pandas 
```sh
conda install pandas
pip3 install pandas
```

```python
import pandas as pd
```

"*Pandas provides high-performance, easy-to-use data structures 
and data analysis tools in Python*"

- Self-describing data structures
- Data loaders to/from common file formats
- Plotting functions
- Basic statistical tools.


# Creation of [DataFrame](https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe) 

In [None]:
# Fix the seed so the random table is identical on every Restart & Run All.
np.random.seed(42)
# Row labels: 8 consecutive days starting on 2013-01-01.
dates = pd.date_range('20130101', periods=8)
# 8x4 table of standard-normal draws, indexed by date, with columns A..D.
random_frame = pd.DataFrame(np.random.randn(8,4), index=dates, columns=list('ABCD'))
random_frame

In [None]:
# Every dict entry becomes one column. Scalar entries ('A', 'B', 'F') are
# repeated to match the 4-element entries.
pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20180620'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.arange(4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
    'G': list(range(1, 13, 3)),   # 1, 4, 7, 10
})


# Load Data from CSV File

In [None]:
# Remote semicolon-separated file, Latin-1 encoded; its first column is used
# as the row index.
url = "https://www.fun-mooc.fr/c4x/agrocampusouest/40001S03/asset/AnaDo_JeuDonnees_TemperatFrance.csv"
french_cities = pd.read_csv(
    url,
    sep=";",
    encoding="latin1",
    index_col=0,
)
french_cities

# Viewing Data

In [None]:
# Preview the first rows of the table.
french_cities.head()

In [None]:
# Preview the last rows of the table.
french_cities.tail()

# Index

In [None]:
# The row labels of the frame (taken from the CSV's first column via index_col=0).
french_cities.index

We can rename an index by setting its name.

In [None]:
# Give the index itself a name, then preview the result.
french_cities.index.name = "City"
french_cities.head()

# Exercise 
## Translate DataFrame names in English

In [None]:
# Translate both French labels in a single rename call. Reassigning the result
# (instead of inplace=True) keeps the call chainable; labels that are already
# translated are simply ignored, so the cell can be re-run safely.
french_cities = french_cities.rename(columns={'Moye': 'Mean', 'Région': 'Region'})

In [None]:
import calendar
import locale

# calendar.month_abbr follows the LC_TIME locale. The "C" locale exists on every
# platform and yields the English abbreviations, whereas 'en_US' is not installed
# everywhere (locale.Error) and LC_ALL would also change unrelated categories
# such as number formatting.
locale.setlocale(locale.LC_TIME, 'C')
# Index 0 of month_abbr is an empty string, so skip it.
months = list(calendar.month_abbr[1:])
months

In [None]:
# Pair the first 12 column labels with the English month abbreviations and
# rename them in one pass. Renaming a label to itself is a no-op, so no
# filtering is needed, and reassigning avoids inplace=True.
french_cities = french_cities.rename(
    columns=dict(zip(french_cities.columns[:12], months)))
french_cities.columns

In [None]:
# Display the frame to check the translated column names.
french_cities

# Indexing on DataFrames

In [None]:
# Indexing a DataFrame with [] selects a column and returns it as a Series.
french_cities['Lati']

In [None]:
# Attribute access is a shorthand for french_cities['Lati'].
french_cities.Lati

In [None]:
# .values exposes the data as a NumPy array; take every row of the column at
# position 12 (presumably 'Lati', right after the 12 month columns — confirm
# against the CSV layout).
french_cities.values[:,12]

`.loc` and `.iloc` give access to individual values, slices or masked selections:

In [None]:
# Label-based lookup: row 'Rennes', column 'Jun' -> a single value.
french_cities.loc['Rennes', "Jun"]

In [None]:
# Position-based lookup: 4th row from the end, column at position 5.
# NOTE(review): meant to mirror the .loc cell above; this relies on 'Rennes'
# sitting at row -4 in the current row order — confirm.
french_cities.iloc[-4, 5]

In [None]:
# A list of column labels selects several values from the 'Rennes' row.
french_cities.loc['Rennes', ["Jul", "Aug"]]

In [None]:
# Positional counterpart: columns at positions 6 and 7 of the 4th row from the end.
french_cities.iloc[-4, [6,7]]

In [None]:
# .loc slices are label-based and include both endpoints: 'Sep' through 'Dec'.
french_cities.loc['Rennes', "Sep":"Dec"]

In [None]:
# .iloc slices are positional and exclude the stop: positions 8, 9, 10 and 11.
french_cities.iloc[-4, 8:12]

# Masking

In [None]:
# Boolean Series, one entry per row: True where the 'Mean' column exceeds 12.
mask = french_cities["Mean"].gt(12)
mask

In [None]:
# A boolean Series inside [] keeps only the rows where the mask is True.
french_cities[mask]

In [None]:
# .loc accepts the same boolean Series.
french_cities.loc[mask]

In [None]:
# .iloc is purely positional and rejects a boolean Series, so the mask is
# passed as a plain NumPy array.
french_cities.iloc[mask.values]

In [None]:
# The mask can also be written inline: rows whose Region equals 'NO'.
french_cities[french_cities.Region == 'NO']

In [None]:
# Rows whose Region is either 'NO' or 'SO'.
french_cities.loc[french_cities.Region.isin(['NO', 'SO'])]

# New column


In [None]:
# New column: standard deviation across the first 12 columns (the months),
# computed row by row.
french_cities["std"] = french_cities.iloc[:, :12].std(axis="columns")
french_cities

In [None]:
# Remove the column that was just added; drop() returns a new frame.
french_cities = french_cities.drop(columns="std")

In [None]:
# Display the frame to confirm the "std" column is gone.
french_cities

# Modifying a dataframe with multiple indexing

In [None]:
# Chained indexing performs two separate lookups, so an assignment made through
# it may land on a temporary copy instead of the frame:
#   french_cities.loc['Rennes']['Jun'] = 25   # unreliable (SettingWithCopyWarning)
#   french_cities['Rennes']['Jun'] = 25       # worse: 'Rennes' is a row label, not a column
# A single .loc call with both labels is the reliable form; append `= 25` to
# actually modify the value.
french_cities.loc['Rennes', 'Jun']

In [None]:
# Display the frame; the previous cell only read a value, so it is unchanged.
french_cities

# Transforming datasets

In [None]:
# Lowest value of the 'Mean' column and highest value of the 'Ampl' column,
# shown together as a tuple.
french_cities['Mean'].min(), french_cities['Ampl'].max()

## Apply

Let's convert the mean temperature from degrees Celsius to degrees Fahrenheit.

In [None]:
def fahrenheit(T):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit.

    Uses plain arithmetic only, so it accepts a scalar as well as a NumPy
    array or a pandas Series (converted element-wise).
    """
    return T*9/5+32

# apply() calls the function on every element of the column. Because the
# function is pure arithmetic, fahrenheit(french_cities['Mean']) would return
# the same values in one vectorized step.
french_cities['Mean'].apply(fahrenheit)

## Sort

In [None]:
# sort_values returns a sorted copy; french_cities itself is not modified here.
french_cities.sort_values(by='Lati')

In [None]:
# Rebind the name to keep the new ordering: rows sorted by 'Lati', largest first.
french_cities = french_cities.sort_values(by='Lati',ascending=False)
french_cities

## Stack and unstack

Instead of having the months along axis 1 and the cities along axis 0, let's convert them into an outer and an inner index level along a single dimension.

In [None]:
# Allow up to 20 displayed rows, since the result below is long.
pd.options.display.max_rows = 20
# The frame has a single-level index, so unstack() pivots the 12 monthly
# columns into one Series indexed by (column label, row label).
unstacked = french_cities.iloc[:, :12].unstack()
unstacked

In [None]:
# Check what kind of object unstack() produced.
type(unstacked)

## Transpose

The result is grouped in the wrong order, since the axis that was unstacked comes first. We need to transpose the DataFrame.

In [None]:
# Swap rows and columns of the 12 monthly columns: months become the rows,
# cities become the columns.
city_temp = french_cities.iloc[:, :12].T
city_temp

In [None]:
# One box per city, summarising its 12 monthly values. DataFrame.boxplot
# returns the Axes, which lets us label the figure so it stands on its own.
ax = city_temp[['Nantes','Rennes']].boxplot(rot=90)
ax.set(title="Monthly temperatures", ylabel="Temperature (°C)");

# Describing

In [None]:
# Summary statistics for the 'Region' column.
french_cities['Region'].describe()

In [None]:
# Distinct values present in the column.
french_cities['Region'].unique()

In [None]:
# Number of rows for each distinct value.
french_cities['Region'].value_counts()

In [None]:
# Data type of the column before any conversion.
french_cities["Region"].dtype

In [None]:
french_cities["Region"].memory_usage()

In [None]:
# To save memory, we can convert it to a categorical column:
french_cities["Region"] = french_cities["Region"].astype("category")
french_cities.Region.dtype

In [None]:
french_cities["Region"].memory_usage()

# Data Aggregation/summarization

## groupby

In [None]:
# Group the rows by their 'Region' value and inspect the type of the object
# that groupby returns.
fc_grouped_region = french_cities.groupby("Region")
type(fc_grouped_region)

In [None]:
# Iterating over the grouping yields (group key, sub-frame) pairs; print each
# pair followed by an empty line.
for region, cities in fc_grouped_region:
    print(region, cities, "", sep="\n")

## Transferring R data sets into Python

In [None]:
# Load the rpy2 IPython extension (the %%R cell further down relies on it).
%load_ext rpy2.ipython

- conversions of R to pandas objects will be done automatically

In [None]:
from rpy2.robjects import r
# r(...) evaluates a string of R code and hands the result back to Python;
# check which Python type the R vector comes back as.
x = r('c(1,2,3,4)')
type(x)

In [None]:
# `1:10` is R's range operator and already builds the integers 1..10. Wrapping
# it in seq(), as in seq(1:10), only returned the indices of that vector, which
# gave the same integers by coincidence.
v = r('1:10')
v

In [55]:
from rpy2.robjects import pandas2ri

# Turn on rpy2's pandas conversion layer.
pandas2ri.activate()
# Attach the R package missMDA and load its `orange` dataset into the R session.
# The package must be installed on the R side; otherwise library() raises
# RRuntimeError ("there is no package called 'missMDA'").
r.library('missMDA')
r.data('orange')
# Fetch the R object named `orange` into Python.
orange = r('orange')

  there is no package called ‘missMDA’



RRuntimeError: Error in (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  : 
  there is no package called ‘missMDA’


In [None]:
# Display the object fetched from R.
orange

In [None]:
%%R
# This cell runs in the R session: attach missMDA, load its `orange` dataset
# and call estim_ncpPCA on it.
library('missMDA')
data(orange)
estim_ncpPCA(orange)

In [None]:
from rpy2.robjects.packages import importr

# importr exposes the R package as a Python object whose attributes are its functions.
miss_mda = importr('missMDA')
# Call imputePCA with ncp=2 and keep the first element of the result
# (presumably the completed data set, `completeObs` — confirm against the missMDA docs).
res = miss_mda.imputePCA(orange,ncp=2)
orange_r = res[0]
orange_r.colnames

In [None]:
# Rebuild a pandas DataFrame from the R result, reusing its column and row names.
# NOTE(review): `pandas2ri.ri2py` appears to exist only in older rpy2 releases
# (renamed `rpy2py` in rpy2 3.x) — confirm against the installed version.
orange = pd.DataFrame(pandas2ri.ri2py(orange_r), 
                      columns=orange_r.colnames, 
                      index=orange_r.rownames)

orange

In [None]:
from rpy2.robjects import r
# Same imputation done entirely on the R side: each r(...) call evaluates one
# R statement, and the variables `df` and `res` live in the R session.
r('library(missMDA)')
r('df <- imputePCA(orange,ncp=2) ')
r('res <- as.data.frame(df$completeObs)')
# Pull the R variable `res` back into Python, rebinding the Python name `orange`.
orange = r('res')
orange

# Load data from a local or remote HTML file
We can download and extract data about mean sea level stations around the world from the [PSMSL website](http://www.psmsl.org/).

In [None]:
# read_html needs the `lxml`, `beautifulsoup4` and `html5lib` Python packages.
# It returns a list of tables parsed from the page.
table_list = pd.read_html("http://www.psmsl.org/data/obtaining/")

In [None]:
# There is a single table on that page; it contains metadata about the
# stations where sea levels are recorded. Take it out of the list.
local_sea_level_stations = table_list[0]
local_sea_level_stations

# Saving Work

[HDF5](https://support.hdfgroup.org/HDF5/) is widely used and is one of the most powerful file formats for storing binary data. It can store both Series and DataFrames.

In [None]:
# HDFStore does not create missing parent folders, so make sure data/ exists first.
Path("data").mkdir(parents=True, exist_ok=True)
with pd.HDFStore("data/pandas_nb.h5") as writer:
    # The key is the path of the table inside the HDF5 file; it is passed by
    # keyword because recent pandas versions deprecate passing it positionally.
    local_sea_level_stations.to_hdf(writer, key="/sea_level/stations")

In [None]:
# List the HDF5 files present in data/.
%ls data/*.h5

# Reloading data

In [None]:
# Reopen the HDF5 file and read the table back by its key; the context manager
# closes the file afterwards.
with pd.HDFStore("data/pandas_nb.h5") as reader:
    local_sea_level_stations = reader.get("/sea_level/stations")

In [None]:
# Display the frame that was read back from the HDF5 file.
local_sea_level_stations

# References

- [Pandas website](http://pandas.pydata.org).
- *Python for Data Analysis* by Wes McKinney ([O'Reilly Media](http://shop.oreilly.com/product/0636920023784.do)).
- [Analyzing and Manipulating Data with Pandas Beginner](https://youtu.be/6ohWS7J1hVA) | SciPy 2016 Tutorial | Jonathan Rocher.
- https://github.com/groverpr/learn_python_libraries
- [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/)
