# Pandas User Guide

Study notes on pandas and its API.

## 01. Intro to Basic Structures

In [1]:
from collections import namedtuple
from dataclasses import make_dataclass

import numpy as np
import pandas as pd

### Series

In [2]:
# Create a pandas Series from a NumPy array.
# A seeded generator is used so the values are identical on every
# Restart & Run All (the unseeded global RNG gave different numbers each run).

rng = np.random.default_rng(42)
array = rng.standard_normal(5)

index = ['a', 'b', 'c', 'd', 'e']

# Pass the labels by keyword so they cannot be confused with another positional argument
s = pd.Series(array, index=index)

s

a   -0.282120
b   -2.200607
c   -0.055148
d   -0.353520
e   -1.762315
dtype: float64

In [3]:
# Inspect the index (the labels attached to each value)
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [4]:
# A Series can also be built from a dict:
# the keys become the index labels and the values become the data.

d = dict(a=2010, b=2012, c=2013, d=2015)

s = pd.Series(data=d)

s

a    2010
b    2012
c    2013
d    2015
dtype: int64

In [5]:
# A Series behaves like a dictionary: look a value up by its label

s['a']

2010

In [6]:
# Arithmetic is vectorized: the operation is applied to every element at once
s * 2 

a    4020
b    4024
c    4026
d    4030
dtype: int64

In [7]:
# Whole-Series expressions work too; the result is computed element by element
s + 2 * s 

a    6030
b    6036
c    6039
d    6045
dtype: int64

In [8]:
# Second Series, labelled 'a' through 'd', built from (label, value) pairs
d2 = dict(zip('abcd', (2025, 2024, 2023, 2022)))

z = pd.Series(data=d2)

In [9]:
# Subtract one Series from another; values are matched up by their labels
s - z

a   -15
b   -12
c   -10
d    -7
dtype: int64

In [10]:
# NumPy functions can be applied directly to a Series; the labels are kept in the result
np.sqrt(z)

a    45.000000
b    44.988888
c    44.977772
d    44.966654
dtype: float64

In [11]:
# A Series also exposes its values as an array-like object, without the index.
# Useful when an operation should ignore the labels.

s.array

<NumpyExtensionArray>
[2010, 2012, 2013, 2015]
Length: 4, dtype: int64

In [12]:
# Convert the Series values to a plain NumPy array
arr = z.to_numpy()
arr

array([2025, 2024, 2023, 2022])

In [13]:
# Membership tests check the index labels
"a" in z, "k" in z

(True, False)

In [14]:
# Unlike ndarrays, operations between Series automatically align the data on labels,
# so computations can be written without worrying about whether both Series
# carry the same labels.

s.iloc[:1] + s.iloc[:-1]

# Labels present in only one operand come out as NaN, which makes this handy
# for spotting missing data when comparing two objects.

a    4020.0
b       NaN
c       NaN
dtype: float64

### Dataframe

In [15]:
# Build a DataFrame from a dict of equal-length lists (one list per column)

r = dict(
    nom_reservatorio=[
        "14 DE JULHO",
        "ÁGUA VERMELHA",
        "AIMORÉS",
        "ANTA",
        "APOLÔNIO SALES",
    ],
    tip_reservatorio=[
        "FIO DAGUA",
        "RESERVATÓRIO COM USINA",
        "FIO DAGUA",
        "RESERVATÓRIO SEM USINA",
        "FIO DAGUA",
    ],
    cod_resplanejamento=[284, 18, 143, 128, 173],
    cod_posto=[99, 18, 148, 129, 173],
    nom_usina=[
        "14 DE JULHO",
        "ÁGUA VERMELHA",
        "AIMORÉS",
        "ANTA",
        "APOLÔNIO SALES",
    ],
)

df = pd.DataFrame(data=r)

In [16]:
# Display the frame that was just built
df

Unnamed: 0,nom_reservatorio,tip_reservatorio,cod_resplanejamento,cod_posto,nom_usina
0,14 DE JULHO,FIO DAGUA,284,99,14 DE JULHO
1,ÁGUA VERMELHA,RESERVATÓRIO COM USINA,18,18,ÁGUA VERMELHA
2,AIMORÉS,FIO DAGUA,143,148,AIMORÉS
3,ANTA,RESERVATÓRIO SEM USINA,128,129,ANTA
4,APOLÔNIO SALES,FIO DAGUA,173,173,APOLÔNIO SALES


In [17]:
# Show the row index first, then the column labels, with a single display call
display(df.index, df.columns)

RangeIndex(start=0, stop=5, step=1)

Index(['nom_reservatorio', 'tip_reservatorio', 'cod_resplanejamento',
       'cod_posto', 'nom_usina'],
      dtype='object')

In [18]:
# This list is not exhaustive, but DataFrames can also be built from:

# - a list of dicts
# - a structured or record array
# - a dict of tuples
# - a Series
# - a list of named tuples (see below)
# - a list of dataclasses (see below)
# - alternate constructors

In [19]:
# List of named tuples: each tuple is a row and the field names become the columns

Point = namedtuple('Point', 'x y')

pd.DataFrame(
    [
        Point(0, 0),
        Point(1, 0),
        Point(2, 2),
    ]
)

Unnamed: 0,x,y
0,0,0
1,1,0
2,2,2


In [20]:
# Same idea with a three-field named tuple: one column per field
Point3D = namedtuple('Point3D', ['x', 'y', 'z'])

points_3d = [
    Point3D(0, 1, 2),
    Point3D(-1, 2, 0),
    Point3D(0, 0, -2),
]

pd.DataFrame(points_3d)

Unnamed: 0,x,y,z
0,0,1,2
1,-1,2,0
2,0,0,-2


In [21]:
# List of dataclass instances: each instance is a row and the fields become the columns
# NOTE: this rebinds `Point`, replacing the named tuple defined earlier

Point = make_dataclass('Point', [('x', int), ('y', int)])

pd.DataFrame([
    Point(0, 0),
    Point(2, 0),
    Point(1, 2),
])

Unnamed: 0,x,y
0,0,0
1,2,0
2,1,2


#### Dataframe Operations

In [22]:
# Starting point for the operations below
df

Unnamed: 0,nom_reservatorio,tip_reservatorio,cod_resplanejamento,cod_posto,nom_usina
0,14 DE JULHO,FIO DAGUA,284,99,14 DE JULHO
1,ÁGUA VERMELHA,RESERVATÓRIO COM USINA,18,18,ÁGUA VERMELHA
2,AIMORÉS,FIO DAGUA,143,148,AIMORÉS
3,ANTA,RESERVATÓRIO SEM USINA,128,129,ANTA
4,APOLÔNIO SALES,FIO DAGUA,173,173,APOLÔNIO SALES


In [23]:
# Select a single column by name; the result is a Series
df['nom_usina']

0       14 DE JULHO
1     ÁGUA VERMELHA
2           AIMORÉS
3              ANTA
4    APOLÔNIO SALES
Name: nom_usina, dtype: object

In [24]:
# Create a new column as the element-wise sum of two numeric columns
df['calculus'] = df['cod_resplanejamento'] + df['cod_posto']
df

Unnamed: 0,nom_reservatorio,tip_reservatorio,cod_resplanejamento,cod_posto,nom_usina,calculus
0,14 DE JULHO,FIO DAGUA,284,99,14 DE JULHO,383
1,ÁGUA VERMELHA,RESERVATÓRIO COM USINA,18,18,ÁGUA VERMELHA,36
2,AIMORÉS,FIO DAGUA,143,148,AIMORÉS,291
3,ANTA,RESERVATÓRIO SEM USINA,128,129,ANTA,257
4,APOLÔNIO SALES,FIO DAGUA,173,173,APOLÔNIO SALES,346


In [25]:
# Add a boolean column from an element-wise comparison
# (this flags rows rather than filtering them; every row is kept)

df['region'] = df['cod_posto'] > 100
df

Unnamed: 0,nom_reservatorio,tip_reservatorio,cod_resplanejamento,cod_posto,nom_usina,calculus,region
0,14 DE JULHO,FIO DAGUA,284,99,14 DE JULHO,383,False
1,ÁGUA VERMELHA,RESERVATÓRIO COM USINA,18,18,ÁGUA VERMELHA,36,False
2,AIMORÉS,FIO DAGUA,143,148,AIMORÉS,291,True
3,ANTA,RESERVATÓRIO SEM USINA,128,129,ANTA,257,True
4,APOLÔNIO SALES,FIO DAGUA,173,173,APOLÔNIO SALES,346,True


In [26]:
# Delete an unwanted column in place
# NOTE: not idempotent — running this cell again raises KeyError,
# because 'region' no longer exists

del df['region']

In [27]:
# Confirm that 'region' is gone
df

Unnamed: 0,nom_reservatorio,tip_reservatorio,cod_resplanejamento,cod_posto,nom_usina,calculus
0,14 DE JULHO,FIO DAGUA,284,99,14 DE JULHO,383
1,ÁGUA VERMELHA,RESERVATÓRIO COM USINA,18,18,ÁGUA VERMELHA,36
2,AIMORÉS,FIO DAGUA,143,148,AIMORÉS,291
3,ANTA,RESERVATÓRIO SEM USINA,128,129,ANTA,257
4,APOLÔNIO SALES,FIO DAGUA,173,173,APOLÔNIO SALES,346
