<a href="https://colab.research.google.com/github/ralsouza/python_fundamentos/blob/master/src/06_Modulos_Analise_de_Dados/09_Pandas_Dataframes_Pandas_NumPy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Remember: NumPy isn't an analysis tool on its own; it works together with pandas, Matplotlib, and other libraries.

In [0]:
# Import Pandas and NumPy
import pandas as pd
import numpy as np

In [0]:
# Sample data: one list per column, each list holding five values (one per row)
data = {
    'State': ['Santa Catarina', 'Paraná', 'Goiás', 'Bahia', 'Minas Gerais'],
    'Year': [2002, 2003, 2004, 2005, 2006],
    'Population': [1.5, 1.7, 3.6, 2.4, 2.9],
}

In [6]:
# Build a DataFrame from the dictionary above: each key becomes a column and
# each list supplies that column's values.
# The constructor is reached through the `pd` alias because no callable named
# `df` exists at this point in the notebook (further down, `df` is bound to a
# DataFrame instance, which cannot be called).
frame = pd.DataFrame(data)

# Show DataFrame
frame

Unnamed: 0,State,Year,Population
0,Santa Catarina,2002,1.5
1,Paraná,2003,1.7
2,Goiás,2004,3.6
3,Bahia,2005,2.4
4,Minas Gerais,2006,2.9


In [7]:
# Build a second DataFrame from the same dictionary, this time with:
#   - an explicit column order,
#   - a 'Debit' column that has no matching key in `data`, so it starts out as NaN,
#   - custom row labels instead of the default integer index.
# pd.DataFrame is used because no callable named `df` is defined at this point.
frame2 = pd.DataFrame(
    data,
    columns=['Year', 'State', 'Population', 'Debit'],
    index=['one', 'two', 'three', 'four', 'five'],
)

frame2

Unnamed: 0,Year,State,Population,Debit
one,2002,Santa Catarina,1.5,
two,2003,Paraná,1.7,
three,2004,Goiás,3.6,
four,2005,Bahia,2.4,
five,2006,Minas Gerais,2.9,


In [0]:
# Populate the Debit column from a NumPy range.
# The stop value (5) is exclusive, so this yields the five floats 0.0 through 4.0,
# one per row.
frame2['Debit'] = np.arange(5, dtype=float)

frame2

Unnamed: 0,Year,State,Population,Debit
one,2002,Santa Catarina,1.5,0.0
two,2003,Paraná,1.7,1.0
three,2004,Goiás,3.6,2.0
four,2005,Bahia,2.4,3.0
five,2006,Minas Gerais,2.9,4.0


In [0]:
# Show the underlying values as a NumPy array.
# Because the columns mix numbers and text, the array has dtype=object.
frame2.to_numpy()

array([[2002, 'Santa Catarina', 1.5, 0.0],
       [2003, 'Paraná', 1.7, 1.0],
       [2004, 'Goiás', 3.6, 2.0],
       [2005, 'Bahia', 2.4, 3.0],
       [2006, 'Minas Gerais', 2.9, 4.0]], dtype=object)

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max).
# Only the numeric columns appear in the result; the text column State is left out.
frame2.describe()

Unnamed: 0,Year,Population,Debit
count,5.0,5.0,5.0
mean,2004.0,2.42,2.0
std,1.581139,0.864292,1.581139
min,2002.0,1.5,0.0
25%,2003.0,1.7,1.0
50%,2004.0,2.4,2.0
75%,2005.0,2.9,3.0
max,2006.0,3.6,4.0


In [0]:
# Select a range of rows by their index labels.
# Unlike a positional slice, a label slice includes the end label ('four').
frame2.loc['two':'four']

Unnamed: 0,Year,State,Population,Debit
two,2003,Paraná,1.7,1.0
three,2004,Goiás,3.6,2.0
four,2005,Bahia,2.4,3.0


In [0]:
# Element-wise comparison against 3, producing a boolean DataFrame.
# The comparison is limited to the numeric columns: State holds strings, and
# comparing a str with an int raises TypeError.
frame2.select_dtypes(include='number') < 3

## Locating records in a DataFrame

In [0]:
# loc: select a single row by its index label; the row comes back as a Series
# whose index is the DataFrame's column names
frame2.loc['four']

Year           2005
State         Bahia
Population      2.4
Debit             3
Name: four, dtype: object

In [8]:
# iloc (integer location): select a row by its zero-based position,
# so 2 is the third row (label 'three')
# NOTE(review): if the saved output shows NaN for Debit, it was presumably captured
# before the Debit column was filled — re-run the notebook top to bottom to refresh it
frame2.iloc[2]

Year           2004
State         Goiás
Population      3.6
Debit           NaN
Name: three, dtype: object

## Inverting columns and indexes

In [0]:
# Sample web-traffic data: one list per column, seven values each (one per row)
web_stats = {
    'Days': [1, 2, 3, 4, 5, 6, 7],
    'Visitors': [45, 23, 67, 78, 23, 12, 14],
    'rate': [11, 22, 33, 44, 55, 66, 77],
}

In [18]:
# Build a DataFrame from the web_stats dictionary; with no index given,
# the rows get the default integer index starting at 0
df = pd.DataFrame(web_stats)

# Show DataFrame
df

Unnamed: 0,Days,Visitors,rate
0,1,45,11
1,2,23,22
2,3,67,33
3,4,78,44
4,5,23,55
5,6,12,66
6,7,14,77


In [19]:
# The values in the Days column are sequential (1 to 7),
# so that column can serve as the row index.
# The result of set_index is only displayed here; it is not assigned to any variable.
df.set_index('Days')

Unnamed: 0_level_0,Visitors,rate
Days,Unnamed: 1_level_1,Unnamed: 2_level_1
1,45,11
2,23,22
3,67,33
4,78,44
5,23,55
6,12,66
7,14,77


In [20]:
# The set_index call above doesn't change the DataFrame's structure.
# As shown below, df keeps its original layout: Days is still a regular column
# and the rows still carry the default integer index.
df.head()

Unnamed: 0,Days,Visitors,rate
0,1,45,11
1,2,23,22
2,3,67,33
3,4,78,44
4,5,23,55


In [22]:
# Select the single column Visitors; one pair of brackets returns a Series
print(df['Visitors'])

0    45
1    23
2    67
3    78
4    23
5    12
6    14
Name: Visitors, dtype: int64


In [24]:
# Now select both the Visitors and rate columns.
# Note the double brackets: the inner pair is a list of column names,
# and the result is a DataFrame rather than a Series.
print(df[['Visitors','rate']])

   Visitors  rate
0        45    11
1        23    22
2        67    33
3        78    44
4        23    55
5        12    66
6        14    77
