# Principais bibliotecas

In [None]:
# Análise de Dados
import numpy as np
import pandas as pd
import statsmodels as sm
from scipy.stats import norm

# Visualização dos dados
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.linear_model import LinearRegression

# Definições de variáveis

In [None]:
a = 5
b = 10

a + b

15

In [None]:
type(a)

int

In [None]:
c = 10.5
type(c)

float

In [None]:
nome = "rafael"
type(nome)
nome.capitalize()
nome.upper()

'RAFAEL'

In [None]:
sim = True
type(sim)

bool

# Convertendo variáveis

In [None]:
valor_string = "6.59"
type(valor_string)

str

In [None]:
valor_float = float(valor_string)
type(valor_float)

float

# Manipulação de datas

In [None]:
from datetime import datetime, date, time

In [None]:
dt = datetime(2024, 2, 2, 19, 53, 0)
dt

datetime.datetime(2024, 2, 2, 19, 53)

In [None]:
type(dt)

datetime.datetime

In [None]:
dt.day

2

In [None]:
dt.date()

datetime.date(2024, 2, 2)

In [None]:
dt.time()

datetime.time(19, 53)

In [None]:
dt2 = datetime.strptime("20240202", "%Y%m%d")
dt2

datetime.datetime(2024, 2, 2, 0, 0)

In [None]:
dt.strftime("%d/%m/%Y %H:%M:%S")

'02/02/2024 19:53:00'

In [None]:
a = 5
b = 10
a ** b

9765625

# Trabalhando com módulos

In [None]:
from modulo_calculo import soma

In [None]:
soma(15, 80, False)

1200

# Estruturas de Dados Embutidas - Tuplas

In [None]:
tupla = 1, 3, 4
tupla

(1, 3, 4)

In [None]:
tupla = (1, 2, 3), (4, 5, 6)
tupla

((1, 2, 3), (4, 5, 6))

In [None]:
tupla = tuple([1, 3, 5, 7, 9])
tupla

(1, 3, 5, 7, 9)

In [None]:
tupla = tuple('string')
tupla

('s', 't', 'r', 'i', 'n', 'g')

In [None]:
tupla[1]

't'

In [None]:
tupla = tuple(['teste', [1, 2], True])
tupla

('teste', [1, 2], True)

In [None]:
tupla[1].append(3)
tupla

('teste', [1, 2, 3], True)

# Desempacotando tuplas

In [None]:
tupla = 1, 2, 3
a, b, c = tupla
print(a, b, c)

1 2 3


In [None]:
a, b = 15, 89
a

15

# Métodos de tupla

In [None]:
tupla = 1, 2, 2, 3, 4, 4, 5
tupla.count(2)

2

# Dicionários

In [None]:
dicionario = {"key": 1, "key2": 6, "key3": [1, 2, 3]}
dicionario['key']

1

In [None]:
dicionario["key2"] = 67
dicionario

{'key': 1, 'key2': 67, 'key3': [1, 2, 3]}

In [None]:
del dicionario["key3"]
dicionario

{'key': 1, 'key2': 67}

In [None]:
dicionario.keys()

dict_keys(['key', 'key2'])

In [None]:
dicionario.values()

dict_values([1, 67])

# Criando dicionários a partir de sequências

In [None]:
# Parallel sequences: key_list[i] is paired with value_list[i].
key_list = ["1", "2", "3"]
value_list = ["A", "B", "C"]

# dict() accepts an iterable of (key, value) pairs, so zip() builds the
# mapping directly without an explicit accumulation loop.
dicionario_final = dict(zip(key_list, value_list))

dicionario_final['2']

'B'

# Retornando diversos objetos

In [None]:
def funcao_retorna_varios_objetos():
  """Return three integers at once, packed into a single tuple."""
  # One unpacking assignment sets all three locals.
  a, b, c = 5, 6, 7
  resultado = (a, b, c)
  return resultado

funcao_retorna_varios_objetos()

(5, 6, 7)

In [None]:
def funcao_retorna_dicionario():
  """Return three integers labelled by name in a dictionary."""
  # Keyword arguments to dict() become string keys, in the order given.
  valores = dict(a=5, b=6, c=7)
  return valores

funcao_retorna_dicionario()

{'a': 5, 'b': 6, 'c': 7}

# Métodos e bibliotecas

In [None]:
estados = [' Alabama', 'Georgia!', 'Georgia', 'georgia', 'FlOrida', 'south carolina##', 'West Virginia?']

In [None]:
import re

def limpar_strings(strings):
  """Return a new list with each string cleaned up.

  Each item has surrounding whitespace stripped, every '!', '#' and '?'
  removed, and is then title-cased. The input sequence is not modified.
  """
  return [re.sub('[!#?]', '', texto.strip()).title() for texto in strings]

limpar_strings(estados)

['Alabama',
 'Georgia',
 'Georgia',
 'Georgia',
 'Florida',
 'South Carolina',
 'West Virginia']

In [None]:
def remover_pontuacao(valor):
  """Return ``valor`` with every '!', '#' and '?' character removed."""
  padrao = re.compile('[!#?]')
  return padrao.sub('', valor)

ops = [str.strip, remover_pontuacao, str.title]

def limpar_strs(strings, ops):
  """Apply every callable in ``ops``, in order, to each item of ``strings``.

  Each callable receives the result of the previous one. Returns a new
  list with one transformed value per input item.
  """
  def aplicar_etapas(texto):
    # Thread the value through the pipeline, one step at a time.
    for etapa in ops:
      texto = etapa(texto)
    return texto

  return [aplicar_etapas(item) for item in strings]

limpar_strs(estados, ops)

['Alabama',
 'Georgia',
 'Georgia',
 'Georgia',
 'Florida',
 'South Carolina',
 'West Virginia']

In [None]:
for x in map(remover_pontuacao, estados):
  print(x)

 Alabama
Georgia
Georgia
georgia
FlOrida
south carolina
West Virginia


# Funções anônimas lambda

In [None]:
def minha_funcao(x):
  """Return ``x`` multiplied by 2."""
  dobro = x * 2
  return dobro

equivalente_lambda = lambda x: x * 2
equivalente_lambda

<function __main__.<lambda>(x)>

In [None]:
def adiciona_na_lista(lista, f):
  """Return a new list holding ``f(x)`` for every ``x`` in ``lista``."""
  transformados = []
  for elemento in lista:
    transformados.append(f(elemento))
  return transformados

ints = [4, 0, 1, 3, 5]
adiciona_na_lista(ints, lambda x: x * 2)

[8, 0, 2, 6, 10]

In [None]:
strings = ['aaaa', 'bb', 'abdc', 'e']

In [None]:
# Sort in place by the number of distinct characters in each string.
# set() accepts a string directly, so no intermediate list is needed.
strings.sort(key=lambda x: len(set(x)))
strings

['aaaa', 'bb', 'e', 'abdc']

# Erros e tratamentos de exceção

In [None]:
def cast_float(x):
  """Convert ``x`` to float when possible; otherwise return ``x`` unchanged.

  A ValueError or TypeError from ``float(x)`` is treated as "not
  convertible". Any other exception propagates.
  """
  try:
    convertido = float(x)
  except (ValueError, TypeError):
    # Not convertible: hand the original object back untouched.
    return x
  return convertido

'teste'

In [None]:
cast_float('1.123')

1.123

In [None]:
cast_float('teste')

'teste'

In [None]:
cast_float((1, 2))

(1, 2)

# Arquivos e sistemas operacionais

In [None]:
path = "texto_python.txt"
f = open(path)
f

<_io.TextIOWrapper name='texto_python.txt' mode='r' encoding='UTF-8'>

In [None]:
for linha in f:
  print(linha)
  pass

Python é uma linguagem de programação de alto nível interpretada de script,

imperativa, orientada a objetos, funcional, de tipagem dinâmica e forte.

Foi lançada por Guido van Rossum em 1991.



Atualmente, possui um modelo de desenvolvimento comunitário, aberto e

gerenciado pela organização sem fins lucrativos Python Software Foundation.



Apesar de várias partes da linguagem possuírem padrões e especificações 

formais, a linguagem, como um todo, não é formalmente especificada.

O padrão de fato é a implementação CPython.


In [None]:
# Read every line of the file, removing trailing whitespace (including the
# newline) from each one.
# NOTE: the file object returned by open(path) is not bound to a name here,
# so this cell never closes it explicitly.
linhas = [x.rstrip() for x in open(path)]
linhas

['Python é uma linguagem de programação de alto nível interpretada de script,',
 'imperativa, orientada a objetos, funcional, de tipagem dinâmica e forte.',
 'Foi lançada por Guido van Rossum em 1991.',
 '',
 'Atualmente, possui um modelo de desenvolvimento comunitário, aberto e',
 'gerenciado pela organização sem fins lucrativos Python Software Foundation.',
 '',
 'Apesar de várias partes da linguagem possuírem padrões e especificações',
 'formais, a linguagem, como um todo, não é formalmente especificada.',
 'O padrão de fato é a implementação CPython.']

In [None]:
f.close()
f.closed

True

In [None]:
with open(path) as arquivo:
  linhas2 = [x.rstrip() for x in arquivo]

linhas2

['Python é uma linguagem de programação de alto nível interpretada de script,',
 'imperativa, orientada a objetos, funcional, de tipagem dinâmica e forte.',
 'Foi lançada por Guido van Rossum em 1991.',
 '',
 'Atualmente, possui um modelo de desenvolvimento comunitário, aberto e',
 'gerenciado pela organização sem fins lucrativos Python Software Foundation.',
 '',
 'Apesar de várias partes da linguagem possuírem padrões e especificações',
 'formais, a linguagem, como um todo, não é formalmente especificada.',
 'O padrão de fato é a implementação CPython.']

In [None]:
# The text contains a non-ASCII character ('é'), so the encoding is given
# explicitly instead of relying on the platform's default.
with open('texto_novo_python.txt', 'w', encoding='utf-8') as arquivo2:
  arquivo2.write('Esse é o meu arquivo de teste criado com Python!')

# Numpy

In [None]:
import numpy as np

In [None]:
meu_array = np.arange(1000000)
minha_lista = list(range(1000000))

In [None]:
%time for _ in range(10): meu_array2 = meu_array * 2

CPU times: user 16.6 ms, sys: 0 ns, total: 16.6 ms
Wall time: 18.1 ms


In [None]:
%time for _ in range(10): minha_lista2 = [x * 2 for x in minha_lista]

CPU times: user 1.13 s, sys: 155 ms, total: 1.28 s
Wall time: 1.33 s


# Ndarray

In [None]:
import numpy as np

data = np.random.randn(2, 3)
data

array([[0.16626659, 1.01967205, 1.13543246],
       [0.24958961, 0.30466958, 1.15415381]])

In [None]:
data * 10

array([[ 1.66266588, 10.19672054, 11.35432456],
       [ 2.49589612,  3.04669579, 11.54153809]])

In [None]:
data + data

array([[0.33253318, 2.03934411, 2.27086491],
       [0.49917922, 0.60933916, 2.30830762]])

# Shape e Dtype

In [None]:
data.shape

(2, 3)

In [None]:
data.dtype

dtype('float64')

# Aritmética com Arrays NumPy

In [None]:
import numpy as np

In [None]:
arr = np.array([[1., 2., 3. ], [4., 5., 6.]])
arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [None]:
arr * arr

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [None]:
arr - arr

array([[0., 0., 0.],
       [0., 0., 0.]])

In [None]:
1 / arr

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [None]:
arr ** 0.5

array([[1.        , 1.41421356, 1.73205081],
       [2.        , 2.23606798, 2.44948974]])

In [None]:
arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])
arr2

array([[ 0.,  4.,  1.],
       [ 7.,  2., 12.]])

In [None]:
arr > arr2

array([[ True, False,  True],
       [False,  True, False]])

# Indexação básica e fatiamento

In [None]:
import numpy as np

In [None]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
arr[5]

5

In [None]:
arr[5:8]

array([5, 6, 7])

In [None]:
arr[5:8] = 12
arr

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

# Views

In [None]:
arr_slice = arr[5:8]
arr_slice

array([12, 12, 12])

In [None]:
arr_slice[1] = 64
arr_slice

array([12, 64, 12])

In [None]:
arr

array([ 0,  1,  2,  3,  4, 12, 64, 12,  8,  9])

In [None]:
arr_slice[:] = 64
arr_slice

array([64, 64, 64])

In [None]:
arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

# Criando cópias

In [None]:
arr_copia = arr[5:8].copy()
arr_copia

array([64, 64, 64])

In [None]:
arr_copia[:] = 128
arr_copia
arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

# Fatiamento em Arrays Multidimensionais

In [None]:
import numpy as np

In [None]:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [None]:
arr2d[0][2]

3

In [None]:
arr2d[0,2]

3

In [None]:
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7,8,9], [10, 11, 12]]])
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [None]:
arr3d[0]

array([[1, 2, 3],
       [4, 5, 6]])

In [None]:
old_values = arr3d[0].copy()
old_values

array([[1, 2, 3],
       [4, 5, 6]])

In [None]:
arr3d[0] = 42
arr3d

array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [None]:
arr3d[0] = old_values
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

# Indexação com fatias

In [None]:
import numpy as np

In [None]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
arr[:4]

array([0, 1, 2, 3])

# Indexação booleana

In [None]:
nomes = np.array(['Maria', 'José', 'João'])
data = np.random.randn(3, 4)
data

array([[-0.97995866, -0.41294355,  1.09517432, -0.34900146],
       [-1.09250208,  0.44616518, -0.32483068,  0.46664739],
       [ 0.61411169,  0.53189782,  0.5624679 , -1.06531089]])

In [None]:
nomes == 'Maria'

array([ True, False, False])

In [None]:
data[nomes == 'Maria']

array([[-0.97995866, -0.41294355,  1.09517432, -0.34900146]])

In [None]:
data[~(nomes == 'José')]

array([[-0.97995866, -0.41294355,  1.09517432, -0.34900146],
       [ 0.61411169,  0.53189782,  0.5624679 , -1.06531089]])

In [None]:
data[data > 0]

array([1.09517432, 0.44616518, 0.46664739, 0.61411169, 0.53189782,
       0.5624679 ])

In [None]:
selecao1 = (nomes == 'José') | (nomes == 'Maria')
data[selecao1]

array([[-0.97995866, -0.41294355,  1.09517432, -0.34900146],
       [-1.09250208,  0.44616518, -0.32483068,  0.46664739]])

# Transposição de arrays e troca de eixos

In [None]:
import numpy as np

In [None]:
arr = np.arange(15).reshape((3, 5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [None]:
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [None]:
arr = np.arange(16).reshape((2, 2, 4))
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [None]:
arr[0]

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [None]:
arr.transpose((1, 0, 2))

array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])

# Vetorização - Programação orientada a arrays e lógica condicional

In [None]:
import numpy as np

In [None]:
x_arr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
y_arr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

In [None]:
resultado = [(x if c else y) for x, y, c in zip(x_arr, y_arr, cond)]
resultado

[1.1, 2.2, 1.3, 1.4, 2.5]

In [None]:
resultado2 = np.where(cond, x_arr, y_arr)
resultado2

array([1.1, 2.2, 1.3, 1.4, 2.5])

In [None]:
arr = np.random.randn(4, 4)
arr

array([[ 1.43945338,  0.44085928, -0.16837832, -0.19772391],
       [-1.06321887,  0.30988008, -0.54368723, -0.43773084],
       [-0.75761493,  0.27889073, -0.81513356,  0.24836226],
       [-0.65850475,  0.34709213,  2.27165908, -0.37016094]])

In [None]:
arr > 0

array([[ True,  True, False, False],
       [False,  True, False, False],
       [False,  True, False,  True],
       [False,  True,  True, False]])

In [None]:
np.where(arr > 0, 2, -2)

array([[ 2,  2, -2, -2],
       [-2,  2, -2, -2],
       [-2,  2, -2,  2],
       [-2,  2,  2, -2]])

# Métodos matemáticos e estatísticos com NumPy

In [None]:
import numpy as np

In [None]:
arr = np.random.randn(5, 4)
arr

array([[ 0.83910503,  1.07537709, -1.33369843, -0.87101611],
       [ 0.54259882, -1.84086613, -0.24160582,  1.60173509],
       [ 0.63187085,  0.4178263 , -0.49270787, -0.67224879],
       [ 0.97118568,  0.55247182, -0.70702909, -1.33014683],
       [ 0.06301932, -0.26104731,  0.68785716,  0.08069104]])

In [None]:
arr.mean()

-0.014331409090758784

In [None]:
np.mean(arr)

-0.014331409090758784

In [None]:
arr.sum()

-0.2866281818151757

In [None]:
arr.max()

1.6017350859014812

In [None]:
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7])
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [None]:
arr.cumsum()

array([ 0,  1,  3,  6, 10, 15, 21, 28])

# Ordenação, unicidade e outras lógicas

In [None]:
import numpy as np

In [None]:
arr = np.random.randn(6)
arr

array([ 0.26986728,  0.38606375, -0.8193524 , -0.54928886, -0.56528204,
       -0.82945066])

In [None]:
arr.sort()
arr

array([-0.82945066, -0.8193524 , -0.56528204, -0.54928886,  0.26986728,
        0.38606375])

In [None]:
# NOTE(review): np.sort(arr) returns the values in ascending order and the
# unary minus then negates each one, so this displays sign-flipped values,
# not `arr` in descending order. If a descending sort was intended,
# np.sort(arr)[::-1] would do that — confirm intent.
-np.sort(arr)

array([ 0.82945066,  0.8193524 ,  0.56528204,  0.54928886, -0.26986728,
       -0.38606375])

In [None]:
arr = np.array(['Maria', 'Maria', 'João', 'Pedro', 'Pedro', 'José'])
np.unique(arr)

array(['José', 'João', 'Maria', 'Pedro'], dtype='<U5')

In [None]:
arr = np.array([1, 3, 4, 4, 5, 6, 7, 7, 1])
np.unique(arr)

array([1, 3, 4, 5, 6, 7])

# Álgebra Linear

In [None]:
import numpy as np

In [None]:
x = np.array([[1., 2., 3.], [4., 5., 6.]])
y = np.array([[6., 23.], [-1, 7], [8, 9]])

In [None]:
x

array([[1., 2., 3.],
       [4., 5., 6.]])

In [None]:
y

array([[ 6., 23.],
       [-1.,  7.],
       [ 8.,  9.]])

In [None]:
x.dot(y)

array([[ 28.,  64.],
       [ 67., 181.]])

In [None]:
np.dot(x, y)

array([[ 28.,  64.],
       [ 67., 181.]])

In [None]:
x @ y

array([[ 28.,  64.],
       [ 67., 181.]])

In [None]:
from numpy.linalg import inv, qr

In [None]:
X = np.random.randn(5, 5)

In [None]:
mat = X.T.dot(X)

In [None]:
mat

array([[ 4.62106102, -0.5514065 ,  0.32722938,  2.36986224,  1.62173877],
       [-0.5514065 ,  3.12993105,  0.22872611, -1.1593148 , -1.71185194],
       [ 0.32722938,  0.22872611,  2.13344221,  1.16216613, -0.01546612],
       [ 2.36986224, -1.1593148 ,  1.16216613,  2.37382535,  1.51311416],
       [ 1.62173877, -1.71185194, -0.01546612,  1.51311416,  2.54690827]])

In [None]:
inv(mat)

array([[ 0.75754908, -0.3593459 ,  0.5900497 , -1.22565616,  0.00784865],
       [-0.3593459 ,  0.70397441, -0.42683518,  0.74957335,  0.25406239],
       [ 0.5900497 , -0.42683518,  1.3312717 , -1.66112588,  0.33235496],
       [-1.22565616,  0.74957335, -1.66112588,  3.23834157, -0.64973638],
       [ 0.00784865,  0.25406239,  0.33235496, -0.64973638,  0.94642368]])

In [None]:
mat.dot(inv(mat))

array([[ 1.00000000e+00,  1.16504685e-16, -4.30872730e-16,
         5.72244819e-16,  8.75778368e-17],
       [-2.50433033e-16,  1.00000000e+00,  9.27999537e-17,
         6.55680559e-16, -2.54019449e-16],
       [ 1.40605613e-16, -1.23435659e-16,  1.00000000e+00,
        -1.13050186e-16,  1.49240493e-16],
       [-1.33304264e-16,  1.50551744e-16, -3.51501838e-16,
         1.00000000e+00, -5.83017707e-17],
       [-1.19421639e-16,  2.62383193e-16, -4.81895581e-16,
         4.32872368e-16,  1.00000000e+00]])

In [None]:
q, r = qr(mat)

In [None]:
q

array([[-0.84352315, -0.2815955 ,  0.22904011,  0.39581818,  0.00642352],
       [ 0.10065311, -0.87883369,  0.00576449, -0.41743491,  0.20793068],
       [-0.05973207, -0.09967758, -0.90059052,  0.31850473,  0.27200718],
       [-0.43259192,  0.1165995 , -0.36398303, -0.6196909 , -0.53175966],
       [-0.29603033,  0.35330477,  0.06293219, -0.42850203,  0.77457558]])

In [None]:
r

array([[-5.47828594,  1.77476945, -0.87860373, -3.65996624, -2.94787664],
       [ 0.        , -3.35819511, -0.37577116,  1.04703775,  2.12556365],
       [ 0.        ,  0.        , -2.26907277, -1.27933371, -0.01496145],
       [ 0.        ,  0.        ,  0.        , -0.327282  , -0.67744401],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.81842371]])

# Series e Dataframes

In [20]:
import pandas as pd

In [3]:
lista = [4, 7, -5, 3]
lista

[4, 7, -5, 3]

In [4]:
obj = pd.Series(lista)
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
obj.values

array([ 4,  7, -5,  3])

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj[1]

7

In [8]:
obj[obj > 2]

0    4
1    7
3    3
dtype: int64

In [9]:
obj * 2

0     8
1    14
2   -10
3     6
dtype: int64

In [10]:
sdata = {'São Paulo': 8000, 'Bahia': 4544, 'Rio de Janeiro': 7899, 'Minas Gerais': 9099}
sdata

{'São Paulo': 8000,
 'Bahia': 4544,
 'Rio de Janeiro': 7899,
 'Minas Gerais': 9099}

In [11]:
obj2 = pd.Series(sdata)
obj2

São Paulo         8000
Bahia             4544
Rio de Janeiro    7899
Minas Gerais      9099
dtype: int64

In [12]:
estados = ['Bahia', 'Rio de Janeiro', 'Rio Grande do Sul', 'São Paulo']
estados

['Bahia', 'Rio de Janeiro', 'Rio Grande do Sul', 'São Paulo']

In [13]:
obj2 = pd.Series(sdata, index=estados)
obj2

Bahia                4544.0
Rio de Janeiro       7899.0
Rio Grande do Sul       NaN
São Paulo            8000.0
dtype: float64

In [14]:
pd.isnull(obj2)

Bahia                False
Rio de Janeiro       False
Rio Grande do Sul     True
São Paulo            False
dtype: bool

In [15]:
pd.notnull(obj2)

Bahia                 True
Rio de Janeiro        True
Rio Grande do Sul    False
São Paulo             True
dtype: bool

In [16]:
~pd.isnull(obj2)

Bahia                 True
Rio de Janeiro        True
Rio Grande do Sul    False
São Paulo             True
dtype: bool

In [17]:
obj2.isnull()

Bahia                False
Rio de Janeiro       False
Rio Grande do Sul     True
São Paulo            False
dtype: bool

# Dataframe

In [18]:
data = {
    'estados': ['SP', 'SP', 'SP', 'MG', 'MG', 'MG'],
    'ano': [2021, 2022, 2023, 2021, 2022, 2023],
    'taxa': [1.5, 1.8, 2.5, 1.9, 1.6, 4.1]
}
data

{'estados': ['SP', 'SP', 'SP', 'MG', 'MG', 'MG'],
 'ano': [2021, 2022, 2023, 2021, 2022, 2023],
 'taxa': [1.5, 1.8, 2.5, 1.9, 1.6, 4.1]}

In [22]:
analise = pd.DataFrame(data)

In [23]:
analise.head()

Unnamed: 0,estados,ano,taxa
0,SP,2021,1.5
1,SP,2022,1.8
2,SP,2023,2.5
3,MG,2021,1.9
4,MG,2022,1.6


In [24]:
analise.tail()

Unnamed: 0,estados,ano,taxa
1,SP,2022,1.8
2,SP,2023,2.5
3,MG,2021,1.9
4,MG,2022,1.6
5,MG,2023,4.1


In [26]:
analise = pd.DataFrame(data, columns=['ano', 'estados', 'taxa'])
analise.head()

Unnamed: 0,ano,estados,taxa
0,2021,SP,1.5
1,2022,SP,1.8
2,2023,SP,2.5
3,2021,MG,1.9
4,2022,MG,1.6



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [27]:
analise = pd.DataFrame(data, columns=['ano', 'estados', 'taxa', 'débito'])
analise.head()

Unnamed: 0,ano,estados,taxa,débito
0,2021,SP,1.5,
1,2022,SP,1.8,
2,2023,SP,2.5,
3,2021,MG,1.9,
4,2022,MG,1.6,


In [32]:
analise.columns = ['ano', 'estado', 'taxa', 'debto']
analise.head()

Unnamed: 0,ano,estado,taxa,debto
0,2021,SP,1.5,
1,2022,SP,1.8,
2,2023,SP,2.5,
3,2021,MG,1.9,
4,2022,MG,1.6,


In [34]:
analise['estado']

0    SP
1    SP
2    SP
3    MG
4    MG
5    MG
Name: estado, dtype: object

In [35]:
del analise['debto']

In [36]:
analise.columns

Index(['ano', 'estado', 'taxa'], dtype='object')

In [37]:
analise

Unnamed: 0,ano,estado,taxa
0,2021,SP,1.5
1,2022,SP,1.8
2,2023,SP,2.5
3,2021,MG,1.9
4,2022,MG,1.6
5,2023,MG,4.1


In [38]:
dados = {
    'São Paulo': {
        2022: 5.6,
        2023: 5.8
    },
    'Minas Gerais': {
        2022: 4.5,
        2023: 5.0
    }
}

dados

{'São Paulo': {2022: 5.6, 2023: 5.8}, 'Minas Gerais': {2022: 4.5, 2023: 5.0}}

In [49]:
analise2 = pd.DataFrame(dados)
analise2.head()

Unnamed: 0,São Paulo,Minas Gerais
2022,5.6,4.5
2023,5.8,5.0


In [50]:
# Rebuild from `dados` and move the year index into a regular column
# (named 'index'). Assigning a fresh frame instead of mutating in place
# keeps this cell idempotent: re-running it always yields the same three
# columns, rather than adding another index column each time.
analise2 = pd.DataFrame(dados).reset_index()

In [51]:
analise2.columns

Index(['index', 'São Paulo', 'Minas Gerais'], dtype='object')

In [52]:
analise2.columns = ['ANO', 'SP', 'MG']

In [53]:
analise2.head()

Unnamed: 0,ANO,SP,MG
0,2022,5.6,4.5
1,2023,5.8,5.0


# Indexação, Seleção e Filtragem com Pandas

In [55]:
import pandas as pd
import numpy as np

In [56]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [57]:
obj['b']

1.0

In [59]:
# Positional access: `obj` is labelled with strings ('a'..'d'), so the second
# element is requested explicitly by position with .iloc. Passing a bare
# integer to [] on a non-integer index relies on a positional fallback that
# pandas has deprecated.
obj.iloc[1]

1.0

In [60]:
obj[:2]

a    0.0
b    1.0
dtype: float64

In [61]:
obj[2:]

c    2.0
d    3.0
dtype: float64

In [62]:
obj[obj > 2]

d    3.0
dtype: float64

In [63]:
obj[obj > 2] = 88

In [65]:
obj

a     0.0
b     1.0
c     2.0
d    88.0
dtype: float64

In [72]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['AC', 'AL', 'AP', 'AM'], columns=['Um', 'Dois', 'Três', 'Quatro'])
data.head()

Unnamed: 0,Um,Dois,Três,Quatro
AC,0,1,2,3
AL,4,5,6,7
AP,8,9,10,11
AM,12,13,14,15


In [67]:
data['Dois']

AC     1
AL     5
AP     9
AM    13
Name: Dois, dtype: int64

In [68]:
data[['Dois', 'Três']]

Unnamed: 0,Dois,Três
AC,1,2
AL,5,6
AP,9,10
AM,13,14


In [69]:
data[data['Dois'] >= 3]

Unnamed: 0,Um,Dois,Três,Quatro
AL,4,5,6,7
AP,8,9,10,11
AM,12,13,14,15


In [71]:
# Assign through a boolean row mask: every row where 'Dois' >= 3 is
# overwritten with -3 in all columns. This is done on a copy so that the
# original `data` keeps its values for the cells below, whose recorded
# outputs show the unmodified frame.
data_alterado = data.copy()
data_alterado[data_alterado['Dois'] >= 3] = -3
data_alterado

Unnamed: 0,Um,Dois,Três,Quatro
AC,0,1,2,3
AL,-3,-3,-3,-3
AP,-3,-3,-3,-3
AM,-3,-3,-3,-3


# loc e iloc

In [74]:
data.loc['AP', ['Três', 'Dois']]

Três    10
Dois     9
Name: AP, dtype: int64

In [75]:
data.iloc[:2]

Unnamed: 0,Um,Dois,Três,Quatro
AC,0,1,2,3
AL,4,5,6,7


In [78]:
data.loc[(data['Três'] > 2), 'Quatro'] = 99
data

Unnamed: 0,Um,Dois,Três,Quatro
AC,0,1,2,3
AL,4,5,6,99
AP,8,9,10,99
AM,12,13,14,99


# Aplicação de funções e mapeamento com Pandas

In [79]:
import pandas as pd
import numpy as np

In [80]:
df = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['PA', 'MA', 'CE', 'BA'])
df.head()

Unnamed: 0,b,d,e
PA,-2.257294,-0.93982,1.766775
MA,0.557383,1.041378,1.341642
CE,0.006513,-0.358762,0.559001
BA,-0.482069,-0.604513,-1.640279


In [82]:
np.abs(df)

Unnamed: 0,b,d,e
PA,2.257294,0.93982,1.766775
MA,0.557383,1.041378,1.341642
CE,0.006513,0.358762,0.559001
BA,0.482069,0.604513,1.640279


In [84]:
f = lambda x: x.max() - x.min()

In [85]:
df.apply(f)

b    2.814677
d    1.981198
e    3.407054
dtype: float64

In [86]:
df.apply(f, axis='columns')

PA    4.024069
MA    0.784259
CE    0.917763
BA    1.158210
dtype: float64

In [87]:
def f(x):
  """Return a Series with the minimum and maximum of ``x``, labelled 'min' and 'max'."""
  menor, maior = x.min(), x.max()
  return pd.Series([menor, maior], index=['min', 'max'])

In [88]:
df.apply(f)

Unnamed: 0,b,d,e
min,-2.257294,-0.93982,-1.640279
max,0.557383,1.041378,1.766775


In [89]:
# Render a number as a string with two decimal places.
# NOTE: binding this to the name `format` shadows Python's built-in format()
# in the notebook namespace from this cell onward.
format = lambda x: '%.2f' % x

In [90]:
df.head()

Unnamed: 0,b,d,e
PA,-2.257294,-0.93982,1.766775
MA,0.557383,1.041378,1.341642
CE,0.006513,-0.358762,0.559001
BA,-0.482069,-0.604513,-1.640279


In [91]:
df.applymap(format)

Unnamed: 0,b,d,e
PA,-2.26,-0.94,1.77
MA,0.56,1.04,1.34
CE,0.01,-0.36,0.56
BA,-0.48,-0.6,-1.64


In [92]:
df['e'].apply(format)

PA     1.77
MA     1.34
CE     0.56
BA    -1.64
Name: e, dtype: object

# Ordenação

In [93]:
import pandas as pd
import numpy as np

In [94]:
df = pd.DataFrame(np.arange(8).reshape((2, 4)),
                  index=['RO', 'AC'],
                  columns=['d', 'a', 'b', 'c'])

df.head()

Unnamed: 0,d,a,b,c
RO,0,1,2,3
AC,4,5,6,7


In [95]:
df.sort_index()

Unnamed: 0,d,a,b,c
AC,4,5,6,7
RO,0,1,2,3


In [96]:
df.sort_index(ascending=False)

Unnamed: 0,d,a,b,c
RO,0,1,2,3
AC,4,5,6,7


In [97]:
df2 = pd.DataFrame(
    {
        'b': [4, 7, -3, 2],
        'a': [0, 1, 0, 1]
    }
)

df2.head()

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [98]:
df2.sort_values('b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [99]:
df2.sort_values('b', ascending=False)

Unnamed: 0,b,a
1,7,1
0,4,0
3,2,1
2,-3,0


# Resumindo e calculando estatísticas descritivas com Pandas

In [100]:
import pandas as pd
import numpy as np

In [103]:
df = pd.DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3]
    ],
    index = ['a', 'b', 'c', 'd'],
    columns = ['MG', 'PR']
)

df.head()

Unnamed: 0,MG,PR
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [104]:
df.sum()

MG    9.25
PR   -5.80
dtype: float64

In [105]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [106]:
df.mean()

MG    3.083333
PR   -2.900000
dtype: float64

In [110]:
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [112]:
df.head()

Unnamed: 0,MG,PR
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [111]:
df.cumsum()

Unnamed: 0,MG,PR
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [113]:
df.describe()

Unnamed: 0,MG,PR
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3
