In [None]:
import pandas as pd
import numpy as np

# Objetos em Pandas
## Series

In [None]:
# Series built from a NumPy array; no index is given, so the default one is used.
series = pd.Series(data=np.arange(0, 1, 0.25))
series

In [None]:
series.values

In [None]:
series.index

In [None]:
series[0]

In [None]:
series[0:2]

In [None]:
# Series with an explicit, label-based index
series = pd.Series(
    data=np.arange(0, 1, 0.25),
    index=list('abcd'),
)
series

In [None]:
series[0:2]

In [None]:
series['b']

In [None]:
# Same values, now labelled with integers that do not start at zero
series = pd.Series(
    data=np.arange(0, 1, 0.25),
    index=[2, 3, 4, 5],
)
series

In [None]:
series[0:2]

In [None]:
series[4]

In [None]:
series[2]

In [None]:
# A Series can also be created from a dictionary: the keys become the index
populacao_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
populacao = pd.Series(data=populacao_dict)
populacao

In [None]:
pd.Series({2:'a', 1:'b', 3:'c'})

In [None]:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[2,1])

## DataFrame

In [None]:
# Area per state, keyed by the same state names as the population Series
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(data=area_dict)
area

In [None]:
# Combine the two Series into one DataFrame; each dict key becomes a column
estados = pd.DataFrame(
    data={
        'populacao': populacao,
        'area': area,
    }
)
estados

In [None]:
estados['area']

In [None]:
estados.index

In [None]:
estados.columns

In [None]:
estados.columns.to_list()

In [None]:
# Creating a DataFrame from a single Series
pd.DataFrame(data=populacao)

In [None]:
pd.DataFrame(populacao, columns=['pop'])

In [None]:
# List of dictionaries: one dict per row

dados = [dict(col_1=n, col_2=n * 3) for n in range(5)]
dados

In [None]:
pd.DataFrame(dados)

In [None]:
# The two records do not share all keys; the frame gets one column per distinct key
dados = [
    dict(col_1=2, col_2=4),
    dict(col_2=5, col_3=1),
]
df = pd.DataFrame(data=dados)
df

In [None]:
df['col_2']

In [None]:
# From a two-dimensional array
# NOTE(review): no seed is set in this cell, so the values differ between runs.
dados = np.random.rand(3, 2)
dados

In [None]:
# Wrap the array in a DataFrame with explicit row and column labels
pd.DataFrame(
    data=dados,
    index=['a', 'b', 'c'],
    columns=['col_1', 'col_2'],
)

In [None]:
# From a structured array: each named field becomes a column
dados = np.array(
    [
        ('Gabriel', 30, 6),
        ('Ana', 25, 4),
        ('Amanda', 27, 3.5),
    ],
    dtype=[('nome', 'U10'), ('idade', 'i4'), ('notas', 'f8')],
)

pd.DataFrame(data=dados)

## Index

In [None]:
# Build an Index and slice its first two labels
ind = pd.Index(data=[2, 3, 5, 7, 11])
ind[:2]

In [None]:
# Two overlapping indexes; show the labels present in both
indA = pd.Index(data=[1, 3, 5, 7, 9])
indB = pd.Index(data=[2, 3, 5, 7, 11])
indA.intersection(indB)

In [None]:
indA.union(indB)

In [None]:
indA.symmetric_difference(indB)

# Selecionando valores em uma Series

In [None]:
# Fresh label-indexed Series for the selection examples in this section
series = pd.Series(
    data=np.arange(0, 1, 0.25),
    index=list('abcd'),
)

In [None]:
'e' in series

In [None]:
series.keys()

In [None]:
list(series.items())

In [None]:
# 'e' is not one of the labels 'a'..'d', so this assignment adds a new entry to the Series
series['e'] = 1
series

In [None]:
series[0:2]

In [None]:
series[(series>0.3) & (series<1)]

In [None]:
series[['a', 'b']]

## loc

Referente ao índice explícito (os rótulos do índice)

In [None]:
# Integer labels 2..5: a label and a position can now mean different entries
series = pd.Series(
    data=np.arange(0, 1, 0.25),
    index=[2, 3, 4, 5],
)
series

In [None]:
series[2]

In [None]:
series[2:]

In [None]:
series.loc[2]

In [None]:
series.loc[2:]

## iloc
Referente ao índice implícito (a posição inteira)

In [None]:
series.iloc[2]

In [None]:
series.iloc[2:]

# Selecionando valor em um DataFrame

In [None]:
# Area and population per state, combined into one DataFrame
area = pd.Series(
    data={
        'California': 423967,
        'Texas': 695662,
        'New York': 141297,
        'Florida': 170312,
        'Illinois': 149995,
    }
)
pop = pd.Series(
    data={
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
        'Florida': 19552860,
        'Illinois': 12882135,
    }
)
data = pd.DataFrame(data={'area': area, 'pop': pop})
data

In [None]:
data['densidade'] = data['pop']/data['area']

In [None]:
data

In [None]:
data['area']

In [None]:
data.area

In [None]:
data.pop is data['pop']  # attribute access gives the DataFrame method named pop, not the 'pop' column

In [None]:
data.values

In [None]:
data.T

In [None]:
data.values[0]

In [None]:
data

In [None]:
data.loc[:'Florida', :'pop']

In [None]:
data.loc[:'Florida', :]

In [None]:
data.loc[:'Florida', 'area']

In [None]:
data.iloc[:3, :1]

In [None]:
# Rows with area above 400000, restricted to the 'pop' and 'area' columns
data.loc[data['area'] > 400000, ['pop', 'area']]

In [None]:
data['Florida':'Illinois']

In [None]:
data[['area', 'pop']]

In [None]:
data[1:3]

In [None]:
data[data.densidade > 100]

# Operações em objetos Pandas

In [None]:
# Seeded generator so the random examples of this section are repeatable
rng = np.random.RandomState(seed=42)
serie = pd.Series(data=rng.randint(0, 10, size=4))
serie

In [None]:
# 3x4 frame of integers drawn from the seeded generator `rng`
df = pd.DataFrame(
    data=rng.randint(0, 10, size=(3, 4)),
    columns=list('ABCD'),
)
df

In [None]:
np.exp(serie)

In [None]:
np.log(df)

## Preservação da indexação em series

In [None]:
# Two named Series whose state labels only partly overlap
area = pd.Series(
    data={
        'Alaska': 1723337,
        'Texas': 695662,
        'California': 423967,
    },
    name='area',
)
populacao = pd.Series(
    data={
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
    },
    name='populacao',
)

In [None]:
populacao/area

In [None]:
# The indexes overlap only on labels 1 and 2; the sum is aligned by label
ser1 = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
ser2 = pd.Series(data=[1, 3, 5], index=[1, 2, 3])
ser1 + ser2

In [None]:
ser1.add(ser2, fill_value=0)

## Preservação dos índices em DataFrame

In [None]:
# 2x2 frame with columns A and B, filled from the seeded generator `rng`
df1 = pd.DataFrame(
    data=rng.randint(0, 20, size=(2, 2)),
    columns=['A', 'B'],
)
df1

In [None]:
# 3x3 frame whose columns are in a different order (B, A, C) than df1's
df2 = pd.DataFrame(
    data=rng.randint(0, 10, size=(3, 3)),
    columns=['B', 'A', 'C'],
)
df2

In [None]:
df1 + df2

In [None]:
# Add df1 and df2, the two frames summed in the cell above. The original passed `df`,
# an unrelated frame from an earlier section. A value missing from exactly one of the
# frames is replaced by 5 before adding, instead of producing NaN.
df1.add(df2, fill_value=5)

In [None]:
df2

In [None]:
df2 - df2.iloc[0]

In [None]:
df2 - df2.loc[0, ['B', 'C']]

# Leitura de arquivo

In [None]:
# Bare file name: the CSV is looked up relative to the current working directory
path = 'Sales_Transactions_Dataset_Weekly.csv'

df_csv = pd.read_csv(filepath_or_buffer=path)
df_csv.head(n=5)

# Modificando nomes de coluna/index

In [None]:
# New 2x2 frame (columns A and B) used by the rename examples
df1 = pd.DataFrame(
    data=rng.randint(0, 20, size=(2, 2)),
    columns=['A', 'B'],
)

In [None]:
df1.rename(columns={'A': 'a', 'B': 'b'})

In [None]:
df1.rename(index={0: 'a', 1:'b'})

In [None]:
df1.rename(index=str).index

In [None]:
# Apply str.lower to every column label
df1.rename(columns=str.lower)

In [None]:
# Shift every row label up by one
df1.rename(index=lambda label: label + 1)

# Criando colunas novas

In [None]:
data

In [None]:
data['density'] = data['pop']/data['area']

In [None]:
# Flag the rows whose density is below 100 in a new 'marcacao' column
data.loc[data['density'] < 100, 'marcacao'] = 1
data

# Removendo duplicatas

In [None]:
# Small frame with one repeated row (2, 2) and repeated values in col_1
df = pd.DataFrame(
    data={
        'col_1': [1, 1, 2, 2, 3, 4],
        'col_2': [1, 2, 2, 2, 5, 7],
    }
)
df

In [None]:
df.drop_duplicates()

In [None]:
df.drop_duplicates(subset=['col_1'])

In [None]:
df.drop_duplicates(subset=['col_1'], keep='last')

In [None]:
df.drop_duplicates(subset=['col_1'], keep='last', ignore_index=True)

# Funcionalidade básica

In [None]:
data

In [None]:
data.shape

In [None]:
# Remove the helper column by rebinding `data` instead of mutating it in place.
# errors='ignore' keeps this cell re-runnable once 'marcacao' has already been dropped.
data = data.drop(columns='marcacao', errors='ignore')
data

In [None]:
data.head(2)

In [None]:
data.tail(2)

## Descritivas

In [None]:
data.mean()

In [None]:
# Mean across the columns, i.e. one value per row
data.mean(axis=1)

In [None]:
data.sum(axis=0)

In [None]:
data.sum(axis=1)

In [None]:
data.std()

In [None]:
data.cumsum()

In [None]:
data.quantile([0.1, 0.25, 0.5])

### Funções

|Função|Descrição|
|-------|---------|
|count|Conta o número de observações (ignora os valores NA)|
|sum|Soma dos valores|
|mean|Média dos valores|
|median|Mediana dos valores|
|min|Mínimo dos valores|
|max|Máximo dos valores|
|mode|Moda dos valores|
|abs|Valor absoluto|
|prod|Produto dos valores|
|std|Desvio-padrão|
|var|Variância|
|quantile|Quantil dos dados|
|cumsum|Soma acumulada|
|cumprod|Produto acumulado|
|cummax|Máximo acumulado|
|cummin|Mínimo acumulado|

In [None]:
data.describe()

In [None]:
data.idxmin()

In [None]:
data.idxmax()

In [None]:
# Count how often each (col_1, col_2) combination occurs
df3 = pd.DataFrame(
    data={
        'col_1': [1, 1, 1, 2],
        'col_2': ['a', 'a', 'b', 'b'],
    }
)

df3.value_counts()

In [None]:
df3.value_counts(normalize=True)

In [None]:
df3.col_1.value_counts(normalize=True)

In [None]:
# Splitting the data into intervals: two bins over the 'area' column

pd.cut(data['area'], bins=2)

In [None]:
data.quantile([0, 0.25, 0.5, 0.75, 1])

In [None]:
pd.qcut(data.area, [0, 0.25, 0.5, 0.75, 1])

In [None]:
# Work on a copy so the new 'quantil' column is not added to `data`
data_q = data.copy()

data_q['quantil'] = pd.qcut(data['area'], q=[0, 0.25, 0.5, 0.75, 1])
data_q

In [None]:
data.sort_index()

In [None]:
data.sort_values('area')

In [None]:
data.reset_index()

In [None]:
data.dtypes

In [None]:
data.reset_index().dtypes

# Dados faltantes

In [None]:
# Numeric Series whose last entry is None
df_none = pd.Series(data=[1, 2, None])
df_none

In [None]:
# Series of two integers with no missing entries
df_none = pd.Series(data=[1, 2])
df_none

In [None]:
# Assign None into the Series of integers created in the previous cell and display the result
df_none[0] = None
df_none

In [None]:
# Series of strings whose last entry is None
df_none = pd.Series(data=['a', 'b', None])
df_none

In [None]:
# Series of strings whose last entry is a float NaN
df_none = pd.Series(data=['a', 'b', float('nan')])
df_none

In [None]:
df_none.isnull()

In [None]:
df_none.notnull()

In [None]:
df_none.dropna()

In [None]:
# 3x3 frame with one NaN in the first row and one in the last row
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, 3, 4],
        [np.nan, 5, 6],
    ]
)
df

In [None]:
df.dropna()

In [None]:
df.dropna(axis='columns')

In [None]:
df.dropna(axis='columns', how='all')

In [None]:
# Add a column labelled 3 filled entirely with NaN, then display the frame
df[3] = np.nan
df

In [None]:
df.dropna(axis='columns', how='all')

In [None]:
df.dropna(axis='columns', thresh=2)

In [None]:
df.dropna(axis='index', thresh=3)  # thresh: minimum number of non-null values

In [None]:
df.fillna(0)

In [None]:
# Forward fill down each column: a NaN takes the last valid value above it.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
df.ffill()

In [None]:
# Backward fill up each column: a NaN takes the next valid value below it.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword is deprecated.
df.bfill()

In [None]:
# Forward fill along each row (axis=1): a NaN takes the last valid value to its left.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
df.ffill(axis=1)

# Multi Index

In [None]:
# Population values keyed by (state, year) tuples
index = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2010),
]
populacao = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]
pop = pd.Series(data=populacao, index=index)
pop

In [None]:
pop[('California', 2000)]

In [None]:
# Keep only the entries whose second index element (the year) is 2010
pop[[key for key in pop.index if key[1] == 2010]]

In [None]:
# NOTE(review): called with no target labels. The following cells build a MultiIndex
# and pass it to reindex, so this call may be a leftover - confirm.
pop.reindex()

In [None]:
# The same (state, year) pairs, turned into an explicit MultiIndex
index = pd.MultiIndex.from_tuples(
    [
        ('California', 2000), ('California', 2010),
        ('New York', 2000), ('New York', 2010),
        ('Texas', 2000), ('Texas', 2010),
    ]
)

In [None]:
pop = pop.reindex(index)

In [None]:
pop

In [None]:
pop[:, 2010]

In [None]:
pop['New York', :]

In [None]:
# Pivot the last index level into columns
pop_df = pop.unstack(level=-1)
pop_df

In [None]:
pop_df.stack()

In [None]:
# Passing two parallel lists as the index produces a two-level row index
df = pd.DataFrame(
    data=np.random.rand(4, 2),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=['data1', 'data2'],
)
df

In [None]:
pop

In [None]:
# Name the two index levels; later cells refer to them as 'estado' and 'ano'
pop.index.names = ['estado', 'ano']
pop

In [None]:
pop.unstack()

In [None]:
pop.unstack(level=0)

In [None]:
pop.unstack(level=0).loc[2000]

In [None]:
pop.unstack(level=1)

In [None]:
pop.unstack(level=1).loc['California']

In [None]:
df_reset = pop.reset_index(name='populacao')
df_reset

In [None]:
df_reset.set_index(['estado'])

In [None]:
df_reset.set_index(['ano', 'estado'])

In [None]:
df_reset.set_index(['ano', 'estado']).loc[2010, :]

In [None]:
# Sort by year first, then use (ano, estado) as the row index
(
    df_reset
    .sort_values(by='ano')
    .set_index(keys=['ano', 'estado'])
)

In [None]:
pop.groupby(level=1).mean()