# Constructing Series or DataFrame

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## 2-D DataFrame

In [None]:
# build a frame from nested sequences: every inner list becomes one row,
# and both axes get default 0..n-1 labels
a = pd.DataFrame(
    data=[
        [1, 2, 'a'],
        [3, 4, 'b'],
    ],
)
a

In [None]:
# row labels (index) and column labels can be supplied explicitly
b = pd.DataFrame(
    data=[[1, 2, 'a'], [3, 4, 'b']],
    index=['row1', 'row2'],
    columns=['col1', 'col2', 'col3'],
)
b

In [None]:
print(b.index)
print(b.columns)

In [None]:
# index labels are allowed to repeat: 'row1' labels the first two rows
b = pd.DataFrame(
    data=[
        [1, 2, 'a'],
        [3, 4, 'b'],
        [5, 6, 'c'],
    ],
    index=['row1'] * 2 + ['row3'],
)
b

In [None]:
b.loc['row1'] # get two rows

In [None]:
b.loc['row3'] # reduce to Series, column names become indexes for the resulting Series

In [None]:
# from a dictionary: each key becomes a column name, each value the column data
a = dict(
    col1=[1, 2, 3],
    col2=[4, 5, 6],
    col3=[7, 8, 9],
)
pd.DataFrame(data=a)

In [None]:
# construct data frame similar to np.full()
pd.DataFrame(8, index=range(5), columns=range(5))

In [None]:
# important properties
print(b.index)
print(b.index.name) # this is None
print(b.columns)

In [None]:
# set index name
b.index.name = 'indexName'
b

In [None]:
# add a new column 
b['B'] = np.random.randint(1, 10, 3)
b

In [None]:
# add a new row
b.loc['row4'] = 8, 9, 'd', 10
b

## 1-D Series

In [None]:
# from a list or tuple
print('from list\n',  pd.Series([1, 2, 3, 4]), '\n\n')
print('from tuple\n', pd.Series((1, 2, 3, 4)))

In [None]:
# give every element its own label instead of the default 0..n-1 index
a = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
a

In [None]:
a.index

In [None]:
# from a dictionary: keys become the index labels, values the data
a = dict(a=1, b=2, c=3, d=4)
print(a, '\n')
b = pd.Series(data=a)
print(b)

In [None]:
# set the series name
b.name = 'seriesName'
b

In [None]:
# convert Series to DataFrame
b.to_frame()

# IO

In [None]:
corn = pd.read_csv('dryland_corn_acres_state.csv', index_col=0)
corn

In [None]:
help(pd.read_csv)

# Indexing and selecting

## `[]` indexing

In [None]:
# pd.date_range() is the usual way to build a datetime index;
# here it yields 8 consecutive days starting on 2000-01-01
dates = pd.date_range(start='1/1/2000', periods=8)
df = pd.DataFrame(
    data=np.random.randn(8, 4),
    index=dates,
    columns=list('ABCD'),
)
df

In [None]:
# [] indexing
s = df['A']
s

In [None]:
# multiple columns
df[['A', 'C']]

In [None]:
# Attribute access
df.A

In [None]:
# you cannot define a new column through attribute access: this only sets a plain Python attribute on the object
df.F = 1
df.F

In [None]:
# [] indexing for Series
s['2000-01-02']

In [None]:
# it works mostly the same as .loc[] for Series
s['20000102':'20000105']

In [None]:
# Integer *slices* with [] are positional on a Series, just like .iloc[].
# A single integer key on a non-integer (here: datetime) index used to fall
# back to positional lookup as well, but that fallback is deprecated in
# pandas 2.x, so select a single position with .iloc[] explicitly.
print('s.iloc[1]', s.iloc[1])
print('s[1:5]\n', s[1:5])
print('s[::2]\n', s[::2])

## `.loc[]` indexing (Selection by label)

In [None]:
# 5 x 4 frame of random normals, labelled by 5 consecutive days from 2013-01-01
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=pd.date_range(start='20130101', periods=5),
    columns=['A', 'B', 'C', 'D'],
)
df

In [None]:
# be aware that .loc label slices include the end label, so 01-04 is included
df.loc['2013-01-02':'2013-01-04']

In [None]:
# be careful when using .loc with a numeric index: the labels here run
# from 1 to 5, so they do not coincide with the 0-based positions
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=range(1, 6),
    columns=['A', 'B', 'C', 'D'],
)
df

In [None]:
# this is the first row
df.loc[1]

In [None]:
df.loc[0]

In [None]:
# slicing
df.loc[1:4]

In [None]:
# slicing on either axis
df.loc[:, 'B':]

In [None]:
# using a list of labels
df.loc[[1, 3, 5], ['B', 'D']]

In [None]:
# select the rows where column A is greater than zero using a boolean index
df.loc[df.A > 0]

In [None]:
# it can also be indexed with a function
df.loc[lambda x: x.A > 0]

In [None]:
# be careful: neither the row labels nor the column labels have to be sorted
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=[5, 3, 7, 6, 2],
    columns=['C', 'B', 'D', 'A'],
)
df

In [None]:
# what to expect?
df.loc[3:6]

In [None]:
# sort_index
df.sort_index().loc[3:6]

In [None]:
# you can also sort the columns
df.sort_index(axis='columns').loc[:,'B':'D']

## `.iloc[]` indexing (Selection by positions)

In [None]:
# five random values labelled with the even numbers 0, 2, ..., 8, so that
# labels and positions differ
s1 = pd.Series(data=np.random.randn(5), index=[0, 2, 4, 6, 8])
s1

In [None]:
# iloc index based on positions. It doesn't matter what the index values are
s1.iloc[2:4]

In [None]:
# iloc index
s1.iloc[[1, 3]]

In [None]:
# assign to a positional sub-block of the frame:
# rows at positions 2-3 and columns at positions 1-2 become NaN
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=[5, 3, 7, 6, 2],
    columns=['C', 'B', 'D', 'A'],
)
df.iloc[slice(2, 4), slice(1, 3)] = np.nan
df

In [None]:
# isin is very useful, e.g. to find Nebraska, Kansas and Colorado corn yield
corn[corn.state_alpha.isin(['NE', 'KS', 'CO'])]

In [None]:
# small mixed-type frame used to demonstrate per-column isin()
df = pd.DataFrame(
    data={
        'vals': [1, 2, 3, 4],
        'ids': ['a', 'b', 'f', 'n'],
        'ids2': ['a', 'n', 'c', 'n'],
    },
)
df

In [None]:
# per-column membership test: a row passes only when every one of its
# columns holds a value from that column's list of allowed values
values = {
    'ids': ['a', 'b'],
    'ids2': ['a', 'c'],
    'vals': [1, 3],
}
row_mask = df.isin(values).all(axis='columns')
row_mask

## Selecting data using where() and query()

In [None]:
# 5 x 4 frame of random normals with row labels 1..5
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=range(1, 6),
    columns=['A', 'B', 'C', 'D'],
)
df

In [None]:
# Selecting values from a Series with a boolean vector generally returns a subset of the data
df.A[df.A <= 0]

In [None]:
# Selecting values from a DataFrame with a boolean criterion now also preserves input data shape
df[df<0]

In [None]:
#define a replacement value when condition is false; similar to np.where
df.where(df<0, -df)

In [None]:
# you can use np.where to achieve this
df[:] = np.where(df<0, df, -df)
df

In [None]:
# sample data for query(): 10 rows of uniform random numbers in [0, 1)
df = pd.DataFrame(
    data=np.random.rand(10, 3),
    columns=['a', 'b', 'c'],
)
df

In [None]:
# get the value of the frame where column b has values between the values of columns a and c.
df[(df['a'] < df['b']) & (df['b'] < df['c'])]

In [None]:
# using query
df.query('(a < b) & (b < c)')

In [None]:
# query can also apply to index
df.query('2 <= index <= 6 ')

## Dropping rows or columns, duplicates

In [None]:
# using indexing (here: every row but the last): not very convenient
df.iloc[:-1, :]

In [None]:
# better method: drop
df.drop(3, axis='index')

In [None]:
# drop() can remove columns as well; the labels must exist in the frame,
# otherwise drop() raises KeyError (df holds columns 'a', 'b', 'c' here)
df.drop(['a', 'b'], axis='columns')

In [None]:
# sample data with repeated values: 8 random integers drawn from {1, 2, 3}
s = pd.Series(data=np.random.randint(low=1, high=4, size=8))
s

In [None]:
# you could use unique() to find the unique values
s.unique()

In [None]:
s.duplicated()

In [None]:
# remove duplicates
s[~s.duplicated()]

In [None]:
# or use the dedicated function
s.drop_duplicates()

## Set and reset index

In [None]:
# 10 x 5 frame of random integers in [1, 100) with the default 0..9 index
df = pd.DataFrame(
    data=np.random.randint(low=1, high=100, size=(10, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
)
df

In [None]:
# reset index
df.reset_index()

In [None]:
# use a data column as the new index
df.set_index('a')

## Sorting

In [None]:
b = df.sort_values('a')
b

In [None]:
# sort two columns using different orders
df.loc[:4, 'a'] = df.loc[5:, 'a'].values
df.sort_values(['a', 'b'], ascending=[False, True])

In [None]:
# sort the index
b.sort_index(ascending=False)

# Concatenate, merge, join

In [None]:
# two small frames that share some index labels (K0, K2) and one column name (E)
left = pd.DataFrame(
    data={
        'A': ['A0', 'A1', 'A2'],
        'B': ['B0', 'B1', 'B2'],
        'E': ['E0', 'E1', 'E2'],
    },
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    data={
        'C': ['C0', 'C2', 'C3'],
        'D': ['D0', 'D2', 'D3'],
        'E': ['E1', 'E2', 'E3'],
    },
    index=['K0', 'K2', 'K3'],
)

print(left, '\n')
print(right)

In [None]:
# concat simply combines two objects (stacking rows by default)
pd.concat([left, right])

In [None]:
# merge is very flexible
pd.merge(left, right, on='E')

In [None]:
# join is almost the same as merge but aligns on the index; both frames
# carry a column named 'E', so suffixes are needed to tell the two apart
# (without them join raises "columns overlap but no suffix specified")
left.join(right, how='inner', lsuffix='_left', rsuffix='_right')

# Computational

## Binary operations

In [None]:
# operands for the arithmetic examples:
# df1 is all integer ones, df2 holds random integers in [1, 10)
df1 = pd.DataFrame(data=np.ones(shape=(3, 4), dtype=int))
df2 = pd.DataFrame(data=np.random.randint(low=1, high=10, size=(3, 4)))

print(df1, '\n')
print(df2)

In [None]:
# similar to numpy
df1 + df2 

In [None]:
# 2-D broadcasting
df2 + 1

In [None]:
# broadcasting
df1 + df2.loc[0, :]

In [None]:
# the result below is interesting!
df1 + df2.loc[:, 0]

In [None]:
# arithmetic aligns on both index and columns: A and B only partly overlap
# (rows 2-3, columns a2-a3)
A = pd.DataFrame(data=1, index=[1, 2, 3], columns=['a1', 'a2', 'a3'])
B = pd.DataFrame(data=2, index=[2, 3, 4], columns=['a2', 'a3', 'a4'])

print(A, '\n')
print(B)

In [None]:
A + B

## Statistics

In [None]:
df1.sum()

In [None]:
# like numpy you can select an axis for the statistics operation
df1.sum(axis=1)

In [None]:
# load example data
df_weather = pd.read_csv('weather.data.csv')
df_weather

In [None]:
# check the param
df_weather.param.unique()

In [None]:
# quartiles and 90th percentile of the precipitation records for the Ames site.
# numeric_only=True restricts the computation to numeric columns; the frame
# also holds non-numeric columns (param and siteid are compared to strings
# in the filter), on which quantile() raises a TypeError in pandas >= 2.0.
# NOTE(review): the leading space in " Precipitation" is kept from the
# original filter, presumably matching the raw CSV values; verify.
(
    df_weather
    .query('param == " Precipitation" & siteid == "Ames"')
    .quantile([0.25, 0.5, 0.75, 0.9], numeric_only=True)
)