# pandas: A Short Tutorial
-  pandas contains data structures and data manipulation tools for fast and easy data cleaning and analysis.
-  pandas is often used in tandem with NumPy, scikit-learn and matplotlib.
-  pandas is designed for working with tabular or heterogeneous data, while NumPy is best suited for working with homogeneous numerical array data.

In [None]:
# import pandas
import pandas as pd
from pandas import Series, DataFrame
# import Numpy
import numpy as np

## pandas Data Structures

### Series
-  A Series is a one-dimensional array-like object containing a sequence of values and data labels (index).
-  Attributes
    - .index returns an *Index* object
    - .values returns an ndarray

In [None]:
# a Series built from a plain list; with no index given, the labels default to 0..3
obj = Series(data=[1, -2, 3, -4])
obj

In [None]:
# a Series whose values carry explicit labels instead of the default 0..n-1
aobj = Series(
    data=[1, -2, 3, -4],
    index=list('acbd'),    # labels a, c, b, d (deliberately not in sorted order)
)
aobj

In [None]:
# Index objects are immutable: their labels cannot be modified in place
print(aobj.index)
aobj.index[1:]    # slicing an Index gives another Index holding the selected labels

In [None]:
# similar to a dictionary: `in` tests the index labels, not the values
'a' in aobj

In [None]:
# insert an object: assigning to a label that is not in the index adds a new element
aobj['e'] = 5
aobj

In [None]:
# delete an object: del removes the element with the given label in place
del aobj['e']
aobj

In [None]:
# data retrieval by label
print(aobj['c'])    # a single label gives a single value
aobj[['a', 'b', 'c']]   # a list of labels gives a Series, in the order listed

In [None]:
# slicing by label: the slice follows the order of the index, not alphabetical order,
# so with labels stored as a, c, b, d this selects a, c and b
aobj['a':'b']     # the last label is inclusive!

In [None]:
# data retrieval by integer position
# use .iloc for positions: a bare integer key such as aobj[1] on a label-indexed
# Series is deprecated in pandas 2.x and is slated to be looked up as a label
print(aobj.iloc[1])
# slicing
aobj.iloc[:2]   # the last integer position is exclusive!

In [None]:
# revising index: assigning a list to .index relabels the elements in place
# (the values and their order are unchanged)
aobj.index = ['e', 'g', 'h', 'f']
aobj

In [None]:
# .values attribute: the data of the Series without its index labels
aobj.values    # a numpy 1-d array 

In [None]:
# filtering: aobj > 0 is a boolean Series; indexing with it keeps the True positions
aobj[aobj > 0]

In [None]:
# vectorization: the multiplication is applied to every element, no loop needed
aobj * 2

In [None]:
# numpy universal function: applied element-wise; the result keeps the index labels
np.abs(aobj)

In [None]:
# converting a dictionary to a Series: keys become the index, values become the data
sdict = dict(Mike=3.4, Mary=3.8, David=3.0, Jack=2.75)
sobj = pd.Series(sdict)
sobj

In [None]:
# NaN = missing values
# 'Alex' is not a key of sdict, so its value is NaN; 'Mike' is not requested and is left out
sobj2 = pd.Series(sdict, index = ['Alex', 'David', 'Jack', 'Mary'] )
sobj2

In [None]:
# isnull() and notnull() detect missing values
print(pd.isnull(sobj2))    # top-level function: True where the value is missing
print(pd.notnull(sobj2))    # the opposite mask
sobj2.isnull()    # the same check is also available as a Series method

In [None]:
# data alignment: values are paired by index label before the arithmetic is done;
# a label that appears in only one of the two Series gives NaN in the result
(sobj + sobj2)/2

In [None]:
# .name property: both the Series itself and its index can be given a name,
# and both names appear when the Series is displayed
sobj.name = 'GPA'
sobj.index.name = 'Student'
sobj

### DataFrame
-  A DataFrame represents a (two-dimensional) table of data.
-  The DataFrame has both a row and a column index.
-  Attributes
    - .index returns the row index
    - .columns returns the column index
    - .values returns the data as an ndarray

In [None]:
# creating a DataFrame from a dictionary of equal-length lists:
# each key becomes a column label, each list the values of that column
adict = {
    'name': ['Mary', 'Betty', 'Jack', 'Scott', 'David', 'Lily'],
    'gender': ['F', 'F', 'M', 'M', 'M', 'F'],
    'gpa': [3.5, 3.1, 2.75, 3.8, 2.9, 2.5],
}
adf = DataFrame(data=adict)
adf

In [None]:
# head and tail methods: the first and the last rows (five rows by default)
print(adf.head())
adf.tail()

In [None]:
# NaN = missing values
# 'major' is not a key of adict, so that whole column is filled with NaN;
# index replaces the default 0..5 row labels with S1..S6
adf2 = DataFrame(adict, columns = ['name', 'gender', 'gpa', 'major'], 
                    index = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6'])
adf2

In [None]:
# .columns and .index
print(adf2.columns)    # the column labels
adf2.index    # the row labels

In [None]:
# a column retrieved as a Series
print(adf2['gpa'])   # by dict-like notation (works for any column label)
adf2.gpa    # by attribute (only when the label is a valid Python identifier)

In [None]:
# multiple columns: index with a list (or an Index) of column labels
print(adf2[['name', 'gpa']])
adf2[adf2.columns[1:3]]    # the labels at positions 1 and 2: 'gender' and 'gpa'

In [None]:
# a row retrieved as Series by the loc attribute
adf2.loc['S3']    # the resulting Series is indexed by the column labels

In [None]:
# multiple rows
print(adf2.loc[['S1', 'S3']])    # a list of row labels
print(adf2[1:3])    # an integer slice selects rows by position (1 and 2)
(adf2['S1':'S3'])   # the last label is inclusive!

In [None]:
# adding a column by assignment: the comparison gives a boolean Series,
# which is stored under the new column label 'honor'
adf2['honor'] = adf2.gpa >= 3.5
adf2

In [None]:
# revising a column
# the Series is matched to the rows by index label, not by the order of its values
major = Series(['MGT', 'MGT', 'ACC', 'FIN', 'FIN', 'ACC'], 
                 index = ['S1', 'S3', 'S4', 'S6', 'S2', 'S5'])
adf2['major'] = major
adf2

In [None]:
# deleting columns: del removes the column from the DataFrame in place
del adf2['honor']
adf2

In [None]:
# .values attribute returns data as a two-dimensional ndarray
# (row and column labels are not part of the array)
adf2.values

## Interacting with Data

### Indexing, Selection and Filtering
- Indexing options
    - df[val]  
        - column label or labels: a single column or subset of columns; 
        - boolean array: selected rows;
        - slicing: slicing rows
    - df.loc[val] or df.iloc[where] 
        - a single row or subset of rows
    - df.loc[:, val] or df.iloc[:, where] 
        - a single column or subset of columns
    - df.loc[val_i, val_j] or df.iloc[where_i, where_j]  
        - both rows and columns
   
   
- `.loc` selects by label; `.iloc` selects by integer position

In [None]:
# Series indexing similar to Numpy array indexing
# labels are the letters a..e, values are their positions 0..4
bobj = Series({letter: position for position, letter in enumerate('abcde')})
print(bobj)
bobj[1:3]    # positions 1 and 2; the end position is exclusive

In [None]:
# Series indexing with index values: a list of labels selects those elements
bobj[['b', 'c', 'e']]

In [None]:
# DataFrame indexing by column
# column number k (labelled a..e) holds k * row for rows 0..3
bdf = DataFrame(
    {label: [factor * row for row in range(4)]
     for factor, label in enumerate('abcde')}
)
print(bdf)
bdf[['a', 'd']]    # a list of column labels selects those columns

In [None]:
# DataFrame indexing by row: an integer slice selects rows by position
bdf[1:3]    # rows 1 and 2; the end position is exclusive

In [None]:
# DataFrame masking: bdf < 5 is a boolean DataFrame; every cell where it is True
# is overwritten with 5, modifying bdf in place
bdf[bdf < 5] = 5
bdf

In [None]:
# selecting both rows and columns with .loc: row labels first, then column labels
bdf.loc[[1, 3], ['a', 'e']]

In [None]:
# selection by integer position with .iloc: row positions first, then column positions
bdf.iloc[[1,3], [2,4]]

In [None]:
# slicing
print(bdf.loc[:3, :'d'])    # the last label is inclusive
# take the first three columns, then keep only the rows whose 'd' value is below 6
bdf.iloc[:, :3][bdf.d < 6]    # the last integer position is exclusive

In [None]:
# a single column or subset of columns: ':' in the row slot keeps every row
print(bdf.loc[:, ['a', 'b']])    # by column label
bdf.iloc[:, 0:2]    # by column position

### Arithmetic Operations and Data Alignment
- Arithmetic between objects with different indexes
    - Index in the result will be the union of the index pairs.
    - Missing values are introduced in the label locations that don't overlap.
- Operations between DataFrame and Series are defined.

In [None]:
# objects with different indexes: df2 has no column 'B'
df1 = DataFrame(dict(A=[1, 2], B=[3, 4]))
df2 = DataFrame(dict(A=[5, 6]))
print(df1, df2, sep='\n')
df1 + df2    # NA values would be introduced

In [None]:
# a filled value when an axis label is missing in an object:
# df2 has no column 'B', so 0 is used in its place instead of producing NaN
df1.add(df2, fill_value = 0)

### Function Applications
- Numpy ufuncs also work with pandas objects.
- apply()

In [None]:
# a 3 x 4 table of random normal values (no seed is set, so they differ on every run)
cdf = DataFrame(
    np.random.randn(3, 4),
    index=list('ABC'),
    columns=list('abcd'),
)
print(cdf)
np.abs(cdf)    # the Numpy ufunc is applied element-wise and the labels are kept

In [None]:
# apply() runs on each column by default: x is one column, the result is its range
print(cdf.apply(lambda x: x.max() - x.min()))
# with axis = 1, apply() runs on each row
cdf.apply(lambda x: x.max() - x.min(), axis = 1)

### Sorting and Ranking
- sort_index()
- sort_values()
- rank()

In [None]:
# sorting by row index
# rows (B, C, A) and columns (b, d, a, c) are deliberately out of order
ddf = DataFrame(
    np.arange(12).reshape(3, 4),
    index=list('BCA'),
    columns=list('bdac'),
)
print(ddf)
ddf.sort_index()    # a new DataFrame with the rows ordered A, B, C

In [None]:
# sorting by column index: axis = 1 orders the column labels instead of the row labels
ddf.sort_index(axis = 1)

In [None]:
# sorting in descending order of the row labels
ddf.sort_index(ascending = False)

In [None]:
# sorting by a column
# sort_values() returns a new, sorted DataFrame; print it so the result is visible
# (only the last expression of a cell is displayed automatically)
print(ddf.sort_values(by = 'a'))
ddf    # the original DataFrame is left unchanged

In [None]:
# sorting by multiple columns
edf = ddf.copy()    # work on a copy so that ddf is not modified
edf['d'] = 5    # every row now has the same value in column 'd'
print(edf)
# rows are ordered by 'd' first; since all 'd' values tie, 'b' decides the order
edf.sort_values(['d', 'b'], ascending = False)

In [None]:
# ranks assigned within each column; tied values share the average of their ranks by default
print(edf)
edf.rank()    # from smallest to largest

In [None]:
# ranks from largest to smallest; tied values all receive the highest rank of their group
edf.rank(method = 'max', ascending = False)   # tie-breaking method: highest rank

## Descriptive Statistics

In [None]:
# summary statistics by column
np.random.seed(12345)    # fixed seed, so the same numbers are produced on every run
fdf = DataFrame(
    np.random.randn(4, 3),
    index=list('ABCD'),
    columns=list('abc'),
)
# each reduction collapses the rows and reports one value per column
print(fdf, fdf.sum(), fdf.mean(), fdf.max(), sep='\n')
print(fdf.idxmax())   # row label at which each column reaches its maximum

In [None]:
# summary statistics by row: axis = 1 adds across the columns, giving one total per row
fdf.sum(axis = 1)

In [None]:
# multiple summary statistics at once, computed separately for each column
fdf.describe()

In [None]:
fdf.loc['A', 'c'] = 'NaN'
print(fdf)
fdf.sum()    # exclusing missing values

In [None]:
# correlation between columns a and b (Pearson coefficient by default)
fdf.a.corr(fdf.b)

In [None]:
# unique values and value counts
ser3 = Series(list('abacba'))    # one single-letter string per element
print(ser3.unique())    # the distinct values, in order of first appearance
ser3.value_counts()    # how many times each distinct value occurs

In [None]:
# membership: isin() gives a boolean Series, True where the value is in the list
print(ser3.isin(['b', 'c']))
ser3[ser3.isin(['b', 'c'])]    # use that mask to keep only the matching elements

In [None]:
# frequency distribution by column
gdf = DataFrame({'q1': [1, 3, 4, 5, 3], 'q2': [2, 2, 3, 5, 1],
                    'q3': [3, 1, 2, 4, 1]})
print(gdf)
# count the values of each column; the top-level pd.value_counts function is
# deprecated, so the Series method is passed to apply() instead.
# Rows of the result are the distinct values, columns are q1..q3; a value that
# never occurs in a column comes back as NaN and is replaced by 0.
freq = gdf.apply(pd.Series.value_counts).fillna(0)
freq

In [None]:
# groupby
gdf['q4'] = Series(list('abadb'))    # add a column of group labels
print(gdf)
# split the rows by their q4 label and take the maximum of each other column per group
gdf.groupby('q4').max()