# Introduction to Data Analysis with Python

# Objective:

* Handle tabular data with `pandas`

## `pandas`

### Getting started with pandas

In [None]:
import pandas as pd
import numpy as np

### `pandas` data structures

### Series

The base pandas abstraction. You can think of it as the love child of a numpy array and a dictionary.

In [None]:
s = pd.Series([4, 7, -5, 3])
s

If we provide an index, pandas will use it. If not, it will automatically create one.

In [None]:
print(s.index)
print(s.values)

In [None]:
s2 = pd.Series([1, 2, 4.5, 7, 2, 23, 15], index=['i', 'f', 'n', 'e', 'u', 'r', 'h'])
s2

In [None]:
s2['r']

In [None]:
s2 > 3

In [None]:
s2[s2>3]

In [None]:
evens = s2 % 2 == 0

In [None]:
s2[evens]

In [None]:
s2

In [None]:
s2 * 2

In [None]:
np.exp(s2)

In [None]:
'f' in s2

In [None]:
clase = pd.Series([35, 22, 45, 72], index=['Toni', 'Fulanito', 'Menganito', 'Victor'])

In [None]:
clase

In [None]:
clase[clase==22].index

We can create Series from dictionaries:

In [None]:
sdata = {'B' : 3e6, 'M': 6e6, 'P': 1.2e5, 'V': 7e5}

s3 = pd.Series(sdata)
s3

In [None]:
increase = {'M': 4e5, 'B' : 2e5, 'Z': -2e4}

s4 = pd.Series(increase)

And here is where the magic happens: numpy arrays only identify their contents by position. In contrast, pandas knows their "name" and will align them based on their indexes:

In [None]:
s3.values

In [None]:
s4.values

In [None]:
s3.values + s4.values

In [None]:
s3 + s4

In [None]:
s3.name = 'population_2000'
s3.index.name = 'province'

In [None]:
s3

In [None]:
s3.index

### DataFrame

This is the object you'll work with most of the time. It represents a table of _m_ observations x _n_ variables. Each variable, or column, is a Series.

In [None]:
dfdata = {
    'province' : ['M', 'M', 'M', 'B', 'B'],
    'population': [1.5e6, 2e6, 3e6, 5e5, 1.5e6],
    'year' : [1900, 1950, 2000, 1900, 2000]   
}

df = pd.DataFrame(dfdata)
df

In [None]:
df2 = pd.DataFrame(dfdata, columns=['province','population', 'year', 'debt'])
df2

In [None]:
df2['kk'] = dfdata['population']

In [None]:
df2

In [None]:
df2.index

In [None]:
df2.columns

In [None]:
df2[['population','province']]

In [None]:
df2.population

In [None]:
df2['2nd_language']=list('EEFFG')

In [None]:
df2

In [None]:
df2['2nd_language']

In [None]:
# `df2.2nd_language` is a SyntaxError: attribute-style access only works when
# the column name is a valid Python identifier. Use bracket indexing instead.
df2['2nd_language']

In [None]:
# df2['abs']

In [None]:
df2.index = list('abcde')

In [None]:
df2

In [None]:
df2.loc['a']

In [None]:
df2['debt'] = 10
df2

In [None]:
df2['debt'] = [1,0,2,.5,.7]
df2

In [None]:
df2['capital'] = df2['province'] == 'M'
df2

# df2.T

In [None]:
df2T= df2.T

In [None]:
df2T.index

In [None]:
df2

In [None]:
df2.describe()

In [None]:
df2.describe().T

### Index objects

Indexes are immutable.

In [None]:
# Index objects do not support item assignment, so this raises TypeError.
# Catch it and show the message so the demonstration does not abort a
# top-to-bottom run of the notebook.
try:
    df2.index[1] = 'x'
except TypeError as error:
    print(error)

In [None]:
df2.index[1]

In [None]:
df2.iloc[2:]

### Dropping entries from an axis

In [None]:
s5 = pd.Series(np.arange(5), list('jduvk'))
s5

In [None]:
s6 = s5.drop(['d','k'])
s6

In [None]:
s5

In [None]:
s5.drop(['d','k'],inplace=False)

In [None]:
s5

By default, `drop()` doesn't modify the original Series; it returns a modified copy. We can change that by passing `inplace=True`.

In [None]:
s5

In [None]:
s6['u'] = 7
s6

In [None]:
df2

In [None]:
df2.drop('c')

In [None]:
df2

In [None]:
df2.drop('c', axis=0)

In [None]:
df3 = df2.drop('2nd_language', axis=1)

In [None]:
df3

# Remember the df.copy() *issue*

In [None]:
df4 = df3

In [None]:
df4.drop(['a','b'],inplace=True)

In [None]:
df4

In [None]:
df3

In [None]:
df3 = df2.copy()
df3

In [None]:
df3.drop('capital', axis=1, inplace=True)
df3

In [None]:
df2

#========================================================================

### Indexing, selection, and filtering

The key here is that we can build boolean Series that we can use to index the original Series or DataFrame. Those booleans can be combined with bitwise boolean operators (&, |, ~) to get filters that are as complex as we need. 

In [None]:
s3

In [None]:
s3[['V', 'M']]

In [None]:
s3[2:]

In [None]:
s3['P':'V']

In [None]:
s3 > 1e06

In [None]:
s3[s3>1e06]

In [None]:
df3

In [None]:
df3[df3['year'] > 1950]

In [None]:
df3[(df3['year'] > 1900) | (df3['debt'] > 1)]

In [None]:
recent = df3['year'] > 1900
indebted = df3['debt'] > 1

df3[recent & indebted]

In [None]:
# Chained filtering: build the second mask from the already-filtered frame so
# its index matches the rows it is applied to (a mask built from the full df3
# would have to be reindexed, which pandas warns about).
recent_rows = df3[df3['year'] > 1900]
recent_rows[recent_rows['debt'] > 1]

### Function application and mapping

Function application and mapping allow us to modify the elements of a DataFrame (columns with `apply` or individual elements with `applymap`, which was renamed to `DataFrame.map` in pandas 2.1) without for loops. This way we are not constrained to the functions already implemented by pandas or numpy.

In [None]:
df3.drop('kk',inplace=True,axis=1)

In [None]:
df3

In [None]:
np.sqrt(df3['population'])

In [None]:
df4 = pd.DataFrame(np.random.randn(4,3) * 17 + 15, columns=list('bde'), index=list('BMPZ'))
df4

In [None]:
np.abs(df4)

This is a typical use case for lambdas (anonymous functions)

In [None]:
def my_range(series):
    """Return the spread of ``series``: its maximum minus its minimum."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest

In [None]:
def min_max(series):
    """Return the extremes of ``series`` as a two-item list ``[min, max]``."""
    lowest, highest = series.min(), series.max()
    return [lowest, highest]

The function for "apply" is expecting a series as input and an object as output.

In [None]:
df4.apply(my_range)

In [None]:
df4.apply(lambda col: col.max() - col.min())

In [None]:
df4.applymap(lambda element: element % 10 )

In [None]:
df4.apply(lambda series: series.max() - series.min(), axis=1)

In [None]:
def f(series):
    """Summarise ``series`` as a two-entry Series labelled 'max' and 'min'."""
    labels = ['max', 'min']
    extremes = [series.max(), series.min()]
    return pd.Series(extremes, index=labels)

df4.apply(f)

In [None]:
df4.apply(min_max)

In [None]:
for item in df4.items():
    print(item)

In [None]:
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column label, column Series) pairs. Unpacking makes the pair explicit.
for label, column in df4.items():
    print(label)
    print(column)

In [None]:
map(f, [1,2])

In [None]:
def format_2digits(number):
    """Format ``number`` as a string with exactly two decimal places.

    Uses an f-string rather than ``'%.2f' % number``: the ``%`` operator treats
    a tuple argument as a list of format arguments, which is an easy trap.
    """
    return f'{number:.2f}'

In [None]:
df4.applymap(format_2digits)

In [None]:
df4

### Sorting and ranking

In [None]:
df4.sort_index(ascending=False)

In [None]:
df4.sort_index(ascending=False, axis=1)

In [None]:
df4.sort_values(by='e')

In [None]:
df4.sort_values(by=['e','b'])

In [None]:
s1 = pd.Series([2,3,8,4,3,2,1], index=list('abcdefg'))
s1

In [None]:
s1.sort_values()

rank() returns the positions of the elements of the Series in its sorted version. If there are ties, it will take averages.

In [None]:
s1.rank()

method{‘average’, ‘min’, ‘max’, ‘first’, ‘dense’}, default ‘average’
How to rank the group of records that have the same value (i.e. ties):

average: average rank of the group

min: lowest rank in the group

max: highest rank in the group

first: ranks assigned in order they appear in the array

dense: like ‘min’, but rank always increases by 1 between groups.

In [None]:
# Ask for help on the method itself. Calling rank() first would pass its
# result (a Series) to help(), which documents the Series class instead.
help(pd.Series.rank)

In [None]:
pd.Series([1,1,1]).rank(method='first')

In [None]:
pd.Series([1,1,1]).rank()

In [None]:
s2 = pd.Series([30,10,20], index=list('abc'))
s2

In [None]:
s2.rank()

#### Exercise

Write a function that takes a Series and returns its top 10% of records. In this case, the top earners. Test it with this Series:

```python
salaries = pd.Series([150000, 90000, 120000,30000,10000,5000,40000, 50000, 80000, 35000, 27000,14000, 28000, 22000,25000])
```

## Summarizing and computing descriptive statistics

In [None]:
# Two Series sharing the index a-e, each containing some missing values.
x = pd.Series([1.2, np.nan, 4, np.nan, 9], index=list('abcde'))
y = pd.Series([5, 3, 7, np.nan, 14], index=list('abcde'))

# A DataFrame built from a list of Series stores each Series as a row;
# transposing turns them into the columns 'x' and 'y'.
df = pd.DataFrame([x, y], index=['x','y']).T
df

In [None]:
df.sum()

As with many methods, we can use them in the direction perpendicular to their default.

In [None]:
df.sum(axis=1)

In [None]:
df.sum(axis=1, skipna=False)

In [None]:
df.mean()

In [None]:
df.mean(axis=1)

In [None]:
df.cumsum()

In [None]:
df.std()

In [None]:
df.describe()

In [None]:
df['x'].sum()

In [None]:
df['x'].describe()

### Unique values, value counts, and membership

In [None]:
s7 = pd.Series(list('gtcaaagcttcga'))
s7

In [None]:
s7.unique()

In [None]:
s7.value_counts()

In [None]:
puric_bases = ['a','g']
s7.isin(puric_bases)

In [None]:
s7[s7.isin(puric_bases)]

## Handling missing data

In [None]:
string_data = pd.Series(['Ma', 'Lu', 'Ca', 'Va', np.nan])
string_data

In [None]:
string_data[string_data!=np.nan]

This is weird... but it has some really good reasons. You can find explanations [here](https://stackoverflow.com/questions/10034149/why-is-nan-not-equal-to-nan) and [here](https://stackoverflow.com/questions/1565164/what-is-the-rationale-for-all-comparisons-returning-false-for-ieee754-nan-values)

In [None]:
a = np.nan
a==a

In [None]:
np.nan == np.nan

In [None]:
string_data[~string_data.isnull()]

### Filtering out missing data

In [None]:
string_data[string_data.notnull()]

In [None]:
df5 = pd.DataFrame([[1,2,3], 
                    [np.nan, 8, 7], 
                    [4, np.nan, 90], 
                    [67,42,53]], 
                   columns=list('abc'))
df5

In [None]:
df5[df5['a'].notnull()]

In [None]:
df5.notnull()

`any()` and `all()` reduce boolean data to a single boolean value by repeatedly applying the operators "or" and "and", respectively. On a DataFrame they do this column by column by default, returning one boolean per column.

In [None]:
df5.notnull().any()

In [None]:
df5.notnull().all()

In [None]:
df5.isnull().any()

In [None]:
df5.dropna()

In [None]:
df5

In [None]:
df5.dropna(axis=1)

In [None]:
# 8x3 array of standard-normal draws, scaled by 20 and shifted by 100.
array = np.random.randn(8,3) * 20 + 100

df6 = pd.DataFrame(array, columns=list('xyz'), index=list('abcdefgh'))
# Positional slices: blank out rows c-e of column 'y' and rows b-c of
# column 'z' so there is missing data to drop and fill in the next cells.
df6.iloc[2:5, 1] = np.nan
df6.iloc[1:3, 2] = np.nan
df6

The `thresh` argument specifies the minimum number of non-null values required to keep a row (or a column, with `axis=1`).

In [None]:
df6.dropna(thresh=2)

In [None]:
df6.dropna(thresh=2, axis=1)

In [None]:
df6.dropna(thresh=6, axis=1)

### Filling in missing data

In [None]:
df6.fillna(0)

In [None]:
df6.fillna({'x' : 100, 'y' : 50, 'z' : 20})

In [None]:
df6

In [None]:
# Forward fill: propagate the last valid value downwards. The `method`
# keyword of fillna() is deprecated in favour of the dedicated ffill().
df6.ffill()

In [None]:
# Backward fill: propagate the next valid value upwards. The original cell
# repeated the forward fill above, so a backward fill is assumed here.
# NOTE(review): confirm that bfill was the intended demonstration.
df6.bfill()

In [None]:
df6.fillna(df6.median())

In [None]:
df6.median()

# Additional References

[Python for Data Analysis](http://shop.oreilly.com/product/0636920023784.do)

