# pandas

In [1]:
import numpy as np

import pandas as pd
from pandas import Series, DataFrame

## Series

In [2]:
s = Series([4, 7, 5, 3])
s

In [3]:
s.values

In [4]:
s.index

In [5]:
s.index = [1, 2, 3, 4] # naming index manually
s

In [6]:
s = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c']) # specifying index
s

In [7]:
s['a'], s['d']

In [8]:
s[s > 0] # filtering

In [9]:
s * 2

In [10]:
np.exp(s)

In [11]:
# `in` on a Series tests the index labels, not the values, so `4 in s` is False here.
# Check a label directly, and go through `.values` to test for a stored value.
'b' in s, 4 in s.values

In [12]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

s1 = Series(sdata) # using dict to create a Series
s1

In [13]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
s2 = Series(sdata, index=states) # build from sdata using only the labels in `states`: 'California' is not a key of sdata so it becomes NaN, and 'Utah' is left out
s2

In [14]:
pd.isnull(s2) # detect missing data: s2 holds NaN for 'California', whereas s has no missing values

In [15]:
pd.notnull(s2) # inverse of isnull: True wherever s2 has a value

In [16]:
s1 + s2 # arithmetic aligns on index labels; a label missing (or NaN) on either side gives NaN in the result

In [17]:
s2.name = 'population'
s2

In [18]:
s2.index.name = 'state'
s2

In [19]:
s

## DataFrame

In [20]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = DataFrame(data)
frame

In [21]:
frame.shape

In [22]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt']) # specify each columns placement
frame2

In [23]:
frame2.columns

In [24]:
frame2['state'], frame2.state # the same column retrieved by dict-like key and by attribute

In [25]:
frame.loc[0] # 0: index, .loc[0] gets the row of the first index

In [26]:
frame2['debt'] = 16.5 # fill all values with 16.5
frame2

In [27]:
frame2['debt'] = np.arange(frame2.shape[0])
frame2

In [28]:
val = pd.Series([-1.2, -1.5, -1.7], index=[1, 3, 4]) # assigning a Series to a DataFrame column
frame2['debt'] = val # values are aligned on index labels: only rows 1, 3 and 4 get a value, the rest become NaN
frame2

In [29]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2

In [30]:
del frame2['eastern'] # dropping a column

In [31]:
frame2.columns

In [32]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}} # nested dictionaries
frame3 = DataFrame(pop) # outer keys as columns, inner keys as rows
frame3

In [33]:
frame3.T # transpose

In [34]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

In [35]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]} # can also define like this
pd.DataFrame(pdata)

## Essential Functionality

Fundamental mechanics of interacting with the data contained in a Series or DataFrame.

### Reindexing

In [36]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

In [37]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e']) # reformat the index, if added more then NaN is the value
obj2

In [38]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

In [39]:
obj3.reindex(range(6), method='ffill') # forward fill: each newly introduced label takes the value of the closest preceding existing label

In [40]:
frame = DataFrame(np.arange(9).reshape((3, 3)),
                  index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
frame

In [41]:
frame.reindex(['a', 'b', 'c', 'd'])

In [42]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

### Dropping Entries from an Axis

In [43]:
obj = Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
obj

In [44]:
new_obj = obj.drop('c') # drop returns a new object; obj itself is unchanged
new_obj # new Series without the label 'c'

In [45]:
obj.drop(['d', 'c']) # dropping multiple indexes

In [46]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

In [47]:
data.drop(['Colorado', 'Ohio']) # drop defaults to axis=0, so the labels are looked up in the row index

In [48]:
data.drop('two', axis=1) # axis=1 refers to the column, can also do axis='columns'

In [49]:
obj.drop('c', inplace=True) # mutates obj in place instead of returning a new Series; NOTE(review): re-running this cell should raise KeyError because 'c' is already gone
obj

### Indexing, Selection, and Filtering

In [50]:
obj = Series(np.arange(4), index=['a', 'b', 'c', 'd'])
obj

In [51]:
obj.iloc[1] # select by position explicitly; an integer key in obj[...] on a string-labelled Series is a deprecated positional fallback

In [52]:
obj[2:4]

In [53]:
obj[['b', 'a', 'd']] # index filtering using strings (according to index data type)

In [54]:
obj.iloc[[1, 3]] # positional selection of the 2nd and 4th elements; .iloc avoids the deprecated integer fallback of obj[[...]]

In [55]:
obj[obj < 2] # filtering values less than 2

In [56]:
obj['b':'c'] # can also do string indexing (according to index data type)

In [57]:
obj['b':'c'] = 5 # replace values with 5
obj

In [58]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

In [59]:
data['two']

In [60]:
data[['three', 'one']]

In [61]:
data[:2]

In [62]:
data[data['three'] > 5] # keep only the rows whose value in column 'three' is greater than 5

In [63]:
data < 5

In [64]:
data[data < 5] = 0
data

#### Selection with `loc` and `iloc`

`loc` selects by label: rows and columns are looked up by their index/column labels, whatever their type (strings in the examples below).

`iloc` selects by integer position (0-based), regardless of what the labels are.

In [65]:
data.loc['Colorado'] # filtering for index Colorado

In [66]:
data.loc['Colorado', ['two', 'three']] # filtering for index Colorado specifically for 'two' and 'three'

In [67]:
data.iloc[2]

In [68]:
data.iloc[[1, 2]] # rows at positions 1 and 2 (the 2nd and 3rd rows)

In [69]:
data.iloc[[1, 2], [0, 1]] # rows at positions 1 and 2 (2nd and 3rd rows), columns at positions 0 and 1 (first two columns)

In [70]:
data.loc[:'Utah', 'two'] # label slices include the end label: rows from the start through 'Utah' (three rows), column 'two'

In [71]:
data.iloc[:, :2] # get all data with its first 2 columns

In [72]:
data.iloc[:, :2][data.two > 5] # filtering the data with its first 2 columns where its column 'two' > 5

### Integer Indexes

In [73]:
ser = Series(np.arange(3))
ser

In [74]:
ser.loc[:1]

### Arithmetic and Data Alignment

In [75]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1, s2

In [76]:
s1 + s2

In [77]:
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [78]:
df1

In [79]:
df2

In [80]:
df1 + df2 # the result is indexed by the union of both row and column labels; any label missing from either DataFrame yields NaN

In [81]:
df1 = DataFrame({'A': [1, 2]})
df2 = DataFrame({'B': [3, 4]})

In [82]:
df1

In [83]:
df2

In [84]:
df1 - df2 # the two DataFrames share no column, so every value in the result is NaN

#### Arithmetic methods with fill values

In [85]:
df1 = DataFrame(np.arange(12.).reshape((3, 4)),
                columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)),
                columns=list('abcde'))

In [86]:
df1

In [87]:
df2

In [88]:
df2.loc[1] # the row whose index label is 1 (the second row), returned as a Series keyed by column name

In [89]:
df2.loc[1, 'b'] # the single value at row label 1, column 'b'

In [90]:
df2.loc[1, 'b'] = np.nan

In [91]:
df2

In [92]:
df1 + df2

In [93]:
df1.add(df2, fill_value=0) # where a cell exists in only one DataFrame, the missing side is treated as fill_value (0) before adding; a cell missing in both stays NaN

In [94]:
1 / df1

In [95]:
df1.rdiv(1)

In [96]:
df1.reindex(columns=df2.columns, fill_value=None)

In [97]:
df1.reindex(columns=df2.columns, fill_value=0) # filling any new columns with `0`

#### Operations between DataFrame and Series

In [98]:
arr = np.arange(12.).reshape((3, 4))
arr

In [99]:
arr - arr[0] # this applies to all rows, often called as broadcasting

In [100]:
frame = DataFrame(np.arange(12.).reshape((4, 3)),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

In [101]:
series = frame.iloc[0]
series

In [102]:
frame - series # applies to all rows

In [103]:
series - frame

In [104]:
series2 = Series(range(3), index=['b','e','f'])
series2

In [105]:
frame + series2

In [106]:
series3 = frame['d']
series3

In [107]:
frame

In [108]:
frame.sub(series3, axis=0) # axis=0 matches series3 on the row index, so each row has its own series3 value subtracted from every column

In [109]:
frame.sub(series3, axis='index')

### Function Application and Mapping

In [110]:
# Draw from a locally seeded generator so the displayed numbers are the same on every run
# and the global NumPy random state is left untouched.
frame = DataFrame(np.random.RandomState(0).randn(4, 3),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

In [111]:
np.abs(frame) # can apply numpy's ufuncs to pandas objects

In [112]:
def f(x):
    """Return the spread of a Series: its maximum minus its minimum."""
    return x.max() - x.min()

frame.apply(f) # f is called once per column

In [113]:
frame.apply(f, axis=1) # applies to each rows

In [114]:
def min_max(x):
    """Return the minimum and maximum of a Series as a two-element Series.

    Named distinctly so it does not replace the `f` used by the previous cells,
    which keeps those cells giving the same result when re-run.
    """
    return Series([x.min(), x.max()], index=['min', 'max'])

frame.apply(min_max) # a function returning a Series yields one result row per returned label

In [115]:
# NOTE(review): `format` shadows the built-in of the same name for the rest of the session;
# the name is kept because the following cell also calls `format`.
format = lambda x: '%.2f' % x
frame.applymap(format) # applies to each value in DataFrame; NOTE(review): newer pandas deprecates applymap in favour of DataFrame.map — confirm against the installed version

In [116]:
frame['e'].map(format) # Series has a map method

### Sorting and Ranking

In [117]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj

In [118]:
obj.sort_index() # sort the index

In [119]:
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame

In [120]:
frame.sort_index(axis=1) # sort the columns

In [121]:
frame.sort_index(axis=1, ascending=False) # sort the columns in descending order

In [122]:
obj = Series([4, 7, -3, 2])
obj.sort_values()

In [123]:
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values() # any missing values are sorted to the end of the Series

In [124]:
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

In [125]:
frame.sort_values(by='b') # only sort column 'b'

In [126]:
frame.sort_values(by=['a', 'b']) # sort multiple columns

In [127]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj

In [128]:
obj.rank() # rank breaks ties by assigning each group the mean rank

In [129]:
obj.rank(method='first') # ties are not averaged: equal values are ranked in the order they appear in the data

In [130]:
obj.rank(ascending=False, method='max')

In [131]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                  'c': [-2, 5, 8, -2.5]})
frame

In [132]:
frame.rank(axis=0)

### Axis Indexes with Duplicate Labels

In [133]:
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

In [134]:
obj.index.is_unique # whether its labels are unique or not

In [135]:
obj['a'] # returns a Series since it has multiple labels

In [136]:
obj['c'] # scalar

In [137]:
# Locally seeded generator: reproducible output without touching the global NumPy random state.
df = DataFrame(np.random.RandomState(0).randn(4, 3), index=['a', 'a', 'b', 'b'])
df

In [138]:
df.loc['b']

## Summarizing and Computing Descriptive Statistics

In [139]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], 
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

In [140]:
df.sum() # pandas automatically handles NaN values

In [141]:
df.sum(axis='columns') # sums across columns

In [142]:
df.mean(axis='columns', skipna=False)

In [143]:
df.idxmax() # index of the max value

In [144]:
df.idxmin() # index of the min value

In [145]:
df.cumsum()

In [146]:
df.describe() # one of the most used functions in pandas

In [147]:
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj

In [148]:
obj.describe() # can operate with non-numerical data

- **count**: number of non-null values
- **unique**: number of distinct values
- **top**: most frequent value
- **freq**: number of occurrences of the most frequent value

### Correlation and Covariance

In [150]:
# pandas_datareader is a third-party package; it is imported here rather than with the imports at the top of the notebook.
import pandas_datareader.data as web
# One result per ticker, keyed by ticker symbol.
# NOTE(review): presumably fetched from Yahoo Finance over the network on every run — confirm, and consider caching.
all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [151]:
price = DataFrame({ticker: data['Adj Close'] for ticker, data in all_data.items()})
volume = DataFrame({ticker: data['Volume'] for ticker, data in all_data.items()})

In [152]:
returns = price.pct_change()
returns.tail()

In [153]:
returns['MSFT'].corr(returns['IBM'])

In [154]:
returns['MSFT'].cov(returns['IBM'])

In [155]:
returns.MSFT.corr(returns.IBM)

In [156]:
returns.corr() # returns corr of all columns

In [157]:
returns.cov() # returns covariance of all columns

In [158]:
returns.corrwith(volume)

### Unique Values, Value Counts, and Membership

In [159]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [160]:
uniques = obj.unique()
uniques

In [161]:
obj.value_counts()

In [162]:
# The top-level pd.value_counts is deprecated; wrap the raw array in a Series and use the method instead.
pd.Series(obj.values).value_counts(sort=False)

In [164]:
mask = obj.isin(['b', 'c'])
mask

In [165]:
obj[mask]

In [167]:
to_match = Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = Series(['c', 'b', 'a'])

In [168]:
pd.Index(unique_vals).get_indexer(to_match) # for each value in to_match, return its integer position within unique_vals

In [169]:
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
data

In [171]:
# Count the occurrences of each value per column; the row labels are the distinct values found
# across all columns, and a value absent from a column gets 0 instead of NaN.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
result = data.apply(pd.Series.value_counts).fillna(0)
result