Pandas is designed primarily for tabular or heterogeneous data. NumPy is best suited to homogeneous (single-type) data. 

Applying NumPy functions to a pandas object preserves the link between index labels and values.

### Series

In [3]:
import pandas as pd

# Build a Series whose index is made of explicit string labels
series = pd.Series(data=[4, 7, -5, 3], index=list("abcd"))

# Select several entries at once by passing a list of labels
series[["a", "b"]]

# Boolean mask: keep only the strictly positive values
series[series > 0]

a    4
b    7
d    3
dtype: int64

In [9]:
# A Series can be built directly from a dictionary: the keys are used as the
# index labels and the values as the data

sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)

# Label the Series itself at construction time, then label its index
series_2 = pd.Series(sdata, name="Population")
series_2.index.name = "states"

In [12]:
# Series automatically align on index labels when two Series are combined in an arithmetic operation

### DataFrame

In [None]:
# One of the most common ways to create a DataFrame is from a dictionary of
# EQUAL-LENGTH lists; each key becomes a column.

data = {
    "state": ["Ohio"] * 3 + ["Nevada"] * 3,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

frame = pd.DataFrame(data)

In [27]:
# Selecting a single column of a DataFrame gives back a Series; both item
# access and attribute access work

frame["year"]
frame.year

# A single row is selected by its index label through .loc
frame.loc[3]


In [33]:
# Create a new boolean column from a condition on an existing column
is_nevada = frame["state"] == "Nevada"
frame["Western States"] = is_nevada
frame["debt"] = frame["Western States"]

# Remove a column with the `del` statement
del frame["debt"]
del is_nevada  # temporary helper only; keep the notebook namespace unchanged
frame

Unnamed: 0,pop,state,year,Western States
0,1.5,Ohio,2000,False
1,1.7,Ohio,2001,False
2,3.6,Ohio,2002,False
3,2.4,Nevada,2001,True
4,2.9,Nevada,2002,True
5,3.2,Nevada,2003,True


# Essential Functionality

### Reindexing

In [46]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=list("dbac"))

# reindex() returns a new Series arranged in the requested label order;
# the label 'e' does not exist in obj, so its entry is missing (NaN)
obj2 = obj.reindex(list("abcde"))

### Dropping Entries from Axes

In [52]:
import numpy as np
# 4x4 frame holding the integers 0..15, with named rows and columns
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)

# By default drop() removes rows by label and returns a new object
new_obj = data.drop(["Ohio", "Utah"], axis="index")

# Point drop() at the column axis to remove columns instead
new_obj = data.drop(["one"], axis="columns")

# inplace=True modifies `data` itself instead of creating a new object
data.drop(["Colorado"], axis="index", inplace=True)


### Indexing/Selection/Filtering

In [68]:
# Series

obj = pd.Series(np.arange(4.0), index=list("abcd"))

# Integer slice: picks the 3rd and 4th elements by position
obj[2:4]

# List of labels
obj[["b", "d"]]

# Label slice: unlike a positional slice, the end label 'd' is included
obj["b":"d"]

# Boolean condition
obj[obj >= 3]


# DataFrame
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)

# A list of column names selects those columns, in the order given
data[["three", "two"]]

# An integer slice selects rows by position: here the 2nd and 3rd rows
data[1:3]

# A boolean Series keeps the rows for which the condition holds
data[data["two"] > 5]

# .loc selects by row label and column labels ...
data.loc["New York", ["one", "two"]]

# ... while .iloc does the same using integer positions
data.iloc[3, [0, 1]]

one    12
two    13
Name: New York, dtype: int32

### Arithmetic Operations with Fill Values

In [73]:
df1 = pd.DataFrame(np.arange(12.0).reshape(3, 4), columns=list("abcd"))
df2 = pd.DataFrame(np.arange(20.0).reshape(4, 5), columns=list("abcde"))

# A plain `df1 + df2` leaves NaN wherever a row/column label exists in only
# one of the two frames. The add() method with fill_value substitutes that
# value for the missing side before adding.

df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


### Arithmetic between Series and DataFrame

In [100]:
# 4x3 frame of the floats 0..11, with state names as the row index.
# (The pasted IPython continuation prompts ".....:" were removed: they are not
# valid Python and only ran because IPython strips prompts from pasted input.)
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])

# First row of the frame, as a Series indexed by the column labels
series = frame.iloc[0]

# By default the index of the Series is matched against the columns of the
# DataFrame, and the operation is broadcast down all rows
frame - series

# To match the index of the Series against the index of the DataFrame and
# broadcast across the columns instead, use the method form with axis='index'
series = frame['b']
frame.sub(series, axis = 'index')

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,0.0,1.0,2.0
Texas,0.0,1.0,2.0
Oregon,0.0,1.0,2.0


### Function Application and Mapping

In [107]:
# DataFrame.apply takes a function and calls it on one column (or row) at a time
def f(x):
    """Return the gap between the mean and the minimum of ``x``."""
    return x.mean() - x.min()


frame.apply(f)                  # applied to each column
frame.apply(f, axis="columns")  # applied to each row


# Element-wise application
# NOTE: the name `format` shadows the built-in format(); it is kept here so
# the notebook's variable names stay unchanged.
def format(x):
    """Render ``x`` as a string with two decimal places."""
    return "%.2f" % x


# NOTE(review): recent pandas releases deprecate applymap in favour of
# DataFrame.map; confirm against the installed pandas version before switching.
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


### Sorting and Ranking

In [112]:
# Sort the columns by their labels, in descending order
frame.sort_index(axis="columns", ascending=False)

# Sort the rows by the values in column 'b', then by column 'e'
frame.sort_values(by=["b", "e"])

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [116]:
# Rank the values within each column (the ranking runs down the rows)
frame.rank(axis="index")

Unnamed: 0,b,d,e
Utah,1.0,1.0,1.0
Ohio,2.0,2.0,2.0
Texas,3.0,3.0,3.0
Oregon,4.0,4.0,4.0


# Summarizing/Computing Descriptive Stats

In [125]:
# Mean taken across the columns, i.e. one value per row.
# NaN values are skipped by default; skipna=True just makes that explicit.
frame.mean(axis=1, skipna=True)

# Index label at which each column reaches its maximum / minimum
frame.idxmax()
frame.idxmin()

# Standard table of summary statistics
frame.describe()

# Correlation and covariance between the columns
frame.corr()
frame.cov()

Unnamed: 0,b,d,e
b,15.0,15.0,15.0
d,15.0,15.0,15.0
e,15.0,15.0,15.0


### Unique Values, Value Counts, and Membership

In [130]:
obj = pd.Series(list("cadaabbcc"))

# All distinct values in the Series
obj.unique()

# Number of occurrences of each distinct value
obj.value_counts()

# Element-wise test of whether each value is in the given list
obj.isin(["c", "a"])

0     True
1     True
2    False
3     True
4     True
5    False
6    False
7     True
8     True
dtype: bool