# Pandas 

In [20]:
import pandas as pd

### Series

In [21]:
# A Series is a one-dimensional labelled array: four integers keyed by 'a'..'d'.
s = pd.Series(data=[3, -5, 7, 4], index=list('abcd'))
print(s)

a    3
b   -5
c    7
d    4
dtype: int64


### DataFrame

In [27]:
# Column-oriented input: each key becomes a column, each list holds that column's rows.
data = {
    'Country': ['Belgium', 'India', 'Brazil'],
    'Capital': ['Brussels', 'New Delhi', 'Brasilia'],
    'Population': [11190846, 1303171035, 207847528],
}
# `columns` pins the column order explicitly.
df = pd.DataFrame(data=data, columns=['Country', 'Capital', 'Population'])
print(df)

   Country    Capital  Population
0  Belgium   Brussels    11190846
1    India  New Delhi  1303171035
2   Brazil   Brasilia   207847528


### Dropping

In [31]:
# Remove the entries labelled 'a' and 'c'; drop() returns a new Series and leaves `s` untouched.
print(s.drop(labels=['a', 'c']))

b   -5
d    4
dtype: int64


In [38]:
# Remove a whole column by name; `columns=` is the keyword spelling of axis=1.
print(df.drop(columns='Country'))

     Capital  Population
0   Brussels    11190846
1  New Delhi  1303171035
2   Brasilia   207847528


### Help 

In [33]:
# Print the built-in documentation for the label-based indexer `Series.loc`.
help(pd.Series.loc)

Help on property:

    Access a group of rows and columns by label(s) or a boolean array.
    
    ``.loc[]`` is primarily label based, but may also be used with a
    boolean array.
    
    Allowed inputs are:
    
    - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
      interpreted as a *label* of the index, and **never** as an
      integer position along the index).
    - A list or array of labels, e.g. ``['a', 'b', 'c']``.
    - A slice object with labels, e.g. ``'a':'f'``.
    
          start and the stop are included
    
    - A boolean array of the same length as the axis being sliced,
      e.g. ``[True, False, True]``.
    - An alignable boolean Series. The index of the key will be aligned before
      masking.
    - An alignable Index. The Index of the returned selection will be the input.
    - A ``callable`` function with one argument (the calling Series or
      DataFrame) and that returns valid output for indexing (one of the above)
    
    See more at 

### Sort & Rank

In [34]:
# Order the rows by their index labels (axis=0, ascending); the index is already sorted here.
print(df.sort_index(axis=0, ascending=True))

   Country    Capital  Population
0  Belgium   Brussels    11190846
1    India  New Delhi  1303171035
2   Brazil   Brasilia   207847528


In [35]:
# Order the rows by the values of the 'Country' column (alphabetical, ascending).
print(df.sort_values(by='Country', ascending=True))

   Country    Capital  Population
0  Belgium   Brussels    11190846
2   Brazil   Brasilia   207847528
1    India  New Delhi  1303171035


In [36]:
# Rank the entries within each column (1 = smallest); text columns are ranked alphabetically.
print(df.rank(method='average'))

   Country  Capital  Population
0      1.0      2.0         1.0
1      3.0      3.0         3.0
2      2.0      1.0         2.0


### Getting

In [41]:
# Plain [] indexing: a label on a Series yields one element,
# a slice on a DataFrame yields a subset of its rows.
value_b = s['b']
rows_from_one = df[1:]
print(value_b)
print(rows_from_one)

-5
  Country    Capital  Population
1   India  New Delhi  1303171035
2  Brazil   Brasilia   207847528


### Selecting, Boolean Indexing & Setting

In [48]:
# by position: integer (row, column) offsets, independent of the labels
row_pos, col_pos = 0, 0
print(df.iloc[row_pos, col_pos])  # general positional indexer
print(df.iat[row_pos, col_pos])  # positional accessor for a single value

Belgium
Belgium


In [50]:
# by label: row label 0 and column label 'Country'
row_label, col_label = 0, 'Country'
print(df.loc[row_label, col_label])  # general label-based indexer
print(df.at[row_label, col_label])  # label-based accessor for a single value

Belgium
Belgium


In [57]:
# boolean indexing: build a True/False mask, then keep only the rows where it is True
not_above_one = ~(s > 1)  # ~ negates the mask: values that are NOT > 1
outside_band = (s < -1) | (s > 2)  # | is element-wise OR: values < -1 or > 2
over_1_2_billion = df['Population'] > 1_200_000_000  # a column comparison gives a row mask

print(s[not_above_one])
print(s[outside_band])
print(df[over_1_2_billion])

b   -5
dtype: int64
a    3
b   -5
c    7
d    4
dtype: int64
  Country    Capital  Population
1   India  New Delhi  1303171035


In [58]:
# setting: overwrite the value stored under label 'a' (this modifies `s` itself)
s.loc['a'] = 6
print(s)

a    6
b   -5
c    7
d    4
dtype: int64


### Retrieving Series/DataFrame Information

In [65]:
# basic information
print(df.shape)  # (rows, columns)
print(df.index)  # describe index
print(df.columns, "\n")  # describe DataFrame columns
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
df.info()
print()  # blank line between the report and the counts below
print(df.count())  # number of non-NA values

(3, 3)
RangeIndex(start=0, stop=3, step=1)
Index(['Country', 'Capital', 'Population'], dtype='object') 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Country     3 non-null      object
 1   Capital     3 non-null      object
 2   Population  3 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes
None 

Country       3
Capital       3
Population    3
dtype: int64


In [71]:
# summary: column-wise reductions. On the text columns, sum/cumsum concatenate
# the strings and min/max compare them alphabetically.
summaries = (
    df.sum(),     # sum of values
    df.cumsum(),  # cumulative (running) sum of values
    df.min(),     # minimum values
    df.max(),     # maximum values
)
for summary in summaries:
    print(summary, "\n")

Country              BelgiumIndiaBrazil
Capital       BrusselsNew DelhiBrasilia
Population                   1522209409
dtype: object 

              Country                    Capital  Population
0             Belgium                   Brussels    11190846
1        BelgiumIndia          BrusselsNew Delhi  1314361881
2  BelgiumIndiaBrazil  BrusselsNew DelhiBrasilia  1522209409 

Country        Belgium
Capital       Brasilia
Population    11190846
dtype: object 

Country            India
Capital        New Delhi
Population    1303171035
dtype: object 



In [73]:
# summary
print(df.describe(), "\n")  # summary statistics (numeric columns only by default)
# mean/median are undefined for the text columns. Since pandas 2.0 they raise
# TypeError on a mixed-type frame unless restricted to the numeric columns,
# so ask for that explicitly.
print(df.mean(numeric_only=True), "\n")  # mean of values
print(df.median(numeric_only=True), "\n")  # median of values

         Population
count  3.000000e+00
mean   5.074031e+08
std    6.961346e+08
min    1.119085e+07
25%    1.095192e+08
50%    2.078475e+08
75%    7.555093e+08
max    1.303171e+09 

Population    5.074031e+08
dtype: float64 

Population    207847528.0
dtype: float64 



### Applying Functions

In [75]:
def f(x):
    """Return ``x * 2``: doubles a number, repeats a string, scales a whole Series."""
    return x * 2


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
# Use the new name when this pandas version has it, otherwise fall back.
elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap

print(df.apply(f), "\n")  # apply function to each column (f receives a Series)
print(elementwise(f))  # apply function element-wise (f receives one cell at a time)

          Country             Capital  Population
0  BelgiumBelgium    BrusselsBrussels    22381692
1      IndiaIndia  New DelhiNew Delhi  2606342070
2    BrazilBrazil    BrasiliaBrasilia   415695056 

          Country             Capital  Population
0  BelgiumBelgium    BrusselsBrussels    22381692
1      IndiaIndia  New DelhiNew Delhi  2606342070
2    BrazilBrazil    BrasiliaBrasilia   415695056


### Data Alignment

In [77]:
# Arithmetic between Series aligns on index labels; a label present in only one
# operand produces NaN in the result (and the result becomes float).
s3 = pd.Series({'a': 7, 'c': -2, 'd': 3})
print(s, "\n")  # 4 entries: a, b, c, d
print(s3, "\n")  # 3 entries: no 'b'
aligned_sum = s + s3
print(aligned_sum)

a    6
b   -5
c    7
d    4
dtype: int64 

a    7
c   -2
d    3
dtype: int64 

a    13.0
b     NaN
c     5.0
d     7.0
dtype: float64


In [79]:
# Arithmetic methods take a fill_value: a label missing from one operand is
# treated as that value instead of producing NaN.
print(s.add(other=s3, fill_value=0))  # the missing s3['b'] counts as 0

a    13.0
b    -5.0
c     5.0
d     7.0
dtype: float64


In [80]:
# Subtraction with the missing s3['b'] treated as 2, so b = -5 - 2.
print(s.sub(other=s3, fill_value=2))

a   -1.0
b   -7.0
c    9.0
d    1.0
dtype: float64


In [81]:
# Division with the missing s3['b'] treated as 4, so b = -5 / 4.
print(s.div(other=s3, fill_value=4))

a    0.857143
b   -1.250000
c   -3.500000
d    1.333333
dtype: float64


In [82]:
# Multiplication with the missing s3['b'] treated as 3, so b = -5 * 3.
print(s.mul(other=s3, fill_value=3))

a    42.0
b   -15.0
c   -14.0
d    12.0
dtype: float64
