In [72]:
![]()

'[]' is not recognized as an internal or external command,
operable program or batch file.


In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Object Creation

Creating a Series by passing a list of values, letting pandas create a default integer index:

In [3]:
s = pd.Series([1,3,5,np.nan,6,8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [7]:
s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])
print(s)

a    3
b   -5
c    7
d    4
dtype: int64


In [9]:
# Get one element
s['b']

-5

In [11]:
s[~(s > 1)]

b   -5
dtype: int64

In [13]:
# Series s where value is not >1 s where value is <-1 or >2
s[(s < -1) | (s > 2)]

a    3
b   -5
c    7
d    4
dtype: int64

In [15]:
# Set index a of Series s to 6
s['a'] = 6
print(s)

a    6
b   -5
c    7
d    4
dtype: int64


### Dropping

Drop values from rows (axis=0)

In [17]:
s.drop(['a', 'c'])

b   -5
d    4
dtype: int64

In [19]:
s3 = pd.Series([7, -2, 3], index=['a', 'c', 'd'])
print(s3)

a    7
c   -2
d    3
dtype: int64


#### Arithmetic Operations with Fill Methods

In [22]:
s + s3

a    13.0
b     NaN
c     5.0
d     7.0
dtype: float64

In [23]:
Add =s.add(s3, fill_value=0)
sub = s.sub(s3, fill_value=2)
div = s.div(s3, fill_value=4)
mul = s.mul(s3, fill_value=3)
print(Add,sub,mul,div)

a    13.0
b    -5.0
c     5.0
d     7.0
dtype: float64 a   -1.0
b   -7.0
c    9.0
d    1.0
dtype: float64 a    42.0
b   -15.0
c   -14.0
d    12.0
dtype: float64 a    0.857143
b   -1.250000
c   -3.500000
d    1.333333
dtype: float64


## DataFrame 

#### Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:

In [24]:
dates = pd.date_range('20180505', periods=6)
print(dates)

DatetimeIndex(['2018-05-05', '2018-05-06', '2018-05-07', '2018-05-08',
               '2018-05-09', '2018-05-10'],
              dtype='datetime64[ns]', freq='D')


In [28]:
f = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
print(f)

                   A         B         C         D
2018-05-05 -1.283553  0.236292 -0.489875 -0.095820
2018-05-06  0.818262  0.885781  1.479090 -0.668281
2018-05-07  0.036093  0.719880  1.105793  1.319583
2018-05-08  0.001316  0.056100  0.502117  1.224963
2018-05-09 -0.182507  0.442952 -0.389837 -1.382295
2018-05-10 -0.666535  0.002389 -0.717917  0.315128


In [10]:
df2 = pd.DataFrame({ 'A' : 1.,
                        'B' : pd.Timestamp('20180505'),
                        'C' : pd.Series(12,index=list(range(4)),dtype='float32'),
                        'D' : np.array([3] * 4,dtype='int32'),
                        'E' : pd.Categorical(["test","train","test","train"]),
                        'F' : 'foo' })
print(df2)

     A          B     C  D      E    F
0  1.0 2018-05-05  12.0  3   test  foo
1  1.0 2018-05-05  12.0  3  train  foo
2  1.0 2018-05-05  12.0  3   test  foo
3  1.0 2018-05-05  12.0  3  train  foo


In [15]:
df2.dtypes
#np.dtype(df2)

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

Creating a DataFrame by passing a dict of objects that can be converted to series-like.

In [26]:
data = {'Country': ['Belgium', 'India', 'Brazil'],
'Capital': ['Brussels', 'New Delhi', 'Brasília'],
'Population': [11190846, 1303171035, 207847528]}
print(data)

{'Country': ['Belgium', 'India', 'Brazil'], 'Capital': ['Brussels', 'New Delhi', 'Brasília'], 'Population': [11190846, 1303171035, 207847528]}


In [27]:
df = pd.DataFrame(data,columns=['Country', 'Capital', 'Population'])
print(df)

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


In [32]:
# Selection

# Get subset of a DataFrame 
df[1:]

Unnamed: 0,Country,Capital,Population
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


In [34]:
# By Position
d = df.iloc[[0],[0]] # Select single value by row &column
print(d)
df.iat[0,0]

   Country
0  Belgium


'Belgium'

Select single value by row & column labels

In [36]:
# By Label
df.loc[[0], ['Country']]

Unnamed: 0,Country
0,Belgium


In [38]:
df.at[0,'Country']

'Belgium'

Select single row of subset of rows

In [40]:
# By Label/Position
df.ix[2]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Country          Brazil
Capital        Brasília
Population    207847528
Name: 2, dtype: object

In [42]:
df.ix[:,'Capital'] #Select a single column ofsubset of columns

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


0     Brussels
1    New Delhi
2     Brasília
Name: Capital, dtype: object

In [43]:
df.ix[1,'Capital'] #Select rows and columns

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


'New Delhi'

### Boolean Indexing

In [45]:
df[df['Population']>1200000000]

Unnamed: 0,Country,Capital,Population
1,India,New Delhi,1303171035


### Dropping Data Frame

In [48]:
df.drop('Country', axis=1) # Drop values from columns(axis=1)

Unnamed: 0,Capital,Population
0,Brussels,11190846
1,New Delhi,1303171035
2,Brasília,207847528


In [50]:
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


In [52]:
df.sort_index() #Sort by labels along an axis

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


In [54]:
df.sort_values(by='Country') #Sort by the values along an axis

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
2,Brazil,Brasília,207847528
1,India,New Delhi,1303171035


In [56]:
df.rank() #Assign ranks to entries

Unnamed: 0,Country,Capital,Population
0,1.0,2.0,1.0
1,3.0,3.0,3.0
2,2.0,1.0,2.0


##### Retrieving Series/DataFrame Information

In [60]:
df.shape #(rows,columns)
df.index #Describe index
df.columns #Describe DataFrame columns
df.info() #Info on DataFrame
df.count() #Number of non-NA values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
Country       3 non-null object
Capital       3 non-null object
Population    3 non-null int64
dtypes: int64(1), object(2)
memory usage: 152.0+ bytes


Country       3
Capital       3
Population    3
dtype: int64

In [70]:
print(df.sum()) #Sum of values
print(df.cumsum()) #Cummulative sum of values
print(df.min()) #Minimum
print(df.max()) #maximum values
#df.idxmin()#Minimum
#df.idxmax() #Maximum index value
print(df.describe()) #Summary statistics
print(df.mean()) #Mean of values
print(df.median()) #Median of values

Country              BelgiumIndiaBrazil
Capital       BrusselsNew DelhiBrasília
Population                   1522209409
dtype: object
              Country                    Capital  Population
0             Belgium                   Brussels    11190846
1        BelgiumIndia          BrusselsNew Delhi  1314361881
2  BelgiumIndiaBrazil  BrusselsNew DelhiBrasília  1522209409
Country        Belgium
Capital       Brasília
Population    11190846
dtype: object
Country            India
Capital        New Delhi
Population    1303171035
dtype: object
         Population
count  3.000000e+00
mean   5.074031e+08
std    6.961346e+08
min    1.119085e+07
25%    1.095192e+08
50%    2.078475e+08
75%    7.555093e+08
max    1.303171e+09
Population    5.074031e+08
dtype: float64
Population    207847528.0
dtype: float64


### Applying Functions

In [66]:
f = lambda x: x*2
df.apply(f) #Apply function

Unnamed: 0,Country,Capital,Population
0,BelgiumBelgium,BrusselsBrussels,22381692
1,IndiaIndia,New DelhiNew Delhi,2606342070
2,BrazilBrazil,BrasíliaBrasília,415695056


In [67]:
df.applymap(f) #Apply function element-wise

Unnamed: 0,Country,Capital,Population
0,BelgiumBelgium,BrusselsBrussels,22381692
1,IndiaIndia,New DelhiNew Delhi,2606342070
2,BrazilBrazil,BrasíliaBrasília,415695056
