# DataFrames

1. DataFrames are the workhorse of pandas and are directly inspired by the R programming language.

2. We can think of a DataFrame as a bunch of Series objects put together to share the same index.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from numpy.random import randn


In [6]:
# 5 rows x 4 columns of samples from the standard normal distribution (values change on every run — no seed is set)
randn(5,4)

array([[-2.10386175,  0.3970059 , -0.24288217,  1.43166483],
       [-0.62481928, -0.13529142, -1.26335692, -2.03840328],
       [ 0.56093431, -1.24462629, -1.32430173,  0.08794165],
       [-1.31434929, -1.07403785,  0.64358485, -0.67655932],
       [ 0.69987926, -0.24381866, -1.96095245, -0.45836922]])

In [10]:
# Draw the data from a seeded, local generator so that the frame (and every
# output derived from it) is identical on each Restart & Run All.
rng = np.random.default_rng(101)
df = pd.DataFrame(
    rng.standard_normal((5, 4)),      # 5 rows x 4 columns of standard-normal samples
    index=['A', 'B', 'C', 'D', 'E'],  # row labels
    columns=['W', 'X', 'Y', 'Z'],     # column labels
)

In [11]:
df

Unnamed: 0,W,X,Y,Z
A,0.047331,-0.551598,1.114553,0.375924
B,0.141504,-1.077288,0.415132,-0.204169
C,1.305683,0.241047,-0.11444,0.058948
D,-0.476705,-1.330072,-0.103058,-0.82669
E,0.299918,-1.318847,0.696232,0.216285


## Selection and Indexing

Methods for grabbing data from a DataFrame:

In [12]:
# A single column name in brackets returns that column as a Series
df['W']

A    0.047331
B    0.141504
C    1.305683
D   -0.476705
E    0.299918
Name: W, dtype: float64

In [13]:
# Pass a list of column names to get a DataFrame containing just those columns
df[['W','Z']]

Unnamed: 0,W,Z
A,0.047331,0.375924
B,0.141504,-0.204169
C,1.305683,0.058948
D,-0.476705,-0.82669
E,0.299918,0.216285


DataFrame columns are just Series:

In [16]:
# df['W'] (a single label) is a Series; df[['W']] (a list with one label) would be a DataFrame
type(df['W'])

pandas.core.series.Series

**Creating a new column:**

In [10]:
# Assigning to a new key adds a column; W + Y is computed element-wise, row by row
df['new'] = df['W'] + df['Y']

In [11]:
df

Unnamed: 0,W,X,Y,Z,new
A,-1.520263,-1.464931,0.366263,0.946617,-1.154
B,1.259568,-1.294068,0.703789,-0.119818,1.963357
C,1.510955,-0.877837,1.007593,-1.112576,2.518548
D,1.856592,-1.245548,1.367368,-0.646653,3.22396
E,0.285286,-0.545648,-1.153745,0.38553,-0.868459


**Removing Columns**

In [12]:
# axis=1 means "drop a column"; the result is a new DataFrame and df itself is not modified
df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,-1.520263,-1.464931,0.366263,0.946617
B,1.259568,-1.294068,0.703789,-0.119818
C,1.510955,-0.877837,1.007593,-1.112576
D,1.856592,-1.245548,1.367368,-0.646653
E,0.285286,-0.545648,-1.153745,0.38553


In [13]:
# Not inplace unless specified! df still contains the 'new' column here
df

Unnamed: 0,W,X,Y,Z,new
A,-1.520263,-1.464931,0.366263,0.946617,-1.154
B,1.259568,-1.294068,0.703789,-0.119818,1.963357
C,1.510955,-0.877837,1.007593,-1.112576,2.518548
D,1.856592,-1.245548,1.367368,-0.646653,3.22396
E,0.285286,-0.545648,-1.153745,0.38553,-0.868459


In [14]:
# inplace=True modifies df directly and returns None, so this cell shows no output
df.drop('new',axis=1,inplace=True)

In [15]:
df

Unnamed: 0,W,X,Y,Z
A,-1.520263,-1.464931,0.366263,0.946617
B,1.259568,-1.294068,0.703789,-0.119818
C,1.510955,-0.877837,1.007593,-1.112576
D,1.856592,-1.245548,1.367368,-0.646653
E,0.285286,-0.545648,-1.153745,0.38553


Can also drop rows this way:

In [16]:
# axis=0 means "drop a row" by its index label; a new DataFrame is returned and df itself keeps row 'E'
df.drop('E',axis=0)

Unnamed: 0,W,X,Y,Z
A,-1.520263,-1.464931,0.366263,0.946617
B,1.259568,-1.294068,0.703789,-0.119818
C,1.510955,-0.877837,1.007593,-1.112576
D,1.856592,-1.245548,1.367368,-0.646653


**Selecting Rows**

In [17]:
# .loc selects by index label; a single row comes back as a Series indexed by the column names
df.loc['A']

W   -1.520263
X   -1.464931
Y    0.366263
Z    0.946617
Name: A, dtype: float64

Or select based on position instead of label:

In [18]:
# .iloc selects by zero-based integer position, so 2 is the third row (label 'C')
df.iloc[2]

W    1.510955
X   -0.877837
Y    1.007593
Z   -1.112576
Name: C, dtype: float64

**Selecting a subset of rows and columns**

In [21]:
# .loc[row_label, column_label] returns the single value at that intersection
df.loc['B','Y']

0.7037888547822624

In [25]:
# Lists of labels on both axes return the corresponding sub-DataFrame
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,-0.522053,-0.497294
B,0.778327,1.204806


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [22]:
df

Unnamed: 0,W,X,Y,Z
A,-1.520263,-1.464931,0.366263,0.946617
B,1.259568,-1.294068,0.703789,-0.119818
C,1.510955,-0.877837,1.007593,-1.112576
D,1.856592,-1.245548,1.367368,-0.646653
E,0.285286,-0.545648,-1.153745,0.38553


In [23]:
# Element-wise comparison: returns a DataFrame of booleans with the same shape as df
df>0

Unnamed: 0,W,X,Y,Z
A,False,False,True,True
B,True,False,True,False
C,True,False,True,False
D,True,False,True,False
E,True,False,False,True


In [24]:
# Indexing with a boolean DataFrame keeps values where the mask is True and puts NaN elsewhere
df[df>0]

Unnamed: 0,W,X,Y,Z
A,,,0.366263,0.946617
B,1.259568,,0.703789,
C,1.510955,,1.007593,
D,1.856592,,1.367368,
E,0.285286,,,0.38553


In [25]:
# Indexing with a boolean Series filters rows: only rows where W > 0 are returned
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
B,1.259568,-1.294068,0.703789,-0.119818
C,1.510955,-0.877837,1.007593,-1.112576
D,1.856592,-1.245548,1.367368,-0.646653
E,0.285286,-0.545648,-1.153745,0.38553


In [34]:
# The filtered result is itself a DataFrame, so a column can be selected from it directly
df[df['W']>0]['Y']

B    1.204806
D    0.367024
Name: Y, dtype: float64

In [35]:
# Same idea with a list of columns; the columns come back in the order requested (Y, then X)
df[df['W']>0][['Y','X']]

Unnamed: 0,Y,X
B,1.204806,0.071377
D,0.367024,-1.441997


For two conditions you can use `|` and `&` with parentheses:

In [26]:
# Wrap each comparison in parentheses (& binds tighter than >) and use & / | rather than
# and / or, which cannot combine boolean Series element-wise
df[(df['W']>0) & (df['Y'] > 1)]

Unnamed: 0,W,X,Y,Z
C,1.510955,-0.877837,1.007593,-1.112576
D,1.856592,-1.245548,1.367368,-0.646653


## More Index Details

More features of indexing, including resetting the index or setting it to something else.

In [27]:
df

Unnamed: 0,W,X,Y,Z
A,-1.520263,-1.464931,0.366263,0.946617
B,1.259568,-1.294068,0.703789,-0.119818
C,1.510955,-0.877837,1.007593,-1.112576
D,1.856592,-1.245548,1.367368,-0.646653
E,0.285286,-0.545648,-1.153745,0.38553


In [38]:
# Reset to the default 0, 1, ..., n-1 integer index; the old labels move into an 'index' column.
# Returns a new DataFrame — df keeps its original index.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,-0.522053,1.211617,-0.497294,0.333873
1,B,0.778327,0.071377,1.204806,-1.034801
2,C,-0.293874,-0.552269,-0.904613,-0.629399
3,D,0.110884,-1.441997,0.367024,0.610949
4,E,-0.627954,-1.44863,-0.074358,1.02559


In [45]:
# State abbreviations, one per row of df
newind = ['CA', 'NY', 'WY', 'OR', 'CO']

In [46]:
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [47]:
# Add the list as a new column; values are assigned to rows in order
df['States'] = newind

In [48]:
df

Unnamed: 0,W,X,Y,Z,States
A,-0.522053,1.211617,-0.497294,0.333873,CA
B,0.778327,0.071377,1.204806,-1.034801,NY
C,-0.293874,-0.552269,-0.904613,-0.629399,WY
D,0.110884,-1.441997,0.367024,0.610949,OR
E,-0.627954,-1.44863,-0.074358,1.02559,CO


In [49]:
# Use the 'States' column as the index; returns a new DataFrame and leaves df unchanged
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,-0.522053,1.211617,-0.497294,0.333873
NY,0.778327,0.071377,1.204806,-1.034801
WY,-0.293874,-0.552269,-0.904613,-0.629399
OR,0.110884,-1.441997,0.367024,0.610949
CO,-0.627954,-1.44863,-0.074358,1.02559


In [50]:
df

Unnamed: 0,W,X,Y,Z,States
A,-0.522053,1.211617,-0.497294,0.333873,CA
B,0.778327,0.071377,1.204806,-1.034801,NY
C,-0.293874,-0.552269,-0.904613,-0.629399,WY
D,0.110884,-1.441997,0.367024,0.610949,OR
E,-0.627954,-1.44863,-0.074358,1.02559,CO


In [51]:
# inplace=True replaces df's index for good: the original A-E labels are discarded
df.set_index('States',inplace=True)

In [52]:
df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,-0.522053,1.211617,-0.497294,0.333873
NY,0.778327,0.071377,1.204806,-1.034801
WY,-0.293874,-0.552269,-0.904613,-0.629399
OR,0.110884,-1.441997,0.367024,0.610949
CO,-0.627954,-1.44863,-0.074358,1.02559


# END

In [45]:
import numpy as np

In [2]:
# Small 3x3 integer array laid out one row per line
a = np.array([
    [1, 3, 4],
    [9, 0, 1],
    [1, 0, 3],
])

In [3]:
a

array([[1, 3, 4],
       [9, 0, 1],
       [1, 0, 3]])

In [4]:
# Element-wise comparison produces a boolean array of the same shape
a==1

array([[ True, False, False],
       [False, False,  True],
       [ True, False, False]])

In [8]:
# Boolean-mask assignment: every element equal to 1 is overwritten with 22, modifying a in place
a[a==1]=22

In [10]:
a

array([[22,  3,  4],
       [ 9,  0, 22],
       [22,  0,  3]])

In [46]:
df.iloc[1]

W    0.695816
X    0.092475
Y    0.261639
Z   -1.270541
Name: B, dtype: float64

In [48]:
# factorize returns a (codes, uniques) tuple: an integer code per value plus the distinct values
pd.factorize(df.iloc[1])

(array([0, 1, 2, 3], dtype=int64),
 Float64Index([ 0.6958157051790522, 0.09247488583114723, 0.26163913077544754,
               -1.2705407363402816],
              dtype='float64'))

In [49]:
# Column-oriented records: two parallel lists of five entries each
d1 = {
    "Gender": ['M', 'F', 'M', 'M', 'F'],
    "Age": [10, 23, 22, 12, 33],
}