# DataFrames

1. DataFrames are the workhorse of pandas and are directly inspired by the R programming language.

2. We can think of a DataFrame as a bunch of Series objects put together to share the same index.

In [1]:
import pandas as pd
import numpy as np
from numpy.random import randn

In [2]:
# Seed NumPy's global generator before the first random draw so that
# "Restart Kernel -> Run All" reproduces the same numbers every time.
np.random.seed(101)

# 5 rows x 4 columns of standard-normal samples; with no labels supplied,
# pandas assigns default integer labels to both rows and columns.
df = pd.DataFrame(randn(5,4))
df

Unnamed: 0,0,1,2,3
0,0.653533,0.57539,0.333679,-0.588364
1,-1.677524,-1.042971,1.054481,-1.286475
2,-0.611098,-1.624979,0.07234,-1.357343
3,-0.29739,1.738637,0.045619,-0.14114
4,-0.075386,0.160702,-0.761057,-1.105904


In [3]:
# Splitting on an explicit separator turns one string into a list of labels.
"W-X-Y-Z".split(sep="-")

['W', 'X', 'Y', 'Z']

In [4]:
# Same 5x4 random frame, but with explicit row labels A-E and column labels W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list("ABCDE"),
    columns=list("WXYZ"),
)

In [5]:
df

Unnamed: 0,W,X,Y,Z
A,-0.767763,0.660504,0.668966,-0.463456
B,1.255684,-0.064614,0.256751,-1.808088
C,-1.002467,0.92307,0.722617,1.314495
D,-2.068051,0.985353,-0.034651,-0.089781
E,-0.544011,0.099837,-0.636896,0.293113


## Selection and Indexing

Let's look at the various methods to grab data from a DataFrame.

In [6]:
# Bracket notation with a single column label returns that column.
df["W"]

A   -0.767763
B    1.255684
C   -1.002467
D   -2.068051
E   -0.544011
Name: W, dtype: float64

In [7]:
# Confirm what kind of object single-label selection hands back.
print(type(df["W"]))

<class 'pandas.core.series.Series'>


In [8]:
# Passing a list of labels selects several columns at once.
print(df[['W',"Y"]])
# The multi-column selection is a DataFrame, not a Series.
type(df[['W',"Y"]])

          W         Y
A -0.767763  0.668966
B  1.255684  0.256751
C -1.002467  0.722617
D -2.068051 -0.034651
E -0.544011 -0.636896


pandas.core.frame.DataFrame

In [9]:
df

Unnamed: 0,W,X,Y,Z
A,-0.767763,0.660504,0.668966,-0.463456
B,1.255684,-0.064614,0.256751,-1.808088
C,-1.002467,0.92307,0.722617,1.314495
D,-2.068051,0.985353,-0.034651,-0.089781
E,-0.544011,0.099837,-0.636896,0.293113


In [10]:
df

Unnamed: 0,W,X,Y,Z
A,-0.767763,0.660504,0.668966,-0.463456
B,1.255684,-0.064614,0.256751,-1.808088
C,-1.002467,0.92307,0.722617,1.314495
D,-2.068051,0.985353,-0.034651,-0.089781
E,-0.544011,0.099837,-0.636896,0.293113


DataFrame Columns are just Series

In [11]:
# Each individual column of the frame is a pandas Series.
type(df['W'])

pandas.core.series.Series

**Creating a new column:**

In [12]:
# Derive a new column as the element-wise sum of W and Y (aligned on the row index).
df['new'] = df['W'].add(df['Y'])

In [13]:
# A plain list works as a column too, as long as it has one entry per row (1..5 here).
df['new1'] = list(range(1, 6))

In [14]:
df

Unnamed: 0,W,X,Y,Z,new,new1
A,-0.767763,0.660504,0.668966,-0.463456,-0.098796,1
B,1.255684,-0.064614,0.256751,-1.808088,1.512435,2
C,-1.002467,0.92307,0.722617,1.314495,-0.279851,3
D,-2.068051,0.985353,-0.034651,-0.089781,-2.102702,4
E,-0.544011,0.099837,-0.636896,0.293113,-1.180907,5


In [15]:
# A (5, 2) draw yields a 2-D array: five rows with two values each.
randn(5,2)

array([[-0.88079861, -0.76467712],
       [-1.16502789,  0.05966052],
       [ 0.15418213, -0.26669924],
       [-0.63395441, -0.77129099],
       [-0.22988838, -0.68072576]])

In [16]:
# Overwrite the existing 'new1' column with fresh random values.
# A single column needs exactly one value per row, so draw a 1-D array of
# length 5; a (5, 2) array has two values per row and cannot be stored in one column.
df['new1'] = randn(5)

In [17]:
df

Unnamed: 0,W,X,Y,Z,new,new1
A,-0.767763,0.660504,0.668966,-0.463456,-0.098796,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,1.512435,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,-0.279851,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,-2.102702,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-1.180907,-0.277772


**Removing Columns**

In [18]:
# With no axis given, drop() looks the label up in the row index, so this
# returns a copy of the frame without row 'A' (it does not touch columns).
df.drop('A')

Unnamed: 0,W,X,Y,Z,new,new1
B,1.255684,-0.064614,0.256751,-1.808088,1.512435,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,-0.279851,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,-2.102702,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-1.180907,-0.277772


In [19]:
# drop() is not in place unless inplace=True is passed: the call above
# returned a new frame, so df itself still contains row 'A'.
df

Unnamed: 0,W,X,Y,Z,new,new1
A,-0.767763,0.660504,0.668966,-0.463456,-0.098796,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,1.512435,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,-0.279851,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,-2.102702,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-1.180907,-0.277772


In [20]:
# Keep the result of drop() under its own name; df is left unchanged.
dfd = df.drop(index="B")

In [21]:
dfd

Unnamed: 0,W,X,Y,Z,new,new1
A,-0.767763,0.660504,0.668966,-0.463456,-0.098796,-1.327585
C,-1.002467,0.92307,0.722617,1.314495,-0.279851,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,-2.102702,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-1.180907,-0.277772


In [22]:
# Remove the 'new' column from df itself (in place, nothing is returned).
df.drop(columns='new', inplace=True)

In [23]:
df

Unnamed: 0,W,X,Y,Z,new1
A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-0.277772


Can also drop rows this way:

In [24]:
# Explicitly target the row index: returns a copy of df without row 'E'.
df.drop(index='E')

Unnamed: 0,W,X,Y,Z,new1
A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,1.213534


**Selecting Rows**

In [25]:
df

Unnamed: 0,W,X,Y,Z,new1
A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-0.277772


In [26]:
# .loc selects by label: rows 'A' and 'B', column 'new1'.
df.loc[['A',"B"],"new1"]

A   -1.327585
B   -0.633915
Name: new1, dtype: float64

Or select based on position instead of label:

In [27]:
df

Unnamed: 0,W,X,Y,Z,new1
A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-0.277772


In [28]:
# .iloc is position-based, so it takes integer positions/slices rather than
# labels such as 'W' or 'A'.
# Rows at positions 2 and 3, and every column from position 1 onward.
df.iloc[2:4,1:]

Unnamed: 0,X,Y,Z,new1
C,0.92307,0.722617,1.314495,1.276368
D,0.985353,-0.034651,-0.089781,1.213534


**Selecting a subset of rows and columns**

In [38]:
df

Unnamed: 0,index,W,X,Y,Z,new1
0,A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
1,B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
2,C,-1.002467,0.92307,0.722617,1.314495,1.276368
3,D,-2.068051,0.985353,-0.034651,-0.089781,1.213534
4,E,-0.544011,0.099837,-0.636896,0.293113,-0.277772


In [30]:
# One row label plus one column label returns a single scalar value.
df.loc['B','Y']

0.2567508599091483

In [31]:
# Lists of labels on both axes return the matching sub-DataFrame.
df.loc[['A','C'],['X','Z']]

Unnamed: 0,X,Z
A,0.660504,-0.463456
C,0.92307,1.314495


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [None]:
# Temporarily add a string column (one value per row).
df['name']=['amas','tama','aaap','cap','qw']
df

In [None]:
# Remove the temporary 'name' column again, modifying df itself.
df.drop(columns="name", inplace=True)

In [33]:
# Comparing the whole frame element-wise yields a DataFrame of booleans.
df>0

Unnamed: 0,W,X,Y,Z,new1
A,False,True,True,False,False
B,True,False,True,False,False
C,False,True,True,True,True
D,False,True,False,False,True
E,False,True,False,True,False


In [None]:
# Indexing with the boolean frame keeps the original shape; entries where the
# mask is False are shown as NaN.
df[df>0]

In [None]:
# Filter rows with a boolean Series built from one column: keep only the rows
# whose W value is positive. (The frame has no 'W_1' column, so the condition
# must be built from an existing column such as 'W'.)
df[df['W'] > 0]


In [None]:
df

In [34]:
# Row filter and column selection in a single .loc call: 'new1' for the rows
# where W is positive. One indexing step avoids the intermediate frame that
# chained [][] indexing creates.
df.loc[df['W'] > 0, "new1"]

B   -0.633915
Name: new1, dtype: float64

In [35]:
# Same row filter, selecting several columns (in the order requested) with a
# single .loc call instead of chained [][] indexing.
df.loc[df['W'] > 0, ['Y', 'X', 'W']]

Unnamed: 0,Y,X,W
B,0.256751,-0.064614,1.255684


In [None]:
# dropna is a method and must be called; without the parentheses the cell only
# displays the bound method object. Returns a new frame without the rows that
# contain missing values (df itself is not modified).
df.dropna()

For two conditions you can use | and & with parentheses:

In [None]:
# Combine two conditions with & (and) or | (or). Each comparison must be wrapped
# in its own parentheses because & and | bind tighter than > in Python.
df[(df['W'] > 0) & (df['Y'] > 0)]

## More Index Details

More features of indexing, including resetting the index or setting it to something else.

In [None]:
df

In [36]:
df

Unnamed: 0,W,X,Y,Z,new1
A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
C,-1.002467,0.92307,0.722617,1.314495,1.276368
D,-2.068051,0.985353,-0.034651,-0.089781,1.213534
E,-0.544011,0.099837,-0.636896,0.293113,-0.277772


In [41]:
# Reset to default 0,1...n index; the old row labels move into an 'index' column.
# Returns a new frame and leaves df untouched, so re-running this cell is safe.
# The in-place form stacks up 'index' / 'level_0' columns on every re-run and
# eventually fails with "cannot insert level_0, already exists".
df.reset_index()

ValueError: cannot insert level_0, already exists

In [42]:
df

Unnamed: 0,level_0,index,W,X,Y,Z,new1
0,0,A,-0.767763,0.660504,0.668966,-0.463456,-1.327585
1,1,B,1.255684,-0.064614,0.256751,-1.808088,-0.633915
2,2,C,-1.002467,0.92307,0.722617,1.314495,1.276368
3,3,D,-2.068051,0.985353,-0.034651,-0.089781,1.213534
4,4,E,-0.544011,0.099837,-0.636896,0.293113,-0.277772


In [43]:
# Inspect the current column labels of the frame.
df.columns

Index(['level_0', 'index', 'W', 'X', 'Y', 'Z', 'new1'], dtype='object')

In [44]:
# State codes to use as new row labels, one per row of df.
newind = ['CA', 'NY', 'WY', 'OR', 'CO']

In [None]:
newind

In [45]:
# Attach the state codes as a regular column (one entry per row).
df['States'] = newind

In [46]:
df

Unnamed: 0,level_0,index,W,X,Y,Z,new1,States
0,0,A,-0.767763,0.660504,0.668966,-0.463456,-1.327585,CA
1,1,B,1.255684,-0.064614,0.256751,-1.808088,-0.633915,NY
2,2,C,-1.002467,0.92307,0.722617,1.314495,1.276368,WY
3,3,D,-2.068051,0.985353,-0.034651,-0.089781,1.213534,OR
4,4,E,-0.544011,0.099837,-0.636896,0.293113,-0.277772,CO


In [None]:
# Promote the 'States' column to the row index, replacing the previous index on df itself.
df.set_index(keys='States', inplace=True)

In [None]:
df

In [None]:
# Move the current index back into a regular column and restore the default
# integer index; inplace=True mutates df, so this cell is not safe to re-run blindly.
df.reset_index(inplace = True)

In [None]:
# Make 'States' the row index once more (in place).
df.set_index('States',inplace=True)

In [None]:
df

# END

In [None]:
import numpy as np

In [None]:
# Small 3x3 integer array used to demonstrate boolean masking in NumPy.
a = np.array([
    [1, 3, 4],
    [9, 0, 1],
    [1, 0, 3],
])

In [None]:
a

In [None]:
# Element-wise comparison: a boolean array with the same shape as a.
a==1

In [None]:
# Boolean-mask assignment: every element equal to 1 is overwritten with 22, in place.
a[a==1]=22

In [None]:
a

In [None]:
# Row at integer position 1 (the second row), returned as a Series.
df.iloc[1]

In [None]:
# Encode that row's values as integer codes; factorize returns the codes
# together with the unique values they refer to.
pd.factorize(df.iloc[1])

In [None]:
# Column-oriented sample data: each key is a column name, each list its values.
d1 = dict(
    Gender=['M', 'F', 'M', 'M', 'F'],
    Age=[10, 23, 22, 12, 33],
)

# practice


In [None]:
# Practice frame: five standard-normal samples in a single, default-labelled column.
df = pd.DataFrame(data=np.random.randn(5))