A DataFrame is a two-dimensional table of rows and columns; each column is a Series.

In [37]:
import numpy as np

In [38]:
import pandas as pd

In [39]:
from numpy.random import randn

In [40]:
# Seed NumPy's legacy global RNG (the one `randn` draws from) so this cell
# builds the same 5x4 frame on every Restart & Run All.
np.random.seed(101)
df = pd.DataFrame(randn(5, 4), columns=['A', 'B', 'C', 'D'])

In [41]:
df

Unnamed: 0,A,B,C,D
0,-0.257519,0.519185,0.262559,-0.344353
1,-1.023082,0.324145,0.020226,-0.454271
2,-2.02888,0.78309,-0.599509,2.055036
3,1.274471,0.382327,0.151419,-0.284244
4,1.289736,0.327073,1.043527,-0.110551


To select a single column

In [42]:
# Indexing with a single column label returns that column as a Series.
df['A']

0   -0.257519
1   -1.023082
2   -2.028880
3    1.274471
4    1.289736
Name: A, dtype: float64

To select multiple columns

In [43]:
# Indexing with a list of column labels returns a frame with just those columns.
df[['A','B']]

Unnamed: 0,A,B
0,-0.257519,0.519185
1,-1.023082,0.324145
2,-2.02888,0.78309
3,1.274471,0.382327
4,1.289736,0.327073


To add a new column

In [44]:
# New column E: row-wise sum of columns A and B.
df['E'] = df['A'].add(df['B'])

In [45]:
df

Unnamed: 0,A,B,C,D,E
0,-0.257519,0.519185,0.262559,-0.344353,0.261665
1,-1.023082,0.324145,0.020226,-0.454271,-0.698938
2,-2.02888,0.78309,-0.599509,2.055036,-1.245789
3,1.274471,0.382327,0.151419,-0.284244,1.656799
4,1.289736,0.327073,1.043527,-0.110551,1.616809


To drop a column

In [46]:
# Rebind `df` to a frame without column E instead of mutating in place.
# errors='ignore' keeps the cell re-runnable: a second run finds no E column
# and is a no-op rather than a KeyError.
df = df.drop(columns=['E'], errors='ignore')

In [47]:
df

Unnamed: 0,A,B,C,D
0,-0.257519,0.519185,0.262559,-0.344353
1,-1.023082,0.324145,0.020226,-0.454271
2,-2.02888,0.78309,-0.599509,2.055036
3,1.274471,0.382327,0.151419,-0.284244
4,1.289736,0.327073,1.043527,-0.110551


In [48]:
# Check the container type of the whole table.
type(df)

pandas.core.frame.DataFrame

In [49]:
# Check the type of a single selected column.
type(df['A'])

pandas.core.series.Series

To filter a data frame's rows with a condition on multiple columns

In [50]:
# Keep rows where A > 1 and B > 0, and only columns A, B and C.
# A single .loc[row_mask, column_list] lookup replaces the chained [][] form,
# which first materialises the filtered frame and then indexes it again.
df.loc[(df['A'] > 1) & (df['B'] > 0), ['A', 'B', 'C']]

Unnamed: 0,A,B,C
3,1.274471,0.382327,0.151419
4,1.289736,0.327073,1.043527


Missing data

Using the copy method when creating a new data frame avoids overwriting the existing one

In [51]:
# Work on an independent (deep) copy so the edits below leave `df` untouched.
df_missing = df.copy(deep=True)

Set column A to NaN in every row where A > 0

In [52]:
# Mark every positive value in column A as missing.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
df_missing.loc[df_missing['A'] > 0, 'A'] = np.nan

In [53]:
# Mark every positive value in column B as missing.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
df_missing.loc[df_missing['B'] > 0, 'B'] = np.nan

In [54]:
df_missing

Unnamed: 0,A,B,C,D
0,-0.257519,,0.262559,-0.344353
1,-1.023082,,0.020226,-0.454271
2,-2.02888,,-0.599509,2.055036
3,,,0.151419,-0.284244
4,,,1.043527,-0.110551


In [55]:
# Replace the missing entries of column A with the constant 99.
df_missing['A'] = df_missing['A'].fillna(99)

In [56]:
# Replace the missing entries of column B with the mean of B.
# NOTE(review): if every value in B is missing, the mean is itself NaN and
# nothing gets filled -- which appears to be the case in the stored output.
df_missing['B'] = df_missing['B'].fillna(df_missing['B'].mean())

In [57]:
df_missing

Unnamed: 0,A,B,C,D
0,-0.257519,,0.262559,-0.344353
1,-1.023082,,0.020226,-0.454271
2,-2.02888,,-0.599509,2.055036
3,99.0,,0.151419,-0.284244
4,99.0,,1.043527,-0.110551


Group by method

To create a column whose values depend on a condition on another column

In [71]:
# Label each row 'A' when C is non-negative, otherwise 'B'.
df['Category'] = np.where(df['C'].ge(0), 'A', 'B')

In [72]:
df

Unnamed: 0,A,B,C,D,Category
0,-0.257519,0.519185,0.262559,-0.344353,A
1,-1.023082,0.324145,0.020226,-0.454271,A
2,-2.02888,0.78309,-0.599509,2.055036,B
3,1.274471,0.382327,0.151419,-0.284244,A
4,1.289736,0.327073,1.043527,-0.110551,A


In [60]:
# Group the rows by Category and sum the remaining columns within each group.
df.groupby('Category').sum()

Unnamed: 0_level_0,A,B,C,D
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,-0.745274,2.33582,0.878222,0.861616


In [61]:
# Same grouping, but pick column C first so only its per-category sum is computed.
df.groupby('Category')['C'].sum()

Category
A    0.878222
Name: C, dtype: float64

In [62]:
# Label each row 'A' when B is non-negative, otherwise 'B'.
# A missing B fails the comparison, so those rows are labelled 'B'.
df_missing['Category'] = np.where(df_missing['B'].ge(0), 'A', 'B')

In [63]:
df_missing

Unnamed: 0,A,B,C,D,Category
0,-0.257519,,0.262559,-0.344353,B
1,-1.023082,,0.020226,-0.454271,B
2,-2.02888,,-0.599509,2.055036,B
3,99.0,,0.151419,-0.284244,B
4,99.0,,1.043527,-0.110551,B


In [64]:
df

Unnamed: 0,A,B,C,D,Category
0,-0.257519,0.519185,0.262559,-0.344353,A
1,-1.023082,0.324145,0.020226,-0.454271,A
2,-2.02888,0.78309,-0.599509,2.055036,A
3,1.274471,0.382327,0.151419,-0.284244,A
4,1.289736,0.327073,1.043527,-0.110551,A


To append data frames

In [65]:
# Stack df_missing underneath df. Each frame keeps its own row labels,
# so the index values 0-4 appear twice in the result.
pd.concat([df,df_missing])

Unnamed: 0,A,B,C,D,Category
0,-0.257519,0.519185,0.262559,-0.344353,A
1,-1.023082,0.324145,0.020226,-0.454271,A
2,-2.02888,0.78309,-0.599509,2.055036,A
3,1.274471,0.382327,0.151419,-0.284244,A
4,1.289736,0.327073,1.043527,-0.110551,A
0,-0.257519,,0.262559,-0.344353,B
1,-1.023082,,0.020226,-0.454271,B
2,-2.02888,,-0.599509,2.055036,B
3,99.0,,0.151419,-0.284244,B
4,99.0,,1.043527,-0.110551,B


To gather unique values in a column

In [76]:
# Distinct values of the Category column, returned as a NumPy array.
df['Category'].unique()

array(['A', 'B'], dtype=object)

To gather number of unique values in a column

In [78]:
# Number of distinct values in the Category column.
df['Category'].nunique()

2

To count how many times each unique value occurs

In [80]:
# Occurrence count for each distinct value in the Category column.
df['Category'].value_counts()

A    4
B    1
Name: Category, dtype: int64

To apply a function to a data frame column

In [81]:
def times2(x):
    """Return ``x`` raised to the power of two.

    Note: despite the name, this squares its argument; it does not double it.
    """
    return pow(x, 2)

In [82]:
# Call times2 on each value of column A; the results come back as a new Series.
df['A'].apply(times2)

0    0.066316
1    1.046698
2    4.116352
3    1.624277
4    1.663419
Name: A, dtype: float64

In [83]:
# apply also accepts built-ins: len gives the string length of each label.
df['Category'].apply(len)

0    1
1    1
2    1
3    1
4    1
Name: Category, dtype: int64