In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [3]:
# Build a small Series holding 0..3, labelled 'a'..'d', for the indexing examples below.
obj = Series(np.arange(4), index=['a', 'b', 'c', 'd'])
# End the cell with the bare name so the notebook displays the value itself
# (no print() wrapper needed for the last expression of a cell).
obj

a    0
b    1
c    2
d    3
dtype: int64


In [4]:
# Attribute-style lookup: returns the value stored under the label 'a'.
obj.a

0

In [5]:
# Same attribute-style lookup, this time for the last label, 'd'.
obj.d

3

In [6]:
# Fetch the last element by position. The index holds string labels, so a bare
# obj[-1] depends on pandas' deprecated "integer key falls back to position"
# behaviour; .iloc states the positional intent explicitly.
obj.iloc[-1]

3

In [9]:
# Fix the seed so the random values (and the outputs shown below) are reproducible.
np.random.seed(25)
# 5 rows labelled 'a'..'e' by 4 columns labelled 'A'..'D' of standard-normal draws.
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['A', 'B', 'C', 'D'],
)

df

Unnamed: 0,A,B,C,D
a,0.228273,1.02689,-0.839585,-0.591182
b,-0.956888,-0.222326,-0.619915,1.837905
c,-2.053231,0.868583,-0.920734,-0.232312
d,2.152957,-1.334661,0.07638,-1.246089
e,1.202272,-1.049942,1.05661,-0.419678


In [10]:
# Label-based selection: rows 'a' and 'b', restricted to columns 'A' and 'B'.
df.loc[['a', 'b'], ['A', 'B']]

Unnamed: 0,A,B
a,0.228273,1.02689
b,-0.956888,-0.222326


Slicing data by index label (both end labels are included)

In [11]:
# Label slice from 'a' through 'c'; the stop label is part of the result.
obj.loc['a':'c']

a    0
b    1
c    2
dtype: int64

Comparing with scalars (element-wise; the result is a Boolean DataFrame)

In [13]:
# Show the frame so the comparison result in the next cell can be checked against its values.
df

Unnamed: 0,A,B,C,D
a,0.228273,1.02689,-0.839585,-0.591182
b,-0.956888,-0.222326,-0.619915,1.837905
c,-2.053231,0.868583,-0.920734,-0.232312
d,2.152957,-1.334661,0.07638,-1.246089
e,1.202272,-1.049942,1.05661,-0.419678


In [15]:
# Element-wise test against a scalar: True wherever the cell value is below 0.2.
df < 0.2

Unnamed: 0,A,B,C,D
a,False,False,True,True
b,True,True,True,False
c,True,False,True,True
d,False,True,True,True
e,False,True,False,True


Filtering with scalars (keep only the entries where the comparison is True)

In [17]:
# `obj > 2` builds a Boolean mask; indexing with it keeps only the entries where the mask is True.
obj[obj > 2]

d    3
dtype: int64

setting values with scalars

In [21]:
# Show the Series as it stands before any values are overwritten.
obj

a    0
b    1
c    2
d    3
dtype: int64

In [22]:
# Assign the scalar 5 to the entries labelled 'a' and 'c'.
# The labels are passed as a list: a bare `obj['a', 'c']` is a single tuple key,
# which pandas does not treat as a multi-label selection on a plain (non-Multi) index.
obj.loc[['a', 'c']] = 5
obj

a    5
b    1
c    5
d    3
dtype: int64

In [19]:
# Reseed so this larger frame is reproducible as well.
np.random.seed(25)
# 10x10 frame of standard-normal draws: rows 'a'..'j', columns 'A'..'J'.
df = pd.DataFrame(
    data=np.random.randn(10, 10),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'],
    columns=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
)


In [20]:
# Display the full 10x10 frame.
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
a,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326,-0.619915,1.837905,-2.053231,0.868583
b,-0.920734,-0.232312,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942,1.05661,-0.419678
c,2.294842,-2.594487,2.822756,0.680889,-1.577693,-1.976254,0.53334,-0.29087,-0.51352,1.982626
d,0.226001,-1.839905,1.607671,0.388292,0.399732,0.405477,0.217002,-0.633439,0.246622,-1.939546
e,0.11406,-1.885341,0.24308,-0.705481,0.364628,-0.502952,-0.225752,-0.565538,0.103395,2.018408
f,1.094248,1.662434,-0.627453,1.6212,1.178133,-0.374879,-0.544329,0.287761,-0.20582,1.189988
g,0.728927,-0.22204,-1.622706,0.312541,-1.160421,0.31356,0.471998,0.577862,0.505407,-0.626488
h,-0.346369,-2.065942,0.768936,1.128866,0.166924,-0.967255,0.491996,-0.550857,-0.084694,1.967568
i,-0.062675,-0.851136,0.426521,-0.277561,-1.377945,-0.095196,0.833639,-0.784754,1.046145,-0.645784
j,-1.891579,-0.097333,-1.358895,0.49831,-1.147321,-0.536521,-0.916489,-0.212148,0.192844,-0.322683


TREATING MISSING VALUES

In [23]:
# NaN ("not a number") is the marker used here for a missing entry.
missing_data = np.nan
# Five labelled entries, two of them ('c' and 'e') deliberately missing.
obj = pd.Series(
    data=[1, 2, missing_data, 4, missing_data],
    index=['a', 'b', 'c', 'd', 'e'],
)
obj

a    1.0
b    2.0
c    NaN
d    4.0
e    NaN
dtype: float64

In [25]:
# Boolean mask marking the missing entries: True where the value is NaN, False elsewhere.
obj.isnull()

a    False
b    False
c     True
d    False
e     True
dtype: bool

filling in missing values

In [26]:
# Rebuild the same seeded 10x10 frame so the missing-value examples start from known data.
np.random.seed(25)
df = pd.DataFrame(
    data=np.random.randn(10, 10),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'],
    columns=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
)
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
a,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326,-0.619915,1.837905,-2.053231,0.868583
b,-0.920734,-0.232312,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942,1.05661,-0.419678
c,2.294842,-2.594487,2.822756,0.680889,-1.577693,-1.976254,0.53334,-0.29087,-0.51352,1.982626
d,0.226001,-1.839905,1.607671,0.388292,0.399732,0.405477,0.217002,-0.633439,0.246622,-1.939546
e,0.11406,-1.885341,0.24308,-0.705481,0.364628,-0.502952,-0.225752,-0.565538,0.103395,2.018408
f,1.094248,1.662434,-0.627453,1.6212,1.178133,-0.374879,-0.544329,0.287761,-0.20582,1.189988
g,0.728927,-0.22204,-1.622706,0.312541,-1.160421,0.31356,0.471998,0.577862,0.505407,-0.626488
h,-0.346369,-2.065942,0.768936,1.128866,0.166924,-0.967255,0.491996,-0.550857,-0.084694,1.967568
i,-0.062675,-0.851136,0.426521,-0.277561,-1.377945,-0.095196,0.833639,-0.784754,1.046145,-0.645784
j,-1.891579,-0.097333,-1.358895,0.49831,-1.147321,-0.536521,-0.916489,-0.212148,0.192844,-0.322683


In [31]:
# Positional assignment: rows at positions 1-2 ('b', 'c') of the first three
# columns ('A', 'B', 'C') are overwritten with the missing-value marker.
# NOTE(review): the saved output below shows extra columns 0 and 5 and NaNs in
# columns A and F that this statement alone cannot produce — it looks like
# leftover state from cells run out of order; re-run on a fresh kernel to confirm.
df.iloc[1:3, 0:3] = missing_data
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,0,5
a,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326,-0.619915,1.837905,-2.053231,0.868583,,
b,,,,-1.334661,0.07638,,1.202272,-1.049942,1.05661,-0.419678,,
c,,,,0.680889,-1.577693,,0.53334,-0.29087,-0.51352,1.982626,,
d,,-1.839905,1.607671,0.388292,0.399732,,0.217002,-0.633439,0.246622,-1.939546,,
e,,-1.885341,0.24308,-0.705481,0.364628,-0.502952,-0.225752,-0.565538,0.103395,2.018408,,
f,1.094248,1.662434,-0.627453,1.6212,1.178133,-0.374879,-0.544329,0.287761,-0.20582,1.189988,,
g,0.728927,-0.22204,-1.622706,0.312541,-1.160421,0.31356,0.471998,0.577862,0.505407,-0.626488,,
h,-0.346369,-2.065942,0.768936,1.128866,0.166924,-0.967255,0.491996,-0.550857,-0.084694,1.967568,,
i,-0.062675,-0.851136,0.426521,-0.277561,-1.377945,-0.095196,0.833639,-0.784754,1.046145,-0.645784,,
j,-1.891579,-0.097333,-1.358895,0.49831,-1.147321,-0.536521,-0.916489,-0.212148,0.192844,-0.322683,,


In [34]:
# Remove the stray column labelled 0 if it is present.
# errors="ignore" keeps this cell from raising KeyError on a fresh kernel, where
# the frame built above only has columns 'A'..'J'; reassigning instead of using
# inplace=True makes the cell safe to re-run.
df = df.drop(columns=[0], errors="ignore")

In [36]:
# Remove the stray column labelled 5 if it is present.
# errors="ignore" keeps this cell from raising KeyError on a fresh kernel, where
# the frame built above only has columns 'A'..'J'; reassigning instead of using
# inplace=True makes the cell safe to re-run.
df = df.drop(columns=[5], errors="ignore")

In [37]:
# Display the frame again to check which columns and missing values remain.
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
a,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326,-0.619915,1.837905,-2.053231,0.868583
b,,,,-1.334661,0.07638,,1.202272,-1.049942,1.05661,-0.419678
c,,,,0.680889,-1.577693,,0.53334,-0.29087,-0.51352,1.982626
d,,-1.839905,1.607671,0.388292,0.399732,,0.217002,-0.633439,0.246622,-1.939546
e,,-1.885341,0.24308,-0.705481,0.364628,-0.502952,-0.225752,-0.565538,0.103395,2.018408
f,1.094248,1.662434,-0.627453,1.6212,1.178133,-0.374879,-0.544329,0.287761,-0.20582,1.189988
g,0.728927,-0.22204,-1.622706,0.312541,-1.160421,0.31356,0.471998,0.577862,0.505407,-0.626488
h,-0.346369,-2.065942,0.768936,1.128866,0.166924,-0.967255,0.491996,-0.550857,-0.084694,1.967568
i,-0.062675,-0.851136,0.426521,-0.277561,-1.377945,-0.095196,0.833639,-0.784754,1.046145,-0.645784
j,-1.891579,-0.097333,-1.358895,0.49831,-1.147321,-0.536521,-0.916489,-0.212148,0.192844,-0.322683


In [40]:
# Count the missing (NaN) entries in each column.
# NOTE(review): the saved counts (A=4, F=3) do not match what the visible cells
# produce (only rows 'b' and 'c' of A, B, C are set to NaN) — likely stale output.
df.isna().sum()


A    4
B    2
C    2
D    0
E    0
F    3
G    0
H    0
I    0
J    0
dtype: int64

In [41]:
# Replace every missing entry with 0 and show the filled frame.
df = df.fillna(value=0)
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
a,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326,-0.619915,1.837905,-2.053231,0.868583
b,0.0,0.0,0.0,-1.334661,0.07638,0.0,1.202272,-1.049942,1.05661,-0.419678
c,0.0,0.0,0.0,0.680889,-1.577693,0.0,0.53334,-0.29087,-0.51352,1.982626
d,0.0,-1.839905,1.607671,0.388292,0.399732,0.0,0.217002,-0.633439,0.246622,-1.939546
e,0.0,-1.885341,0.24308,-0.705481,0.364628,-0.502952,-0.225752,-0.565538,0.103395,2.018408
f,1.094248,1.662434,-0.627453,1.6212,1.178133,-0.374879,-0.544329,0.287761,-0.20582,1.189988
g,0.728927,-0.22204,-1.622706,0.312541,-1.160421,0.31356,0.471998,0.577862,0.505407,-0.626488
h,-0.346369,-2.065942,0.768936,1.128866,0.166924,-0.967255,0.491996,-0.550857,-0.084694,1.967568
i,-0.062675,-0.851136,0.426521,-0.277561,-1.377945,-0.095196,0.833639,-0.784754,1.046145,-0.645784
j,-1.891579,-0.097333,-1.358895,0.49831,-1.147321,-0.536521,-0.916489,-0.212148,0.192844,-0.322683


In [42]:
# New seeded 10x10 frame; no labels are given, so rows and columns get the default 0..9.
np.random.seed(50)
df = pd.DataFrame(data=np.random.randn(10, 10))
# Punch holes into it: rows 3-4 of column 0 and rows 1-3 of column 5 become NaN.
df.iloc[[3, 4], 0] = np.nan
df.iloc[[1, 2, 3], 5] = np.nan
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.560352,-0.030978,-0.620928,-1.46458,1.411946,-0.476732,-0.780469,1.070268,-1.282293,-1.327479
1,0.126338,0.862194,0.696737,-0.334565,-0.997526,,3.314075,0.98777,0.123866,0.742785
2,-0.393956,0.148116,-0.412234,-0.160715,0.139531,,-0.281262,1.710907,-0.149767,0.690307
3,,1.338409,-1.368982,0.486428,0.753522,,-0.31471,1.373281,-0.624417,0.375754
4,,0.743038,0.857362,-1.506189,-1.666352,-0.218995,-0.358858,0.378528,0.684215,-1.167856
5,-0.793217,-0.038835,2.705255,-1.491389,0.095978,0.524687,0.815666,0.051506,-0.164555,0.278199
6,0.087115,0.034268,0.746569,-0.943638,-0.245777,1.108121,0.039008,-0.213267,-0.890955,-0.270646
7,0.222244,0.251814,0.707926,0.49399,1.471002,-0.582619,2.065819,1.087834,0.805189,-1.587885
8,1.230885,-2.373453,-0.031111,-3.80989,-0.19905,0.350431,-0.052551,-0.634281,-0.362931,-2.57092
9,0.137515,-0.719296,0.728523,1.621745,1.521662,-1.306773,-0.889768,0.005268,0.866425,-0.535536


In [43]:
# Count the missing (NaN) entries in each column of the new frame.
df.isnull().sum()

0    2
1    0
2    0
3    0
4    0
5    3
6    0
7    0
8    0
9    0
dtype: int64

In [44]:
# Keep only the rows without any missing value (row-wise drop on any NaN).
df = df.dropna(axis=0, how='any')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.560352,-0.030978,-0.620928,-1.46458,1.411946,-0.476732,-0.780469,1.070268,-1.282293,-1.327479
5,-0.793217,-0.038835,2.705255,-1.491389,0.095978,0.524687,0.815666,0.051506,-0.164555,0.278199
6,0.087115,0.034268,0.746569,-0.943638,-0.245777,1.108121,0.039008,-0.213267,-0.890955,-0.270646
7,0.222244,0.251814,0.707926,0.49399,1.471002,-0.582619,2.065819,1.087834,0.805189,-1.587885
8,1.230885,-2.373453,-0.031111,-3.80989,-0.19905,0.350431,-0.052551,-0.634281,-0.362931,-2.57092
9,0.137515,-0.719296,0.728523,1.621745,1.521662,-1.306773,-0.889768,0.005268,0.866425,-0.535536


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.560352,-0.030978,-0.620928,-1.46458,1.411946,-0.476732,-0.780469,1.070268,-1.282293,-1.327479
5,-0.793217,-0.038835,2.705255,-1.491389,0.095978,0.524687,0.815666,0.051506,-0.164555,0.278199
6,0.087115,0.034268,0.746569,-0.943638,-0.245777,1.108121,0.039008,-0.213267,-0.890955,-0.270646
7,0.222244,0.251814,0.707926,0.49399,1.471002,-0.582619,2.065819,1.087834,0.805189,-1.587885
8,1.230885,-2.373453,-0.031111,-3.80989,-0.19905,0.350431,-0.052551,-0.634281,-0.362931,-2.57092
9,0.137515,-0.719296,0.728523,1.621745,1.521662,-1.306773,-0.889768,0.005268,0.866425,-0.535536
