In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import matplotlib.pyplot as plt

### Pandas Series

In [6]:
temps = [39, 45, 60, 90]
temp_ser = pd.Series(temps, name='Temperature')

In [7]:
temp_ser

0    39
1    45
2    60
3    90
Name: Temperature, dtype: int64

In [8]:
temp_ser.mean()       

58.5

In [12]:
temp_ser.sum()

234

In [14]:
# print(dir(temp_ser))

#### Boolean array

In [15]:
hot = pd.Series([False, False, True, True])
hot

0    False
1    False
2     True
3     True
dtype: bool

In [16]:
temp_ser[hot]

2    60
3    90
Name: Temperature, dtype: int64

In [18]:
mask = temp_ser > 55
mask2 = temp_ser < 90

In [20]:
temp_ser[mask & mask2]

2    60
Name: Temperature, dtype: int64

In [22]:
temp_ser[mask | mask2]

0    39
1    45
2    60
3    90
Name: Temperature, dtype: int64

In [23]:
temp_ser[~mask]

0    39
1    45
Name: Temperature, dtype: int64

In [24]:
temp_ser[(temp_ser > 55) & (temp_ser < 90)]

2    60
Name: Temperature, dtype: int64

#### Index

In [25]:
temp_ser.index

RangeIndex(start=0, stop=4, step=1)

In [27]:
temp2 = pd.Series(temps, name='Temp2', index=['M','T','W','Th'])

In [29]:
temp2

M     39
T     45
W     60
Th    90
Name: Temp2, dtype: int64

In [30]:
temp2.index

Index([u'M', u'T', u'W', u'Th'], dtype='object')

In [32]:
temp2['M']

39

In [39]:
temp2[temp2 == 39].index.values[0]

'M'

In [43]:
temp2[temp2 > 50].index.values

array(['W', 'Th'], dtype=object)

In [45]:
dates = pd.date_range('20160101', periods=4)
dates

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04'], dtype='datetime64[ns]', freq='D')

In [46]:
temp3 = pd.Series(temps, name='Temp3', index=dates)

In [48]:
temp3

2016-01-01    39
2016-01-02    45
2016-01-03    60
2016-01-04    90
Freq: D, Name: Temp3, dtype: int64

In [50]:
temp3.index

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04'], dtype='datetime64[ns]', freq='D')

In [51]:
temp4 = pd.Series(temps, name='temp4', index=[0,1,1,3])

In [52]:
temp4

0    39
1    45
1    60
3    90
Name: temp4, dtype: int64

In [54]:
## Duplicate index entries are allowed
temp4[1]

1    45
1    60
Name: temp4, dtype: int64

#### Pandas data types

In [60]:
ser = pd.Series(range(10))
ser2 = pd.Series([1.1,2,3,4])
ser3 = pd.Series(['a','b','c'])
ser3
print(ser.dtype, ser2.dtype, ser3.dtype)
# ser is int64, ser2 is float64, ser3 is python object


(dtype('int64'), dtype('float64'), dtype('O'))


In [65]:
### Mixed types in a series
ser4 = pd.Series([{'name': 'Rupen'}, ['a','b','c'], (2,3)])
print(ser4)
print(ser4.dtype)

0    {u'name': u'Rupen'}
1              [a, b, c]
2                 (2, 3)
dtype: object
object


In [70]:
ser5 = pd.Series(['2015-01-01', '2016-01-01', '2017-01-01'])
print(ser5.dtype)
ser6 = pd.to_datetime(ser5)
print(ser6.dtype)
ser6

object
datetime64[ns]


0   2015-01-01
1   2016-01-01
2   2017-01-01
dtype: datetime64[ns]

In [71]:
## Categorical datatype
ser7 = pd.Series(['Prius', 'Accord', 'Prius', 'Camry'])
ser8 = pd.Series(['Prius', 'Accord', 'Prius', 'Camry'], dtype='category')
ser8

0     Prius
1    Accord
2     Prius
3     Camry
dtype: category
Categories (3, object): [Accord, Camry, Prius]

### More series

In [87]:
# Rebind `temps` (the first reading is now 30) and build a default-indexed Series.
temps = [30,45,60,90]
s = pd.Series(temps)
# Iterating over a Series yields its values.
for num in s:
    print(num)
# Membership on the plain Python list checks the values.
print(45 in temps)
print(47 in temps)

## `in` applied directly to a Series looks at the index labels, not the values,
## so test against `s.values` or `set(s)` to check membership among the values.
print(45 in s.values)
print(45 in set(s))

30
45
60
90
True
False
True
True


In [89]:
dict(s)

{0: 30, 1: 45, 2: 60, 3: 90}

In [92]:
# Walk the Series as (index label, value) pairs. `items()` is the supported
# spelling; `iteritems()` was deprecated and later removed from pandas.
for i, val in s.items():
    print(i, val)

(0, 30)
(1, 45)
(2, 60)
(3, 90)


### Broadcasting

In [102]:
s + 2
s + s
s - s
s * s
s + 100
s > 10
## Bitwise operations
s ^ 4
s | 4
s & 4

0    4
1    4
2    4
3    0
dtype: int64

In [104]:
s2 = pd.Series([10,20,30], index=[2,3,4])
s + s2

0      NaN
1      NaN
2     70.0
3    110.0
4      NaN
dtype: float64

In [108]:
def add_2(val):
    """Return ``val`` increased by two (used below as a ``Series.apply`` callback)."""
    result = val + 2
    return result

def add_num(val, num):
    """Return the sum of ``val`` and ``num``.

    Used below with ``Series.apply``: ``val`` is the element being processed
    and ``num`` is the extra positional argument passed through ``args=(...)``.
    """
    total = val + num
    return total

s.apply(add_2)
s.apply(add_num, args=(10,))

0     40
1     55
2     70
3    100
dtype: int64

In [113]:
s.apply(float)
s.astype(float)
s.astype(str) + " test"

0    30 test
1    45 test
2    60 test
3    90 test
dtype: object

## CRUD

In [130]:
# By Index
s[0]

# By Label
s.loc[1]

# Relative position
s.iloc[0]
s.iloc[-1]
s.iloc[-2]

60

In [133]:
temp2 = pd.Series(temps, index=['M','T','W','Th'])
temp2

M     30
T     45
W     60
Th    90
dtype: int64

In [136]:
## With a non-integer (string) index, plain [] accepts both labels and
## integer positions; only the last expression (temp2[-1]) is displayed.
## NOTE(review): newer pandas releases deprecate the integer-position
## fallback of [] on a Series -- prefer .iloc for positions; verify against
## the installed pandas version.
temp2[0]      # by position: first element
temp2['M']    # by label
temp2[-1]     # by position: last element

90

In [142]:
temp2.loc['M']
# temp2.loc[1] ## Will not work
temp2.iloc[0]
temp2.iloc[-1]

90

In [144]:
temp2.M

30

In [152]:
## `T` is the Series transpose property, so attribute access returns the
## whole Series rather than the value stored under the 'T' label
## (use temp2['T'] to read that label).
temp2.T
temp2.transpose()

M     30
T     45
W     60
Th    90
dtype: int64

In [154]:
## Mixed index (Integer and String)
temp3 = pd.Series(temps, index=['M', 'T', 0, 1])
temp3

M    30
T    45
0    60
1    90
dtype: int64

In [156]:
temp3['M']

30

In [159]:
temp3[1]

90

In [162]:
temp3[-1]

90

### Update

In [164]:
temp2

M     30
T     45
W     60
Th    90
dtype: int64

In [165]:
temp2.W = 70
temp2

M     30
T     45
W     70
Th    90
dtype: int64

In [174]:
temp2['W'] = 60
temp2.T = 50
temp2
temp2.iloc[-1] = 100
temp2.iloc[3] = 90
temp2.iloc[1] = 45
temp2.loc['M'] = 35
temp2.loc['M'] = 30
temp2

M     30
T     45
W     60
Th    90
dtype: int64

In [180]:
## pd.concat returns a new, longer Series (similar to extending a list, except
## that temp2 itself is left unchanged). Series.append was removed in pandas 2.0.
pd.concat([temp2, pd.Series(100, index=['F'])])

M      30
T      45
W      60
Th     90
F     100
dtype: int64

In [185]:
## Label-based assignment updates the Series in place; assigning to a label
## that does not exist yet ('S') adds a new entry at the end.
## (Series.set_value was deprecated and later removed from pandas.)
temp2.loc['M'] = 10
temp2
temp2.loc['S'] = 95
temp2

M     10
T     45
W     60
Th    90
S     95
dtype: int64

### Delete

In [187]:
## Regular Python Dictionary
d = {'foo': 'bar'}
del d['foo']
d

{}

In [None]:
del temp2['S']

In [191]:
temp2

M     10
T     45
W     60
Th    90
dtype: int64

In [194]:
temp2[temp2 < 94]

M     10
T     45
W     60
Th    90
dtype: int64

In [197]:
mask = temp2.index == 'T'
temp2[mask]

T    45
dtype: int64

### Summary Statistics

In [200]:
temp2.M = 30
temp2

M     30
T     45
W     60
Th    90
dtype: int64

In [201]:
temp2.mean()

56.25

In [202]:
temp2.median()

52.5

In [203]:
temp2.mode()

Series([], dtype: int64)

In [204]:
temp2.describe() ## Returns a series of stats

count     4.000000
mean     56.250000
std      25.617377
min      30.000000
25%      41.250000
50%      52.500000
75%      67.500000
max      90.000000
dtype: float64

In [205]:
temp2.value_counts() ## Grouped categorical data

30    1
45    1
60    1
90    1
dtype: int64

In [207]:
ser8
ser8.describe()

count         4
unique        3
top       Prius
freq          2
dtype: object

In [208]:
ser8.value_counts()

Prius     2
Camry     1
Accord    1
dtype: int64

In [210]:
temp2.quantile(.1)

34.5

In [213]:
temp2.describe(percentiles=[0.05,0.1,0.2])

count     4.000000
mean     56.250000
std      25.617377
min      30.000000
5%       32.250000
10%      34.500000
20%      39.000000
50%      52.500000
max      90.000000
dtype: float64

## Duplicates

In [216]:
## Add a second entry labelled 'M' whose value (60) repeats the value at 'W'.
## pd.concat replaces Series.append, which was removed in pandas 2.0.
temp3 = pd.concat([temp2, pd.Series([60], index=['M'])])
temp3

M     30
T     45
W     60
Th    90
M     60
dtype: int64

In [222]:
temp3.duplicated()

M     False
T     False
W     False
Th    False
M      True
dtype: bool

In [218]:
temp3.duplicated().any()

True

In [219]:
temp3.duplicated().all()

False

In [225]:
temp3.drop_duplicates()

M     30
T     45
W     60
Th    90
dtype: int64

## NaN

In [228]:
temp6 = pd.Series([1, 3, None])
temp6

0    1.0
1    3.0
2    NaN
dtype: float64

In [230]:
pd.Series([None])

0    None
dtype: object

In [232]:
pd.Series([np.nan])

0   NaN
dtype: float64

In [234]:
## Extend temp3 with a real reading and a missing one; the None becomes NaN,
## so the combined Series is float64.
## pd.concat replaces Series.append, which was removed in pandas 2.0.
temp7 = pd.concat([temp3, pd.Series([100, None], index=['F', 'Sun'])])
temp7

M       30.0
T       45.0
W       60.0
Th      90.0
M       60.0
F      100.0
Sun      NaN
dtype: float64

In [235]:
len(temp7)

7

In [236]:
temp7.count()

6

In [237]:
temp7.mean()

64.166666666666671

In [239]:
temp7[~temp7.isnull()]

M      30.0
T      45.0
W      60.0
Th     90.0
M      60.0
F     100.0
dtype: float64

In [240]:
temp7.isnull().any()

True

In [241]:
temp7.notnull()

M       True
T       True
W       True
Th      True
M       True
F       True
Sun    False
dtype: bool

In [242]:
temp7.dropna()

M      30.0
T      45.0
W      60.0
Th     90.0
M      60.0
F     100.0
dtype: float64

In [244]:
temp7.fillna(0)

M       30.0
T       45.0
W       60.0
Th      90.0
M       60.0
F      100.0
Sun      0.0
dtype: float64

In [246]:
# temp7.fillna(method='ffill')  # forward-fill; the method name is 'ffill' (the original 'ffil' was a typo)
# NOTE(review): this line was annotated "Python 3 only" -- possibly a misreading of the error caused by the typo; confirm.