### Pandas

In [6]:
# Libraries used by the cells below.
import pandas as pd
import numpy as np
# Show which pandas version is running in this kernel.
print(pd.__version__)

0.25.3


In [4]:
# Create a Series from a list, a NumPy array and a dict.
# numpy is already imported as `np` by the notebook's import cell, so it is
# not imported a second time here.

mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

ser1 = pd.Series(mylist)   # letters as values, default integer index
ser2 = pd.Series(myarr)    # integers 0..25 as values
ser3 = pd.Series(mydict)   # dict keys become the index, dict values the data
print(ser1.head())
print(ser2.head())
print(ser3.head())

0    a
1    b
2    c
3    e
4    d
dtype: object
0    0
1    1
2    2
3    3
4    4
dtype: int32
a    0
b    1
c    2
e    3
d    4
dtype: int64


In [5]:
# Turn the index of a Series into an ordinary DataFrame column.

letters = list('abcedfghijklmnopqrstuvwxyz')
ser = pd.Series(dict(zip(letters, np.arange(26))))

# reset_index() on a Series returns a DataFrame: the old index lands in a
# column named 'index' and the values keep the default column label 0.
df = ser.reset_index()
print(df.head())

  index  0
0     a  0
1     b  1
2     c  2
3     e  3
4     d  4


In [11]:
# Combine several Series side by side into a single DataFrame.

pieces = [
    pd.Series(list('abcedfghijklmnopqrstuvwxyz')),
    pd.Series(np.arange(26)),
]
ser1, ser2 = pieces

# axis=1 puts each Series in its own column, aligned on the shared index.
df = pd.concat(pieces, axis=1)
print(df)

    0   1
0   a   0
1   b   1
2   c   2
3   e   3
4   d   4
5   f   5
6   g   6
7   h   7
8   i   8
9   j   9
10  k  10
11  l  11
12  m  12
13  n  13
14  o  14
15  p  15
16  q  16
17  r  17
18  s  18
19  t  19
20  u  20
21  v  21
22  w  22
23  x  23
24  y  24
25  z  25


In [12]:
# Give the Series a name. This sets `Series.name`; the index itself stays unnamed.

# rename() with a scalar returns the same data carrying the new name.
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz')).rename('alphabets')
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

In [13]:
# Items of ser1 that do not appear in ser2 (set difference A - B).

ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

also_in_ser2 = ser1.isin(ser2)   # True where a ser1 value occurs in ser2
ser1.loc[~also_in_ser2]

0    1
1    2
2    3
dtype: int64

In [14]:
# Items that occur in exactly one of the two Series (symmetric difference).
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

ser_u = pd.Series(np.union1d(ser1, ser2))      # sorted unique values of either Series
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # sorted values present in both
in_both = ser_u.isin(ser_i)
# Drop the shared values; the surviving rows keep their positions from ser_u.
ser_u.loc[~in_both]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [17]:
# Minimum, 25th percentile, median, 75th percentile and maximum of a numeric Series.

state = np.random.RandomState(100)   # seeded generator
ser = pd.Series(state.normal(loc=10, scale=5, size=25))

summary_points = [0, 25, 50, 75, 100]   # percentiles to report
np.percentile(ser, q=summary_points)

array([ 1.25117263,  7.70986507, 10.92259345, 13.36360403, 18.0949083 ])

In [18]:
# Count how often each distinct value occurs in a Series.

alphabet = np.array(list('abcdefgh'))
picks = np.random.randint(8, size=30)   # 30 random positions in 0..7
ser = pd.Series(alphabet[picks])
ser.value_counts()


e    7
a    4
c    4
g    4
h    3
b    3
d    3
f    2
dtype: int64

In [19]:
# Keep the two most frequent values as they are and replace everything else with 'Other'.

# Draw from the seeded generator itself. A bare `np.random.RandomState(100)`
# call only builds a generator object and throws it away; it does not seed
# the global `np.random` functions, so the data would differ on every run.
state = np.random.RandomState(100)
ser = pd.Series(state.randint(1, 5, [12]))

counts = ser.value_counts()   # ordered from most to least frequent
print("Top 2 Freq:", counts)
top2 = counts.index[:2]

# where() keeps the values where the mask is True and substitutes 'Other'
# elsewhere, returning a new Series instead of writing strings in place into
# an integer Series.
ser = ser.where(ser.isin(top2), 'Other')
ser

Top 2 Freq: 1    5
4    4
2    3
dtype: int64


0         4
1         1
2         1
3         1
4     Other
5         4
6     Other
7         1
8         4
9     Other
10        1
11        4
dtype: object

In [20]:
# Bin a numeric Series into 10 quantile-based groups (deciles).
ser = pd.Series(np.random.random(20))

decile_edges = [step / 10 for step in range(11)]   # 0.0, 0.1, ..., 1.0
decile_names = ['1st', '2nd', '3rd'] + ['{}th'.format(rank) for rank in range(4, 11)]
pd.qcut(ser, q=decile_edges, labels=decile_names).head()

0     9th
1    10th
2     5th
3     8th
4    10th
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]

In [21]:
# Reshape the 35 values of a Series into a DataFrame with 7 rows and 5 columns.
ser = pd.Series(np.random.randint(1, 10, 35))
grid = np.reshape(ser.values, (7, 5))   # filled row by row
df = pd.DataFrame(grid)
print(df)

   0  1  2  3  4
0  1  8  3  7  4
1  9  1  2  3  9
2  2  7  6  9  4
3  1  4  8  2  2
4  3  3  7  7  8
5  5  8  9  6  9
6  9  3  3  6  4


In [22]:
# Find the positions of the numbers in a Series that are multiples of 3.
# np.argwhere returns the indices of the non-zero (True) elements, one row per match.
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)

# Give NumPy a plain boolean array rather than the Series itself: calling
# np.argwhere directly on a Series goes through pandas' array-wrapping code,
# which is what emitted the warning in the recorded output and is not
# reliable across pandas versions.
is_multiple_of_3 = (ser % 3 == 0).to_numpy()
positions = np.argwhere(is_multiple_of_3)
positions

0    9
1    7
2    7
3    9
4    7
5    6
6    8
dtype: int32


  return getattr(obj, method)(*args, **kwds)


array([[0],
       [3],
       [5]], dtype=int64)

In [23]:
# Extract the items located at the given integer positions of a Series.
# .iloc selects purely by position and keeps the original index labels.
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]
ser.iloc[pos]

0     a
4     e
8     i
14    o
20    u
dtype: object

In [24]:
# Stack two Series vertically (end to end) and horizontally (side by side).

ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Vertical: concatenate along the default axis=0 and keep the result in a
# variable (previously the stacked Series was computed and then discarded).
# pd.concat is used because Series.append is deprecated in later pandas releases.
ser_vertical = pd.concat([ser1, ser2])

# Horizontal: axis=1 places each Series in its own column.
df = pd.concat([ser1, ser2], axis=1)
print(df)

   0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e


In [25]:
# For every item of ser2, find the position of its first occurrence in ser1.
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

haystack = ser1.values
# flatnonzero lists the positions where the comparison is True; [0] takes the first.
[int(np.flatnonzero(haystack == needle)[0]) for needle in ser2]

[5, 4, 0, 8]

In [26]:
# Mean squared error between a truth Series and a predicted Series.

truth = pd.Series(range(10))
noise = np.random.random(10)   # uniform noise in [0, 1)
pred = truth + noise

squared_error = (truth - pred) ** 2
squared_error.mean()

0.48340294894793123

In [27]:
# Convert only the first character of each element in a Series to uppercase.

ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# str.title() would also lowercase the remaining letters and capitalize every
# word inside an element; slicing touches nothing but the first character.
# word[:1] (rather than word[0]) keeps empty strings from raising.
capitalized = ser.map(lambda word: word[:1].upper() + word[1:])
capitalized

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [28]:
# Number of characters in each word of a Series.
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
# The built-in len is applied to each element directly; no lambda wrapper needed.
ser.map(len)

0    3
1    2
2    4
3    4
dtype: int64

In [29]:
# Differences, and differences of those differences, between consecutive numbers of a Series.

ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

first_diff = ser.diff()           # element 0 is NaN: it has no predecessor
second_diff = first_diff.diff()   # first two elements are NaN
print(first_diff.tolist())
print(second_diff.tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]
