In [2]:
import pandas as pd
import numpy as np

In [3]:
# How to keep only the top 2 most frequent values as they are and replace
# everything else with 'Other' (step 1: build a reproducible sample series).

# Seed NumPy's global generator. Constructing np.random.RandomState(100) and
# discarding the instance seeds nothing, so the draw below would change on
# every run.
# NOTE: the output recorded beneath this cell predates the seeding fix;
# re-run the cell to refresh it.
np.random.seed(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

ser

0     4
1     3
2     4
3     1
4     2
5     2
6     1
7     4
8     3
9     1
10    3
11    1
dtype: int32

In [4]:
# Keep the two most frequent values and label everything else 'Other'.
# value_counts() orders values by descending frequency, so its first two
# index entries are the values to keep.
top_two = ser.value_counts().index[:2]

# Cast to object and build the result with where() instead of assigning a
# string into the integer series: item assignment that forces a dtype upcast
# is deprecated in recent pandas releases.
ser = ser.astype(object).where(ser.isin(top_two), 'Other')

ser

0         4
1     Other
2         4
3         1
4     Other
5     Other
6         1
7         4
8     Other
9         1
10    Other
11        1
dtype: object

In [5]:
# How to bin a numeric series into 10 groups of equal size
# (step 1: draw 20 random floats to work with).

samples = np.random.random(20)
ser = pd.Series(samples)
print(ser.head())

0    0.103742
1    0.499755
2    0.128434
3    0.521615
4    0.235161
dtype: float64


In [6]:
# Quantile edges (0%, 10%, ..., 100%) and the label assigned to each of the
# ten resulting bins, in order.
decile_edges = [0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1]
decile_labels = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']

binned = pd.qcut(ser, q=decile_edges, labels=decile_labels)
binned.head()

0    1st
1    6th
2    2nd
3    7th
4    4th
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]

In [8]:
# How to reshape the values of a series into a dataframe of a given shape

ser = pd.Series(np.random.randint(1, 10, 35))

# Lay the 35 values out as 7 rows by 5 columns.
grid = ser.values.reshape(7, 5)
df = pd.DataFrame(grid)
print(df)

   0  1  2  3  4
0  4  8  9  6  6
1  7  4  5  8  1
2  7  7  5  2  4
3  8  7  5  2  6
4  7  3  9  5  3
5  3  7  4  6  4
6  6  6  8  4  8


In [11]:
# How to find the positions of numbers that are multiples of 3 in a series
# (step 1: draw 7 random integers from 1 to 9).

draws = np.random.randint(1, 10, 7)
ser = pd.Series(draws)
ser

0    9
1    3
2    7
3    3
4    1
5    6
6    5
dtype: int32

In [12]:
# Positions of the multiples of 3, one row per match.
# The boolean mask is converted to a plain NumPy array first: handing a pandas
# Series straight to np.argwhere depends on how NumPy wraps non-array inputs
# and can raise a ValueError on some pandas/NumPy version combinations.
np.argwhere((ser % 3 == 0).to_numpy())

array([[0],
       [1],
       [3],
       [5]], dtype=int64)

In [13]:
# How to extract items at given positions from a series

letters = list('abcdefghijklmnopqrstuvwxyz')
ser = pd.Series(letters)
pos = [0, 4, 8, 14, 20]

# take() selects elements by integer position.
ser.take(pos)

0     a
4     e
8     i
14    o
20    u
dtype: object

In [14]:
# How to stack two series vertically and horizontally

ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Vertical: pd.concat places ser2 below ser1 and keeps each series' own index
# labels. Series.append, used here previously, was removed in pandas 2.0.
stacked_vertical = pd.concat([ser1, ser2])
stacked_vertical

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

In [15]:
# Horizontal: place the two series side by side as the columns of a dataframe.
columns = [ser1, ser2]
df = pd.concat(columns, axis=1)
print(df)

   0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e


In [16]:
# How to get the positions of the items of ser2 within ser1

ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# Solution 1: for each wanted value, take the first position where ser1 equals it
[np.where(ser1 == wanted)[0].tolist()[0] for wanted in ser2]

# Solution 2: look each wanted value up in an Index built from ser1
lookup = pd.Index(ser1)
[lookup.get_loc(wanted) for wanted in ser2]

[5, 4, 0, 8]

In [18]:
# How to compute the mean squared error between a truth and a predicted series

truth = pd.Series(range(10))
# Predictions are the values 0..9 plus random noise.
noise = np.random.random(10)
pred = pd.Series(range(10)) + noise

squared_error = (truth - pred) ** 2
np.mean(squared_error)

0.2877376750217

In [20]:
# How to convert the first character of each element in a series to uppercase

ser = pd.Series(['how', 'to', 'check', 'element'])

# Solution 1: title-case every element through map
ser.map(str.title)

# Solution 2: upper-case the first character and append the rest unchanged
ser.map(lambda word: word[0].upper() + word[1:])

# Solution 3: title-case the elements in plain Python and wrap them in a new series
pd.Series(list(map(str.title, ser)))

0        How
1         To
2      Check
3    Element
dtype: object

In [None]:
# How to calculate the number of characters in each word in a series

In [21]:
ser = pd.Series(['how', 'to', 'check', 'element'])

# Number of characters in each word: the built-in len is handed to map directly.
ser.map(len)

0    3
1    2
2    5
3    7
dtype: int64

In [22]:
# How to compute the difference of differences between consecutive numbers of a series

ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

# First differences, then the differences of those differences.
first_diff = ser.diff()
second_diff = first_diff.diff()

print(first_diff.tolist())
print(second_diff.tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]
