In [1]:
import numpy as np
import pandas as pd

In [2]:
# How to create a series from a list, numpy array and dict?
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

pd.Series(mylist)
pd.Series(myarr)
pd.Series(mydict)
print('Done')

Done


In [3]:
# How to convert the index of a series into a column of a dataframe?

ser = pd.Series(mydict)

pd.DataFrame(ser).reset_index()
print('Done')


Done


In [4]:
# How to combine many series to form a dataframe?

ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

pd.concat([ser1,ser2],axis = 1)
print('Done')


Done


In [5]:
# How to assign name to the series’ index?
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser.rename_axis('name')
print('Done')

Done


In [6]:
# How to get the items of series A not present in series B?

ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

ser1[~ser1.isin(ser2)]


0    1
1    2
2    3
dtype: int64

In [7]:
# How to get the items not common to both series A and series B?

ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

ser_u = pd.Series(np.union1d(ser1, ser2))  # union
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersect
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [8]:
# How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

ser = pd.Series(np.random.normal(10, 5, 25))
ser.describe()
np.percentile(ser, q=[0, 25, 50, 75, 100])


array([-0.55414741,  9.68496924, 11.4073868 , 14.14013674, 19.97604742])

In [9]:
# How to get frequency counts of unique items of a series?

ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
ser.value_counts()


e    6
b    5
h    4
g    4
f    3
a    3
d    3
c    2
dtype: int64

In [10]:
# How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

ser = pd.Series(np.random.RandomState(100).randint(1, 5, [12]))

print(ser.value_counts())

topTwo = ser.value_counts().head(2).index

print("Top 2 Frequent numbers: -> ",topTwo )

# getting numbers that are not top two for assigning 'others'

ser = pd.Series(np.where(ser.isin(topTwo), ser, 'Other'))
# OR 
# ser[~ser.isin(topTwo)] = 'Other'

ser

4    4
1    4
3    3
2    1
dtype: int64
Top 2 Frequent numbers: ->  Int64Index([4, 1], dtype='int64')


0         1
1         1
2         4
3         4
4         4
5         4
6         1
7     Other
8     Other
9         1
10    Other
11    Other
dtype: object

In [11]:
# How to bin a numeric series to 10 groups of equal size?
ser = pd.Series(np.random.randint(1,100,20))
print(ser)
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1],labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']).head()

0     78
1     97
2     33
3      4
4     24
5     38
6     72
7     54
8     95
9     78
10    17
11    24
12     3
13    63
14    32
15     6
16     7
17    81
18    28
19    65
dtype: int64


0     8th
1    10th
2     5th
3     1st
4     3rd
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

In [12]:
# How to convert a numpy array to a dataframe of given shape? 
# Reshape the series ser into a dataframe with 7 rows and 5 columns

ser = pd.Series(np.random.randint(1, 10, 35))

df = pd.DataFrame(ser.values.reshape(7,5))
print(df)

   0  1  2  3  4
0  6  5  9  6  6
1  4  7  1  5  4
2  8  1  4  8  8
3  1  2  2  5  9
4  1  1  4  7  6
5  4  7  5  2  7
6  7  9  1  6  5


In [13]:
# How to find the positions of numbers that are multiples of 3 from a series?
ser = pd.Series([6,8,6,7,6,2,4])

print(ser[ser.apply(lambda x : x % 3 == 0)].index) #shape is 1,3
# or

print(np.argwhere(ser.values % 3 == 0)) # shape is 3,1



Int64Index([0, 2, 4], dtype='int64')
[[0]
 [2]
 [4]]


In [14]:
#  How to extract items at given positions from a series

ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

ser[pos]

0     a
4     e
8     i
14    o
20    u
dtype: object

In [15]:
# How to stack two series vertically and horizontally ?

ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))


pd.concat([ser1,ser2],axis = 0 ) # vertically

pd.concat([ser1,ser2],axis = 1 ) # horizontally

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


In [16]:
# How to get the positions of items of series A in another series B?

ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

#sol 1
list(ser1[ser1.isin(ser2)].index)
#sol 2 
[pd.Index(ser1).get_loc(i) for i in ser2]



[5, 4, 0, 8]

In [17]:
# How to compute the mean squared error on a truth and predicted series?
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)
np.mean((truth-pred)**2)

0.2562214168772538

In [18]:
# How to convert the first character of each element in a series to uppercase?
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
# ser.apply(str.title)
ser.map(lambda t : t.title())


0     How
1      To
2    Kick
3    Ass?
dtype: object

In [19]:
# How to calculate the number of characters in each word in a series?
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
ser.apply(len)

0    3
1    2
2    4
3    4
dtype: int64

In [20]:
# How to compute difference of differences between consequtive numbers of a series?
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])
print(ser.diff().tolist())


[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]


In [21]:
# How to convert a series of date-strings to a timeseries?

ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

pd.to_datetime(ser)
# or
from dateutil.parser import parse
ser.map(lambda x: parse(x))


0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [22]:
# How to get the day of month, week number, day of year and day of week from a series of date strings?
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

ser = pd.to_datetime(ser)
ser.dt.day.values
ser.dt.weekofyear.values
ser.dt.dayofyear.values




  


array([  1,  33,  63,  94, 125, 157])

In [23]:
# How to filter words that contain atleast 2 vowels from a series?

ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])
v = ['a','e','i','o','u']

ser[ser.apply(lambda x:  sum( c in v for c in list(x.lower()))>= 2)]

ser

0     Apple
1    Orange
2      Plan
3    Python
4     Money
dtype: object

In [24]:
# How to filter valid emails from a series?
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

import re
mask = emails.map(lambda x: bool(re.match(pattern, x)))
emails[mask]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

In [25]:
x = [[1,2],[3,4]]
y = [[2,4],[6,8]]

np.concatenate([x,y],axis =1)

array([[1, 2, 2, 4],
       [3, 4, 6, 8]])

In [26]:
x = [[1,2],[3,4]]
y = [[2,4],[6,8]]

np.concatenate([x,y],axis =0)

array([[1, 2],
       [3, 4],
       [2, 4],
       [6, 8]])

In [27]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[1, 2, 3])

pd.concat([ser1, ser2])



1    A
2    B
3    C
1    D
2    E
3    F
dtype: object

In [28]:
def make_df(cols, ind):
 """Quickly make a DataFrame"""
 data = {c: [str(c) + str(i) for i in ind]
 for c in cols}
 return pd.DataFrame(data, ind)

In [29]:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])

In [30]:
y.index = x.index
x


Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [31]:
y

Unnamed: 0,A,B
0,A2,B2
1,A3,B3


In [32]:
z = pd.concat([x, y],axis=1)
z

Unnamed: 0,A,B,A.1,B.1
0,A0,B0,A2,B2
1,A1,B1,A3,B3


In [33]:
z.loc[:,'A']

Unnamed: 0,A,A.1
0,A0,A2
1,A1,A3


In [34]:

ser = pd.Series(['shredder','shred','shrddr','shrdr'])
ser

0    shredder
1       shred
2      shrddr
3       shrdr
dtype: object

In [35]:

ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

pd.to_datetime(ser)
# or
from dateutil.parser import parse
ser.map(lambda x: parse(x))

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]