In [2]:
import pandas as pd
import numpy as np


In [5]:
# How to convert a series of date-strings to a timeseries

ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

ser

pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [7]:
# How to get the day of month, week number, day of year and day of week from a series of date strings?

ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

from dateutil.parser import parse
#ser_ts = ser.map(lambda x: parse(x))

ser_ts =pd.to_datetime(ser)

# day of month
print("Date: ", ser_ts.dt.day.tolist())

# week number
print("Week number: ", ser_ts.dt.weekofyear.tolist())

# day of year
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())

# day of week
print("Day of week: ", ser_ts.dt.weekday_name.tolist())

Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']


In [13]:
# How to convert year-month string to dates corresponding to the 4th day of the month

ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

ser_ts =pd.to_datetime(ser)

ser_datestr = ser_ts.dt.year.astype('str') + '-' + ser_ts.dt.month.astype('str') + '-' + '04'
#(or)
ser_ts.dt.dayofyear.astype('str') + '-' + ser_ts.dt.month.astype('str') + '-' + '04'
ser_datestr


0    2010-1-04
1    2011-2-04
2    2012-3-04
dtype: object

In [15]:
# How to filter words that contain atleast 2 vowels from a series

ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) for i in list('aeiou')]) >= 2)
ser[mask]

0     Apple
1    Orange
4     Money
dtype: object

In [19]:
# How to filter valid emails from a series

emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# Solution 1 (as series of strings)
import re
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
mask = emails.map(lambda x: bool(re.match(pattern, x)))
emails[mask]

# Solution 2 (as series of list)
emails.str.findall(pattern, flags=re.IGNORECASE)

# Solution 3 (as list)
[x[0] for x in [re.findall(pattern, email) for email in emails] if len(x) > 0]

['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

In [22]:
# How to get the mean of a series grouped by another series

fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weight = pd.Series(np.linspace(1, 10, 10))
print(weight.tolist())
print(fruit.tolist())

weight.groupby(fruit).mean()

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['apple', 'carrot', 'carrot', 'apple', 'apple', 'apple', 'apple', 'apple', 'banana', 'apple']


apple     5.857143
banana    9.000000
carrot    2.500000
dtype: float64

In [25]:
#  How to compute the euclidean distance between two series

p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

# Solution 
sum((p - q)**2)**.5

# Solution (using func)
np.linalg.norm(p-q)

18.16590212458495

In [27]:
# How to find all the local maxima (or peaks) in a numeric series

ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

dd = np.diff(np.sign(np.diff(ser)))
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

array([1, 5, 7], dtype=int64)

In [28]:
# How to replace missing spaces in a string with the least frequent character

y_str = 'dbc deb abed gade'

# Solution
ser = pd.Series(list('dbc deb abed gade'))
freq = ser.value_counts()
print(freq)
least_freq = freq.dropna().index[-1]
"".join(ser.replace(' ', least_freq))

d    4
     3
e    3
b    3
a    2
g    1
c    1
dtype: int64


'dbccdebcabedcgade'

In [29]:
# How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values

ser = pd.Series(np.random.randint(1,10,10), pd.date_range('2000-01-01', periods=10, freq='W-SAT'))
ser

2000-01-01    6
2000-01-08    1
2000-01-15    3
2000-01-22    3
2000-01-29    8
2000-02-05    3
2000-02-12    5
2000-02-19    6
2000-02-26    7
2000-03-04    5
Freq: W-SAT, dtype: int32