In [1]:
import numpy as np
import pandas as pd

#### 21. How to convert a series of date-strings to a timeseries?

In [2]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

In [3]:
ser.astype(np.datetime64)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [4]:
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

#### 22. How to get the day of month, week number, day of year and day of week from a series of date strings?

In [5]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
from dateutil.parser import parse
df = ser.map(lambda d: parse(d))
df.head()

0   2010-01-01
1   2011-02-02
2   2012-03-03
3   2013-04-04
4   2014-05-05
dtype: datetime64[ns]

In [6]:
print("Date:", df.dt.day.tolist())
print("Week number: ", df.dt.week.tolist())
print("Day num of year: ", df.dt.dayofyear.tolist())
print("Day of week: ", df.dt.weekday_name.tolist())

Date: [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day num of year:  [1, 33, 63, 94, 125, 157]


  print("Week number: ", df.dt.week.tolist())


AttributeError: 'DatetimeProperties' object has no attribute 'weekday_name'

#### 23. How to convert year-month string to dates corresponding to the 4th day of the month?`m

In [7]:
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
df = ser.map(lambda d: parse("04 " + d))
df

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

In [8]:
df = ser.map(lambda d: parse(d))
df = df.dt.year.astype('str') + "-"+ df.dt.month.astype('str')+ "-" +"04"
[parse(d).strftime("%Y-%m-%d") for d in df]

['2010-01-04', '2011-02-04', '2012-03-04']

#### 24. How to filter words that contain atleast 2 vowels from a series?

In [9]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

In [10]:
def at_least_two_vowels(word:str):
    counter = 0
    word = word.lower()
    vowels = ['a','e','i','o','u']
    for c in word:
        if c in vowels:
            counter+=1
    return counter>1



In [11]:
ser[ser.map(at_least_two_vowels)]

0     Apple
1    Orange
4     Money
dtype: object

In [12]:
from collections import Counter
ser[ser.map(lambda x: sum([Counter(x.lower()).get(i,0) for i in "aeiou"])>=2)]

0     Apple
1    Orange
4     Money
dtype: object

#### 25. How to filter valid emails from a series?

In [13]:
import re

In [14]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
emails[emails.map(lambda e: re.match(pattern=pattern,string=e) !=None)]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

In [15]:
help(re.match)

Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a Match object, or None if no match was found.



In [16]:
emails.str.findall(pat=pattern, flags=re.IGNORECASE)

0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object

#### 26. How to get the mean of a series grouped by another series?

In [17]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print(weights.tolist())
print(fruit.tolist())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['banana', 'banana', 'apple', 'banana', 'carrot', 'apple', 'apple', 'apple', 'apple', 'carrot']


In [18]:
weights.index = fruit

In [19]:
weights.groupby(weights.index).mean()

apple     6.600000
banana    2.333333
carrot    7.500000
dtype: float64

In [20]:
weights.groupby(fruit).mean()

Series([], dtype: float64)

#### 27. How to compute the euclidean distance between two series?

In [21]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

In [22]:
np.linalg.norm(p-q)

18.16590212458495

#### 28. How to find all the local maxima (or peaks) in a numeric series?

In [23]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
ser[ser[::-1].diff()>0]

1    10
5    10
7     7
dtype: int64

In [24]:
diff = np.diff(ser)
diff

array([ 8, -7,  1,  5,  1, -8,  5, -4], dtype=int64)

In [25]:
dd = np.diff(np.sign(diff))

In [26]:
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

array([1, 5, 7], dtype=int64)

#### 29. How to replace missing spaces in a string with the least frequent character?

In [27]:
my_str = 'dbc deb abed gade'

In [28]:
ser = pd.Series([c for c in my_str])
ser

0     d
1     b
2     c
3      
4     d
5     e
6     b
7      
8     a
9     b
10    e
11    d
12     
13    g
14    a
15    d
16    e
dtype: object

In [29]:
ser.value_counts().sort_values(ascending=False)

d    4
     3
b    3
e    3
a    2
g    1
c    1
dtype: int64

### 30. How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values?

In [30]:
from datetime import datetime

In [69]:
df = pd.DataFrame(columns=['Date','Numbers'])
df['Date'] = pd.date_range('2000-01-01',periods=10,freq='168h')
df['Numbers'] = df['Numbers'].map(lambda a: np.random.randint(10))
df

Unnamed: 0,Date,Numbers
0,2000-01-01,4
1,2000-01-08,5
2,2000-01-15,2
3,2000-01-22,3
4,2000-01-29,9
5,2000-02-05,2
6,2000-02-12,3
7,2000-02-19,0
8,2000-02-26,6
9,2000-03-04,6


In [70]:
pd.DataFrame(np.random.randint(1,10,10),pd.date_range('2000-01-01',periods=10,freq='168h'))

Unnamed: 0,0
2000-01-01,2
2000-01-08,4
2000-01-15,2
2000-01-22,2
2000-01-29,5
2000-02-05,2
2000-02-12,6
2000-02-19,1
2000-02-26,4
2000-03-04,8


In [63]:
24*7

168