# Python Pandas Exercises
Obtained from https://www.machinelearningplus.com/python/101-pandas-exercises-python/

## 1. How to import pandas and check the version?

In [36]:
import pandas as pd
print('Version: ' + pd.__version__)

Version: 1.4.1


## 2. How to create a series from a list, numpy array and dict?

In [37]:
# Input
import numpy as np
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

# Solution
lst_ser = pd.Series(mylist)
arr_ser = pd.Series(myarr)
dic_ser = pd.Series(mydict)

## 3. How to convert the index of a series into a column of a dataframe?



In [40]:
# Input
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# Solution
ser.to_frame().reset_index(inplace=True)
ser.head()

a    0
b    1
c    2
e    3
d    4
dtype: int64

In [45]:
# Input
import numpy as np
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# Solution
df = pd.concat([ser1, ser2], axis = 1)
df.head()

Unnamed: 0,0,1
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


## 5. How to assign name to the series’ index?



In [47]:
# Input
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

# Solution
ser.name='alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

In [30]:
lst = ["1", "4", "0", "6", "9"]
def sort_lst(lst):
    lst_int = [int(i) for i in lst]
    lst_int.sort()
    return print(lst_int)

sort_lst(lst)

[0, 1, 4, 6, 9]


## 6. How to get the items of series A not present in series B?



In [51]:
# Input
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution
ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
dtype: int64

## 7. How to get the items not common to both series A and series B?

In [52]:
# Input
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution
ser_u = pd.Series(np.union1d(ser1, ser2))
ser_i = pd.Series(np.intersect1d(ser1, ser2))
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

## 8. How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

In [57]:
# Input
ser = pd.Series(np.random.normal(10, 5, 25))

# Solution
ser.quantile(0.25)
ser.quantile(0.5)
ser.quantile(0.75)
ser.quantile(1)

18.956274387478057

## 9. How to get frequency counts of unique items of a series?



In [60]:
# Input
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

# Solution
ser.value_counts()

a    8
g    7
d    3
e    3
h    3
f    3
b    2
c    1
dtype: int64

## 10. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [73]:
# Input 
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

# Solution
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
print('Top 2 Frequencies: ',ser.value_counts())



Top 2 Frequencies:  Other    5
4        4
1        3
dtype: int64


## 11. How to bin a numeric series to 10 groups of equal size?



In [79]:
# Input
ser = pd.Series(np.random.random(20))

# Solution
pd.qcut(ser,[0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
        labels = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']).head()

0     3rd
1     1st
2    10th
3     8th
4     7th
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

## 12. How to convert a numpy array to a dataframe of given shape? (L1)

In [85]:
# Input
ser = pd.Series(np.random.randint(1, 10, 35))

# Solution
df = pd.DataFrame(ser.values.reshape(7,5))
df.head()

Unnamed: 0,0,1,2,3,4
0,6,2,9,9,9
1,5,1,2,7,6
2,2,6,3,6,2
3,5,5,5,5,6
4,5,2,2,3,1


## 13. How to find the positions of numbers that are multiples of 3 from a series?

In [90]:
# Input
ser = pd.Series(np.random.randint(1, 10, 7))

# Solution
ser[ser%3 == 0].index

Int64Index([2, 3], dtype='int64')

## 14. How to extract items at given positions from a series



In [91]:
# Input
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

# Solution
ser[pos]

0     a
4     e
8     i
14    o
20    u
dtype: object

## 15. How to stack two series vertically and horizontally ?



In [None]:
# Input
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Solution
## Vertically
pd.concat([ser1, ser2])
## Horizontally
pd.DataFrame([ser1, ser2])

## 16. How to get the positions of items of series A in another series B?

In [93]:
# Input
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# Solution
ser1[ser1.isin(ser2)].index

Int64Index([0, 4, 5, 8], dtype='int64')

## 17. How to compute the mean squared error on a truth and predicted series?

In [94]:
# Input
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

# Solution
mse = (truth-pred)**2
print(mse)

0    0.420860
1    0.157039
2    0.000976
3    0.857471
4    0.266347
5    0.563214
6    0.708747
7    0.061519
8    0.177393
9    0.949940
dtype: float64


## 18. How to convert the first character of each element in a series to uppercase?

In [96]:
# Input
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution
ser.map(lambda x: x.title()) 

0     How
1      To
2    Kick
3    Ass?
dtype: object

## 19. How to calculate the number of characters in each word in a series?

In [97]:
# Input
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution
ser.map(lambda x: len(x))

0    3
1    2
2    4
3    4
dtype: int64

## 20. How to compute difference of differences between consequtive numbers of a series?

In [99]:
# Input
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

# Solution
list(ser.diff())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]

## 21. How to convert a series of date-strings to a timeseries?

In [103]:
# Input
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Solution
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

## 22. How to get the day of month, week number, day of year and day of week from a series of date strings?

In [None]:
# Input
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Solution (From machinelearningplus)
from dateutil.parser import parse
ser_ts = ser.map(lambda x: parse(x))

# day of month
print("Date: ", ser_ts.dt.day.tolist())

# week number
print("Week number: ", ser_ts.dt.weekofyear.tolist())

# day of year
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())

# day of week
print("Day of week: ", ser_ts.dt.weekday_name.tolist())

## 23. How to convert year-month string to dates corresponding to the 4th day of the month?

In [106]:
# Input
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

# Solution
from dateutil.parser import parse
ser.map(lambda x: parse('04 ' + x))

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

## 24. How to filter words that contain atleast 2 vowels from a series?

In [114]:
# Input
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# Solution
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i,0) for i in list('aeiou')]) >= 2)
ser[mask]

0     True
1     True
2    False
3    False
4     True
dtype: bool

## 25. How to filter valid emails from a series?



In [115]:
# Input
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# Solution
import re
mask = emails.map(lambda x: bool(re.match(pattern, x)))
emails[mask]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

## 26. How to get the mean of a series grouped by another series?

In [117]:
# Input
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print(weights.tolist())
print(fruit.tolist())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['banana', 'banana', 'carrot', 'banana', 'banana', 'banana', 'banana', 'apple', 'banana', 'banana']


In [120]:
weights.groupby(fruit).mean()

apple     8.0
banana    5.5
carrot    3.0
dtype: float64

## 27. How to compute the euclidean distance between two series?

In [124]:
# Input
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

round(np.sqrt(sum((p-q)**2)),2)

18.17

## 28. How to find all the local maxima (or peaks) in a numeric series?

In [127]:
# Input
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Solution
ser[ser == max(ser)].index

Int64Index([1, 5], dtype='int64')

## 29. How to replace missing spaces in a string with the least frequent character?

In [130]:
# Input
my_str = 'dbc deb abed gade'

# Solution
ser = pd.Series(list(my_str))
freq = ser.value_counts()
print(freq)
l_freq = freq.dropna().index[-1]
my_str.replace(" ", l_freq)

d    4
b    3
     3
e    3
a    2
c    1
g    1
dtype: int64


'dbcgdebgabedggade'

## 30. How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values?

