https://www.machinelearningplus.com/python/101-pandas-exercises-python/

In [1]:
import numpy as np
import pandas as pd


# Create a pandas series from each of the items below: a list, numpy and a dict. -

In [5]:
mylist = list('dfkjgrthgfskldjnfgjnrltjng')
myarray = np.arange(26)
mydict = dict(zip(mylist, myarray))

series1 = pd.Series(mylist)
series2 = pd.Series(myarray)
series3 = pd.Series(mydict)

print(series3.head())


d    13
f    16
k    11
j    23
g    25
dtype: int64


# Convert the index of a series into a column of a dataframe.

In [8]:
# Input
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# Solution
df = ser.to_frame().reset_index()
print(df.head())

  index  0
0     a  0
1     b  1
2     c  2
3     e  3
4     d  4


How to combine many series to form a dataframe?


In [7]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

#solution 1
df = pd.DataFrame({'col1': ser1, 'col2': ser2})
print(df.head())



  col1  col2
0    a     0
1    b     1
2    c     2
3    e     3
4    d     4


In [12]:
#solution 2
df = pd.concat([ser1, ser2], axis=1)
df.head()

Unnamed: 0,0,1
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


# How to assign name to series' index?

In [13]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

#solution
ser.name = 'alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

# How to get the items of series A not present in series B?

In [16]:
#From ser1 remove items present in ser2. 
ser1 = pd.Series([1, 2, 3, 4, 5]) # 4 and 5 are removed since in ser2. 
ser2 = pd.Series([4, 5, 6, 7, 8])

#solution

ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
dtype: int64

# How to get the items not common to both series A and seried B?

In [18]:
#Get all items of ser1 and ser2 not common to both. 
ser1 = pd.Series([1, 2, 3, 4, 5])

#solution
ser_u = pd.Series(np.union1d(ser1, ser2)) #union
ser_i = pd.Series(np.intersect1d(ser1, ser2))
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

# How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

# How to get frequency counts of unique items of series?

In [22]:
#Calculate the frequency counts of each unique value in series. 
ser = pd.Series(np.take(list('abcdefg'), np.random.randint(8, size=30)))

#solution
ser.value_counts()

IndexError: index 7 is out of bounds for axis 0 with size 7

# How to keep only top 2 most frequent values as it is and replace everything else as 'Other'?

In [3]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))
ser

0     2
1     1
2     3
3     4
4     1
5     4
6     3
7     4
8     3
9     3
10    4
11    2
dtype: int64

In [5]:
#solution
print("Top 2 Freq:", ser.value_counts())
ser[-ser.isin(ser.value_counts().index[:2])] = "Other"
ser

Top 2 Freq: 4    4
3    4
2    2
1    2
dtype: int64


0     Other
1     Other
2         3
3         4
4     Other
5         4
6         3
7         4
8         3
9         3
10        4
11    Other
dtype: object

# How to bin a numeric series to 10 groups of equal size?

In [7]:
ser = pd.Series(np.random.random(20))
ser

0     0.576419
1     0.581845
2     0.392901
3     0.707007
4     0.535363
5     0.305515
6     0.902075
7     0.644242
8     0.598390
9     0.029701
10    0.160759
11    0.120439
12    0.984369
13    0.314675
14    0.719733
15    0.500711
16    0.547039
17    0.984582
18    0.112199
19    0.052107
dtype: float64

In [14]:
#Solution
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1],
        labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th'])


0      6th
1      7th
2      4th
3      8th
4      5th
5      3rd
6      9th
7      8th
8      7th
9      1st
10     3rd
11     2nd
12    10th
13     4th
14     9th
15     5th
16     6th
17    10th
18     2nd
19     1st
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]

# How to convert a numpy array to dataframe of given shape?

In [24]:
#reshape the series ser into a dataframe with 7 rows and 5 columns. 
ser = pd.Series(np.random.randint(1, 10, 35)).head()

ValueError: cannot reshape array of size 5 into shape (7,5)

In [23]:
#Solution
df = pd.DataFrame(ser.values.reshape(7, 5))
print(df)


ValueError: cannot reshape array of size 5 into shape (7,5)

# How to find the position of number that are multiples of 3 from a series?


In [26]:
ser = pd.Series(np.random.randint(1, 10, 7))
ser

0    6
1    8
2    9
3    4
4    2
5    1
6    8
dtype: int64

In [28]:
#solution
print(ser)
np.argwhere(ser % 3==0)

0    6
1    8
2    9
3    4
4    2
5    1
6    8
dtype: int64


array([[0],
       [2]])