In [9]:
import pandas
## reference: https://www.machinelearningplus.com/python/101-pandas-exercises-python/
## pls check the original website
## thanks to the author

### Question 1. How to import pandas and check version?


In [10]:
print(pandas.__version__)

0.24.2


In [13]:
import pandas as pd
pd.show_versions(as_json=False)


INSTALLED VERSIONS
------------------
commit: None
python: 3.7.3.final.0
python-bits: 64
OS: Windows
OS-release: 10
machine: AMD64
processor: Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None

pandas: 0.24.2
pytest: 5.0.1
pip: 19.1.1
setuptools: 41.0.1
Cython: 0.29.12
numpy: 1.16.4
scipy: 1.2.1
pyarrow: None
xarray: None
IPython: 7.6.1
sphinx: 2.1.2
patsy: 0.5.1
dateutil: 2.8.0
pytz: 2019.1
blosc: None
bottleneck: 1.2.1
tables: 3.5.2
numexpr: 2.6.9
feather: None
matplotlib: 3.1.0
openpyxl: 2.6.2
xlrd: 1.2.0
xlwt: 1.3.0
xlsxwriter: 1.1.8
lxml.etree: 4.3.4
bs4: 4.7.1
html5lib: 1.0.1
sqlalchemy: 1.3.5
pymysql: None
psycopg2: None
jinja2: 2.10.1
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: 0.8.0
gcsfs: None


In [17]:
pip show pandas

Name: pandas
Version: 0.24.2
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: http://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: c:\users\ruire\appdata\local\continuum\anaconda3\lib\site-packages
Requires: pytz, numpy, python-dateutil
Required-by: statsmodels, seaborn, pandas-datareader
Note: you may need to restart the kernel to use updated packages.


### Question 2. How to create a series from a list, numpy array and dict?

In [18]:
import numpy as np

In [19]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

In [23]:
mydict

{'a': 0,
 'b': 1,
 'c': 2,
 'e': 3,
 'd': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25}

In [24]:
# Answer
df = pd.DataFrame(index=mydict.keys())

In [26]:
df['value'] = mydict.values()

In [28]:
# Solution
ser1 = pd.Series(mylist)
ser2 = pd.Series(myarr)
ser3 = pd.Series(mydict)

### Question 3. How to convert the index of a series into a column of a dataframe?

- Convert the series ser into a dataframe with its index as another column on the dataframe

In [30]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

In [37]:
# This will not save the index as another column
df = pd.DataFrame(ser)

In [38]:
df

Unnamed: 0,0
a,0
b,1
c,2
e,3
d,4
f,5
g,6
h,7
i,8
j,9


In [34]:
df2 = ser.to_frame().reset_index()

In [39]:
df2

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


### Question 4: How to combine many series to form a dataframe?


In [40]:
import numpy as np
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

In [85]:
# Solution 1 --> add series and keys
df1 = pd.concat([ser1, ser2], axis = 1, keys = ['col1', 'col2'], names=['Index'])

In [86]:
# Solution 2 --> we can also use dictionary to construct dataframe
df2 = pd.DataFrame({'col1': ser1, 'col2': ser2})

In [87]:
df3 = pd.concat([ser1, ser2], axis=1, ignore_index=True)

### Question 5: How to assign name to the series index?


In [99]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

In [104]:
# Answer
ser.columns = 'alphabets'

In [108]:
ser.name = 'alphabets'

In [112]:
ser.name = 'b'

#### Notice
- only pandas DataFrame has the function of rename
- df.rename(index = {0 : 'a'}, columns = {1: 'b'})

### Question 6. How to get the items of series A not present in series B?

In [128]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [130]:
# Answer
ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
dtype: int64

In [131]:
~ser1.isin(ser2)

0     True
1     True
2     True
3    False
4    False
dtype: bool

### Question 7. How to get the items not common to both series A and series B?

In [129]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

0    4
1    5
2    6
3    7
4    8
dtype: int64

In [145]:
# answer  --> we use the union and intersection style function
ser_u = pd.Series(np.union1d(ser1, ser2))

In [146]:
ser_i = pd.Series(np.intersect1d(ser1, ser2))
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [150]:
# but we can use another function--> the common element
from functools import reduce
reduce(np.intersect1d, [ser1], [ser2])

array([4, 5], dtype=int64)

### Question 8. How to get the minimum, 25th percentile, median, 75th, and max of a numeric series? 

In [147]:
ser = pd.Series(np.random.normal(10, 5, 25))

In [153]:
# answer 1: we can use describe method
ser.describe()

count    25.000000
mean      9.222386
std       3.987396
min       3.130906
25%       5.674258
50%       9.466771
75%      11.950334
max      17.861904
dtype: float64

In [154]:
# answer 2: 
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([ 3.13090625,  5.67425771,  9.46677069, 11.95033411, 17.86190381])

### Question 9. How to get frequency counts of unique items of a series?

In [155]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

In [164]:
ser.value_counts()

g    7
d    6
a    5
e    4
f    4
c    2
b    1
h    1
dtype: int64

### Question 10. How to keep only top 2 most frequent values as it is and replace everything else as 'other'?

In [165]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

In [172]:
# Answer
print('Top 2 Freq:', sv)

array([3, 2, 2, 2, 4, 3, 2, 2, 2, 3, 1, 1])