In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from datetime import datetime
import IPython.display 

### Cheat sheet

In [8]:
# Embed the pandas cheat sheet PDF inline so it can be read next to the examples.
IPython.display.IFrame(
    src="https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf",
    width="800",
    height="600",
)

### Series

#### Creating series from array

In [12]:

# An empty Series. The dtype is passed explicitly so the result is the same on
# every pandas version (older releases defaulted to float64 and emitted a warning).
ser = pd.Series(dtype=object)
print("Pandas Series: ", ser)

# Build a Series from a 1-D NumPy array; no index is given, so pandas attaches
# the default positional labels 0..n-1.
data = np.array(['g', 'e', 'e', 'k', 's'])
ser = pd.Series(data)
print("Pandas Series:\n", ser)

Pandas Series:  Series([], dtype: object)
Pandas Series:
 0    g
1    e
2    e
3    k
4    s
dtype: object


In [56]:
# .index holds the row labels (the default positional labels here);
# .values holds the underlying data as an array.
(ser.index, ser.values)

(RangeIndex(start=0, stop=5, step=1),
 array(['g', 'e', 'e', 'k', 's'], dtype=object))

#### Create series from dict

In [55]:

# Postal codes mapped to district names. When a dict is passed to pd.Series,
# the keys become the index labels and the values become the data.
zip_codes = {
    "100": "中正區",
    "103": "大同區",
    "104": "中山區",
    "105": "松山區",
    "106": "大安區",
    "108": "萬華區",
    "110": "信義區",
    "111": "士林區",
    "112": "北投區",
    "114": "內湖區",
    "115": "南港區",
    "116": "文山區",
}
code = pd.Series(zip_codes)
code

100    中正區
103    大同區
104    中山區
105    松山區
106    大安區
108    萬華區
110    信義區
111    士林區
112    北投區
114    內湖區
115    南港區
116    文山區
dtype: object

In [15]:
# The dict keys show up as the (string) index, the dict values as the data array.
(code.index, code.values)

(Index(['100', '103', '104', '105', '106', '108', '110', '111', '112', '114',
        '115', '116'],
       dtype='object'),
 array(['中正區', '大同區', '中山區', '松山區', '大安區', '萬華區', '信義區', '士林區', '北投區',
        '內湖區', '南港區', '文山區'], dtype=object))

In [16]:
# Positional access goes through .iloc: a bare integer key on a Series with
# string labels relies on a deprecated positional fallback of Series[...].
# Label access keeps using plain [] (or .loc).
code.iloc[2], code['104'], code.iloc[2] == code['104']  # access via position or via label

('中山區', '中山區', True)

In [17]:
# Positional slice (end exclusive) and a single positional element, both via
# .iloc so the lookup never depends on the deprecated integer-key fallback.
code.iloc[2:8], code.iloc[7]

(104    中山區
 105    松山區
 106    大安區
 108    萬華區
 110    信義區
 111    士林區
 dtype: object,
 '士林區')

In [18]:
# head(-1) keeps every row except the last one; head(2) keeps the first two rows.
(code.head(-1), code.head(2))

(100    中正區
 103    大同區
 104    中山區
 105    松山區
 106    大安區
 108    萬華區
 110    信義區
 111    士林區
 112    北投區
 114    內湖區
 115    南港區
 dtype: object,
 100    中正區
 103    大同區
 dtype: object)

#### Series operation

In [19]:
# Labels are given as a list so that the label/value pairing is deterministic.
# A set has no defined order, so building the index from a set paired labels
# with values arbitrarily (and differently from run to run).
index = ["godtone", "peko", "kuuga", "miko"]
data_s1 = pd.Series([1, 0.2, "r", 1138918238], index=index)
data_s2 = pd.Series([1234, 0.2, 3.240201, 21.12459], index=index)

# Arithmetic between Series is element-wise and aligned on labels. data_s1 has
# object dtype, so "+" uses each element's own "+" (e.g. "r" + "r" -> "rr").
data_s1 + data_s1, "------------", data_s2 - data_s2, "---------------", data_s2 * data_s2

(godtone             2
 peko              0.4
 kuuga              rr
 miko       2277836476
 dtype: object,
 '------------',
 godtone    0.0
 peko       0.0
 kuuga      0.0
 miko       0.0
 dtype: float64,
 '---------------',
 godtone    1.522756e+06
 peko       4.000000e-02
 kuuga      1.049890e+01
 miko       4.462483e+02
 dtype: float64)

One can refer to https://www.geeksforgeeks.org/python-pandas-series/?ref=lbp for more operations

#### Misc property

In [20]:
# Convert the Series values to a plain Python list (the index labels are dropped).
data_s1.tolist()

[1, 0.2, 'r', 1138918238]

In [21]:
# A Series is mutable: assigning through a label overwrites the stored value
# in place, and reading the same label back returns the new value.
data_s1["peko"]=1
data_s1["peko"]

1

### Data frame

In [22]:
# Calling the DataFrame constructor with no arguments creates an empty frame;
# its printed repr reports no columns and no index entries.
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


#### DataFrame from a dict of ndarrays / lists

In [43]:
# Column-oriented input: every dict key becomes a column label and every list
# supplies that column's values, one entry per row.
data_dict = {
    "name": ["peko", "miko", "suiesei", "godtone"],
    "age": [110, 35, 4, 50],
    "weight": [90, 40, 80, 87],
}

# Build the frame; no index is given, so rows are labelled 0..n-1.
df = pd.DataFrame(data=data_dict)

# index -- row labels, keys -- column labels, values -- table elements
df

Unnamed: 0,name,age,weight
0,peko,110,90
1,miko,35,40
2,suiesei,4,80
3,godtone,50,87


In [54]:
# keys() gives the column labels; index gives the row labels.
(df.keys(), df.index)

(Index(['name', 'age', 'weight'], dtype='object'),
 RangeIndex(start=0, stop=4, step=1))

In [25]:
# The same dict passed to Series is NOT expanded into columns: each key becomes
# one label and each whole list is stored as a single object element.
pd.Series(data=data_dict)

name      [peko, miko, suiesei, godtone]
age                     [110, 35, 4, 50]
weight                  [90, 40, 80, 87]
dtype: object

In [26]:
# Selecting one column with a single key returns a pandas Series.
name_column = df['name']
print("its data type is", type(name_column))
name_column

its data type is <class 'pandas.core.series.Series'>


0       peko
1       miko
2    suiesei
3    godtone
Name: name, dtype: object

In [51]:
# Selecting with a list of keys returns a (sub)DataFrame, with the columns in
# the order they were requested.
df[["weight", "name"]]

Unnamed: 0,weight,name
0,90,peko
1,40,miko
2,80,suiesei
3,87,godtone


In [47]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,weight
count,4.0,4.0
mean,49.75,74.25
std,44.5,23.214579
min,4.0,40.0
25%,27.25,70.0
50%,42.5,83.5
75%,65.0,87.75
max,110.0,90.0


In [48]:
# The same summary statistics for a single column; the result is a Series.
df["age"].describe()

count      4.00
mean      49.75
std       44.50
min        4.00
25%       27.25
50%       42.50
75%       65.00
max      110.00
Name: age, dtype: float64

### CSV example

In [2]:
# Read the NBA roster CSV straight from its URL into a DataFrame.
# Pass index_col="Name" (or any other column key) to read_csv to use that
# column's values as the row index instead of the default 0..n-1 labels.
NBA_CSV_URL = "https://media.geeksforgeeks.org/wp-content/uploads/nba.csv"
data = pd.read_csv(NBA_CSV_URL)

# Keep the first and the last rows in their own variables for later display.
data_top, data_end = data.head(), data.tail()

# Display the first rows.
data_top


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [72]:
# Last rows of the roster; note that the final row of this file is entirely empty.
data_end

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [73]:
# Summary statistics of the numeric columns; "count" excludes missing values,
# which is why Salary has a smaller count than the other columns.
data.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


### Access row elements (.loc, .iloc)


#### loc
Access via labels (row index labels and column key names).

In [None]:
# Print the built-in documentation of the label-based .loc indexer.
help(pd.DataFrame.loc)

In [93]:

# .loc[row_label, column_label(s)]: a single cell, selected columns of one row,
# and finally the whole row (returned as a Series).
(
    data.loc[0, "Name"],
    "-----",
    data.loc[0, ["Name", "College"]],
    "-----",
    data.loc[0],
)

('Avery Bradley',
 '-----',
 Name       Avery Bradley
 College            Texas
 Name: 0, dtype: object,
 '-----',
 Name         Avery Bradley
 Team        Boston Celtics
 Number                 0.0
 Position                PG
 Age                   25.0
 Height                 6-2
 Weight               180.0
 College              Texas
 Salary           7730337.0
 Name: 0, dtype: object)

In [9]:
# Array of all players' ages: every row (":"), column label "Age".
data.loc[:, "Age"].values

array([25., 25., 27., 22., 29., 29., 21., 25., 22., 22., 24., 27., 27.,
       20., 26., 27., 24., 28., 21., 32., 22., 26., 23., 28., 21., 26.,
       25., 26., 28., 27., 30., 33., 23., 32., 34., 25., 24., 23., 28.,
       26., 20., 26., 28., 32., 25., 23., 37., 25., 25., 22., 22., 22.,
       32., 24., 24., 22., 20., 27., 22., 25., 20., 23., 20., 29., 26.,
       29., 24., 30., 23., 27., 23., 25., 36., 29., 24., 24., 33., 24.,
       31., 25., 28., 26., 26., 32., 30., 20., 23., 30., 28., 26., 33.,
       27., 29., 36., 23., 29., 27., 28., 27., 29., 31., 38., 39., 31.,
       23., 25., 31., 24., 23., 37., 24., 29., 33., 25., 23., 21., 20.,
       27., 29., 36., 31., 26., 19., 28., 33., 21., 25., 24., 22., 27.,
       25., 32., 30., 31., 22., 23., 25., 27., 30., 36., 27., 22., 28.,
       25., 25., 24., 29., 27., 23., 24., 30., 25., 31., 26., 35., 23.,
       35., 30., 27., 24., 25., 27., 31., 21., 27., 24., 25., 33., 24.,
       31., 35., 35., 35., 31., 27., 25., 29., 25., 30., 25., 33

Sometimes, to change the values of a column, you should use `.loc` instead of assigning through chained indexing.

In [None]:
# Small self-contained frame so this cell does not depend on earlier state.
df = pd.DataFrame({'A': [1, 2, 3, 4]})

# Wrong: chained indexing assigns into a temporary copy of the selected rows,
# so df itself is not reliably updated (pandas warns about this pattern):
#     df[df['A'] > 2]['A'] = 4

# Right: one .loc call selects the rows and the column and assigns in place.
df.loc[df['A'] > 2, 'A'] = 4

If you have multiple conditions, you should combine them into a single boolean mask, e.g.

In [None]:
# Suppose this is your DataFrame
df = pd.DataFrame(
    {
        "A": [20, 55, 75, 10, 90, 45, 60],
        "B": ["cat", "cat", "dog", "dog", "dog", "cat", "dog"],
    }
)

# Combine both conditions into one boolean mask with "&" (each comparison in
# its own parentheses): rows where A is greater than 50 and B is 'dog'.
# Then use .loc to add 100 to column A on exactly those rows.
big_dog_rows = (df["A"] > 50) & (df["B"] == "dog")
df.loc[big_dog_rows, "A"] += 100

print(df)

#### iloc
Access via integer position (row and column index).

In [4]:
# .iloc[row_position, column_position(s)]: a single cell, the first three
# columns of one row, and finally the whole row (returned as a Series).
(
    data.iloc[0, 0],
    "-----",
    data.iloc[0, [0, 1, 2]],
    "-----",
    data.iloc[0],
)


('Avery Bradley',
 '-----',
 Name       Avery Bradley
 Team      Boston Celtics
 Number               0.0
 Name: 0, dtype: object,
 '-----',
 Name         Avery Bradley
 Team        Boston Celtics
 Number                 0.0
 Position                PG
 Age                   25.0
 Height                 6-2
 Weight               180.0
 College              Texas
 Salary           7730337.0
 Name: 0, dtype: object)

In [6]:
# Array of all players' ages, selected by position: every row (":") of the
# column at position 4, which is "Age" in this file.
data.iloc[:, 4].values

array([25., 25., 27., 22., 29., 29., 21., 25., 22., 22., 24., 27., 27.,
       20., 26., 27., 24., 28., 21., 32., 22., 26., 23., 28., 21., 26.,
       25., 26., 28., 27., 30., 33., 23., 32., 34., 25., 24., 23., 28.,
       26., 20., 26., 28., 32., 25., 23., 37., 25., 25., 22., 22., 22.,
       32., 24., 24., 22., 20., 27., 22., 25., 20., 23., 20., 29., 26.,
       29., 24., 30., 23., 27., 23., 25., 36., 29., 24., 24., 33., 24.,
       31., 25., 28., 26., 26., 32., 30., 20., 23., 30., 28., 26., 33.,
       27., 29., 36., 23., 29., 27., 28., 27., 29., 31., 38., 39., 31.,
       23., 25., 31., 24., 23., 37., 24., 29., 33., 25., 23., 21., 20.,
       27., 29., 36., 31., 26., 19., 28., 33., 21., 25., 24., 22., 27.,
       25., 32., 30., 31., 22., 23., 25., 27., 30., 36., 27., 22., 28.,
       25., 25., 24., 29., 27., 23., 24., 30., 25., 31., 26., 35., 23.,
       35., 30., 27., 24., 25., 27., 31., 21., 27., 24., 25., 33., 24.,
       31., 35., 35., 35., 31., 27., 25., 29., 25., 30., 25., 33

### Resample
Example of resampled dataframes with different parameters and methods.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
# Create an irregularly spaced time series with second-level timestamps.
time = pd.to_datetime([
    '2022-01-01 00:01:01', '2022-01-01 00:01:02', '2022-01-01 00:01:03', '2022-01-01 00:01:11',
    '2022-01-01 00:01:17', '2022-01-01 00:01:24', '2022-01-01 00:01:50', '2022-01-01 00:01:51',
    '2022-01-01 00:02:00', '2022-01-01 00:02:02',
    '2022-01-01 00:02:34', '2022-01-01 00:02:35', '2022-01-01 00:02:40', '2022-01-01 00:03:01',
    '2022-01-01 00:03:41', '2022-01-01 00:04:59', '2022-01-01 00:06:19',
])

# Three value columns on the same timestamps: a ramp with a custom first value,
# squares, and reciprocals (shifted by 0.0123 to avoid dividing by zero).
ts = pd.Series(np.append(2.3, np.arange(len(time) - 1)), index=time)
ts2 = pd.Series(np.arange(len(time)) ** 2, index=time)
ts3 = pd.Series((np.arange(len(time), dtype=float) + 0.0123) ** (-1), index=time)

df = pd.DataFrame({'A': ts, 'B': ts2, "C": ts3}, index=time)

# Resample to one-minute bins and sum the values within each bin.
# The 'min' alias is used because the single-letter 'T' alias is deprecated.
#   label  -- which bin edge is used to name the bin
#   closed -- which bin edge is inclusive
res1 = df[['A', 'C']].resample('min').sum()
res2 = df[['A', 'C']].resample('min', label='right').sum()
res3 = df[['A', 'C']].resample('min', closed='right').sum()
res4 = df[['A', 'C']].resample('min', label='right', closed='right').sum()

print("Original:")
print(df.head(15))

print("\nResampled :")
print(res1.head(10))

print("\nResampled , label =r:")
print(res2.head(10))

print("\nResampled , closed =r:")
print(res3.head(10))

print("\nResampled , label = closed =r:")
print(res4.head(10))

Combining `label`, `closed` and `ffill` gives surprising results; the reason is not yet understood (TODO: investigate).

In [None]:
# Forward/backward fill on one-minute bins of column B, with different
# label/closed settings. The 'min' alias is used because the single-letter
# 'T' alias is deprecated.
res5 = df['B'].resample('min').ffill()
# Same result shifted one bin earlier (reuses res5 instead of resampling again).
res5_shift = res5.shift(-1)
# Last observed value in each bin, with empty bins forward-filled afterwards.
# Note: despite the variable name, no shift is applied here.
res5_last_shift = df['B'].resample('min').last().ffill()
res6 = df['B'].resample('min', label='right').ffill()
res7 = df['B'].resample('min', closed='right').ffill()
res8 = df['B'].resample('min', label='right', closed='right').ffill()

res9 = df['B'].resample('min').bfill()

print("Original:")
print(df)

print("\nResampled ffill")
print(res5.head(10))

print("\nResampled ffill shift -1")
print(res5_shift.head(10))

# Label corrected: this result is last() followed by ffill(), without a shift.
print("\nResampled last, then ffill (no shift)")
print(res5_last_shift.head(10))

print("\nResampled ffill , label= r:")
print(res6.head(10))

print("\nResampled ffill , closed= r:")
print(res7.head(10))

print("\nResampled ffill ,label=closed=r :")
print(res8.head(10))

print("\nResampled bfil :")
print(res9.head(10))



### Apply
Apply a function to a specific part of the dataframe/series.

DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), by_row='compat', engine='python', engine_kwargs=None, **kwargs)

```axis``` : 0 = each column is passed to the function as its input, 1 = each row is passed to the function as its input.
For example, with axis = 0, in `apply(lambda x: ...)` each column is `x`.

### Groupby


In [28]:
import pandas as pd

# Create an example DataFrame
df = pd.DataFrame(
    {
        "Category": ["A", "A", "A", "B", "B", "C", "A", "C"],
        "Type": ["X", "Y", "X", "X", "Y", "X", "Y", "Y"],
        "Value": [1, 2, 0.22, 3, 4, 5, -0.21, 6],
        "TT": ["Xe", "Ya", "Xe", "Xe", "Ye", "Xa", "Yu", "Ya"],
    }
)

# Group by the 'Category' and 'Type' columns, then compute the sum of each
# group (for the string column TT, summing concatenates the strings).
group_keys = ["Category", "Type"]
grouped = df.groupby(group_keys).sum()

grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,TT
Category,Type,Unnamed: 2_level_1,Unnamed: 3_level_1
A,X,1.22,XeXe
A,Y,1.79,YaYu
B,X,3.0,Xe
B,Y,4.0,Ye
C,X,5.0,Xa
C,Y,6.0,Ya


In [17]:
# Show the un-grouped frame again, for comparison with the grouped results.
df

Unnamed: 0,Category,Type,Value,TT
0,A,X,1.0,Xe
1,A,Y,2.0,Ya
2,A,X,0.22,Xe
3,B,X,3.0,Xe
4,B,Y,4.0,Ye
5,C,X,5.0,Xa
6,A,Y,-0.21,Yu
7,C,Y,6.0,Ya


In [19]:
 # first() keeps, for every (Category, Type) group, the first row's value in each column.
 df.groupby(['Category', 'Type']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,TT
Category,Type,Unnamed: 2_level_1,Unnamed: 3_level_1
A,X,1.0,Xe
A,Y,2.0,Ya
B,X,3.0,Xe
B,Y,4.0,Ye
C,X,5.0,Xa
C,Y,6.0,Ya


In [33]:
 # Take the first TT value per (Category, Type) group, then unstack() moves the
 # inner index level (Type) into columns, giving a Category x Type table.
 df.groupby(['Category', 'Type'])['TT'].first().unstack()

Type,X,Y
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
A,Xe,Ya
B,Xe,Ye
C,Xa,Ya


In [24]:
# Column-oriented description of four animals.
d = {
    "num_legs": [4, 4, 2, 2],
    "class": ["mammal", "mammal", "mammal", "bird"],
    "animal": ["cat", "dog", "bat", "penguin"],
    "locomotion": ["walks", "walks", "flies", "walks"],
}

# Build the frame, then move three columns into a hierarchical (multi-level)
# row index, leaving num_legs as the only data column.
df = pd.DataFrame(data=d).set_index(["class", "animal", "locomotion"])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,num_legs
class,animal,locomotion,Unnamed: 3_level_1
mammal,cat,walks,4
mammal,dog,walks,4
mammal,bat,flies,2
bird,penguin,walks,2
