In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import IPython.display 

### Cheat sheet

In [8]:
# Embed the pandas cheat-sheet PDF inline in the notebook.
IPython.display.IFrame(
    "https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf",
    width="800",
    height="600",
)

### Series

#### Creating series from array

In [12]:

# An empty Series. The dtype is given explicitly so that the result (and the
# absence of a default-dtype warning) does not depend on the pandas version.
ser = pd.Series(dtype=object)
print("Pandas Series: ", ser)

# A Series built from a 1-D NumPy array; with no index supplied, the rows are
# labelled by a default RangeIndex.
data = np.array(['g', 'e', 'e', 'k', 's'])
ser = pd.Series(data)
print("Pandas Series:\n", ser)

Pandas Series:  Series([], dtype: object)
Pandas Series:
 0    g
1    e
2    e
3    k
4    s
dtype: object


In [56]:
(
    ser.index,   # row labels (a default RangeIndex when none were supplied)
    ser.values,  # the stored values as a NumPy array
)

(RangeIndex(start=0, stop=5, step=1),
 array(['g', 'e', 'e', 'k', 's'], dtype=object))

#### Create series from dict

In [55]:

# Postal-code strings mapped to district names.
zip_codes = {
    "100": "中正區",
    "103": "大同區",
    "104": "中山區",
    "105": "松山區",
    "106": "大安區",
    "108": "萬華區",
    "110": "信義區",
    "111": "士林區",
    "112": "北投區",
    "114": "內湖區",
    "115": "南港區",
    "116": "文山區",
}
# Building a Series from a dict: keys become the index labels, values the data.
code = pd.Series(data=zip_codes)
code

100    中正區
103    大同區
104    中山區
105    松山區
106    大安區
108    萬華區
110    信義區
111    士林區
112    北投區
114    內湖區
115    南港區
116    文山區
dtype: object

In [15]:
(
    code.index,   # the dict keys, now used as row labels
    code.values,  # the dict values as a NumPy array
)

(Index(['100', '103', '104', '105', '106', '108', '110', '111', '112', '114',
        '115', '116'],
       dtype='object'),
 array(['中正區', '大同區', '中山區', '松山區', '大安區', '萬華區', '信義區', '士林區', '北投區',
        '內湖區', '南港區', '文山區'], dtype=object))

In [16]:
# Access by position (.iloc) or by label ([] / .loc).
# A bare integer key such as code[2] on a string-labelled Series relies on a
# deprecated positional fallback, so the position is requested explicitly.
code.iloc[2], code['104'], code.iloc[2] == code['104']

('中山區', '中山區', True)

In [17]:
# Positional slice (stop excluded) and a single positional lookup.
# .iloc makes the positional intent explicit; a bare integer key on a
# string-labelled Series is a deprecated fallback.
code.iloc[2:8], code.iloc[7]

(104    中山區
 105    松山區
 106    大安區
 108    萬華區
 110    信義區
 111    士林區
 dtype: object,
 '士林區')

In [18]:
(
    code.head(n=-1),  # negative n: every row except the last one
    code.head(n=2),   # positive n: the first two rows
)

(100    中正區
 103    大同區
 104    中山區
 105    松山區
 106    大安區
 108    萬華區
 110    信義區
 111    士林區
 112    北投區
 114    內湖區
 115    南港區
 dtype: object,
 100    中正區
 103    大同區
 dtype: object)

#### Series operation

In [19]:
# Index labels must come from an ordered sequence. The original used a set,
# whose iteration order is not defined, so which value landed under which
# label could change from run to run. The list below pins the pairing to the
# one shown in the recorded output.
index = ["godtone", "peko", "kuuga", "miko"]
data_s1 = pd.Series([1, 0.2, "r", 1138918238], index=index)        # mixed types -> object dtype
data_s2 = pd.Series([1234, 0.2, 3.240201, 21.12459], index=index)  # all numeric -> float64

# Arithmetic between Series is element-wise; for object dtype, "+" uses each
# element's own "+" (so "r" + "r" gives "rr").
data_s1 + data_s1, "------------", data_s2 - data_s2, "---------------", data_s2 * data_s2

(godtone             2
 peko              0.4
 kuuga              rr
 miko       2277836476
 dtype: object,
 '------------',
 godtone    0.0
 peko       0.0
 kuuga      0.0
 miko       0.0
 dtype: float64,
 '---------------',
 godtone    1.522756e+06
 peko       4.000000e-02
 kuuga      1.049890e+01
 miko       4.462483e+02
 dtype: float64)

One can refer to https://www.geeksforgeeks.org/python-pandas-series/?ref=lbp for more operations

#### Misc property

In [20]:
# Convert the Series values to a plain Python list (to_list is an alias of tolist).
data_s1.to_list()

[1, 0.2, 'r', 1138918238]

In [21]:
# A Series is mutable: assigning through a label overwrites the stored value.
data_s1.loc["peko"] = 1
data_s1.loc["peko"]

1

### Data frame

In [22]:
# With no data the constructor yields a frame that has no rows and no columns.
df = pd.DataFrame(data=None)
print(df)

Empty DataFrame
Columns: []
Index: []


In [43]:
# Build a DataFrame from a dict of equal-length lists:
# each key becomes a column label and each list supplies that column's values.
data_dict = {
    'name': ["peko", "miko", "suiesei", "godtone"],
    'age': [110, 35, 4, 50],
    'weight': [90, 40, 80, 87],
}
df = pd.DataFrame(data=data_dict)
# index -- row labels, keys -- column labels, values -- table elements
df

Unnamed: 0,name,age,weight
0,peko,110,90
1,miko,35,40
2,suiesei,4,80
3,godtone,50,87


In [54]:
(
    df.keys(),  # column labels
    df.index,   # row labels
)

(Index(['name', 'age', 'weight'], dtype='object'),
 RangeIndex(start=0, stop=4, step=1))

In [25]:
# The same dict passed to Series: each whole list is stored as one element
# under its key, rather than being spread out into a column.
pd.Series(data=data_dict)

name      [peko, miko, suiesei, godtone]
age                     [110, 35, 4, 50]
weight                  [90, 40, 80, 87]
dtype: object

In [26]:
# Selecting a single column label with [] yields a Series.
print("its data type is", type(df["name"]))
df["name"]

its data type is <class 'pandas.core.series.Series'>


0       peko
1       miko
2    suiesei
3    godtone
Name: name, dtype: object

In [51]:
# A list of column labels selects a (sub-)DataFrame, in the order listed.
df[["weight", "name"]]

Unnamed: 0,weight,name
0,90,peko
1,40,miko
2,80,suiesei
3,87,godtone


In [47]:
# Summary statistics; the recorded output covers the numeric columns (age, weight) only.
df.describe()

Unnamed: 0,age,weight
count,4.0,4.0
mean,49.75,74.25
std,44.5,23.214579
min,4.0,40.0
25%,27.25,70.0
50%,42.5,83.5
75%,65.0,87.75
max,110.0,90.0


In [48]:
# The same summary for a single column; the result is a Series named after that column.
df["age"].describe()

count      4.00
mean      49.75
std       44.50
min        4.00
25%       27.25
50%       42.50
75%       65.00
max      110.00
Name: age, dtype: float64

### CSV example

In [71]:
# Load the NBA sample CSV straight from its URL into a DataFrame.
# (Pass index_col="Name", or another column, to use that column's values as the row labels.)
data = pd.read_csv(
    "https://media.geeksforgeeks.org/wp-content/uploads/nba.csv"
)

# Keep the first and the last rows (head/tail with their default n) for display.
data_top, data_end = data.head(), data.tail()
data_top


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [72]:
# Last rows of the CSV; in the recorded output the final row (457) is entirely empty.
data_end

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [73]:
# Summary statistics per numeric column; in the recorded output the Salary count (446)
# is lower than the others (457), i.e. some salaries are missing.
data.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


### Access row elements (.loc, .iloc)


#### loc

In [3]:
# Print the built-in documentation of the label-based indexer.
help(pd.DataFrame.loc)

Help on property:

    Access a group of rows and columns by label(s) or a boolean array.
    
    ``.loc[]`` is primarily label based, but may also be used with a
    boolean array.
    
    Allowed inputs are:
    
    - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
      interpreted as a *label* of the index, and **never** as an
      integer position along the index).
    - A list or array of labels, e.g. ``['a', 'b', 'c']``.
    - A slice object with labels, e.g. ``'a':'f'``.
    
          start and the stop are included
    
    - A boolean array of the same length as the axis being sliced,
      e.g. ``[True, False, True]``.
    - An alignable boolean Series. The index of the key will be aligned before
      masking.
    - An alignable Index. The Index of the returned selection will be the input.
    - A ``callable`` function with one argument (the calling Series or
      DataFrame) and that returns valid output for indexing (one of the above)
    
    See more at 

In [93]:

# .loc selects by label: a single cell, several columns of one row, or the whole row.
(
    data.loc[0, "Name"],
    "-----",
    data.loc[0, ["Name", "College"]],
    "-----",
    data.loc[0],
)

('Avery Bradley',
 '-----',
 Name       Avery Bradley
 College            Texas
 Name: 0, dtype: object,
 '-----',
 Name         Avery Bradley
 Team        Boston Celtics
 Number                 0.0
 Position                PG
 Age                   25.0
 Height                 6-2
 Weight               180.0
 College              Texas
 Salary           7730337.0
 Name: 0, dtype: object)

# 待補