## Pandas

In [1]:
# pandas is built on top of NumPy.
# pandas is well suited to analyzing and manipulating datasets.

### How to generate data using Pandas

In [2]:
import pandas as pd

In [3]:
## Series
# pd.Series(sequence data)

In [4]:
s1 = pd.Series([10,20,30,40,50]) # pass the values as a list ([...]) inside the parentheses

In [5]:
s1 # index /value

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [6]:
s1.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
s1.values

array([10, 20, 30, 40, 50], dtype=int64)

In [11]:
# A numpy.array holds elements of a single data type.
# A pandas Series, by contrast, can hold values of mixed data types.

In [8]:
s2 = pd.Series(["a",'b','c',1,2,3]) # values of several data types can be stored together

In [9]:
s2

0    a
1    b
2    c
3    1
4    2
5    3
dtype: object

In [12]:
s2.index

RangeIndex(start=0, stop=6, step=1)

In [13]:
s2.values # the attribute is `values`, not `value`

array(['a', 'b', 'c', 1, 2, 3], dtype=object)

### missing value : 결측값 

In [15]:
import numpy as np
s3 = pd.Series([np.nan,10,30])

In [16]:
s3

0     NaN
1    10.0
2    30.0
dtype: float64

----- sequence data in pandas  -----

## DataFrame(2-dimensional data)

In [18]:
# df=pd.DataFrame(data, index=index_data, columns=columns_data)
# index, columns, values

In [8]:
import pandas as pd

In [9]:
df=pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]])

In [10]:
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [22]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

In [23]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [24]:
df.columns

RangeIndex(start=0, stop=3, step=1)

In [29]:
# ndarray
np.arange(0,9).reshape(3,3)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [30]:
type(np.arange(0,9).reshape(3,3))

numpy.ndarray

In [28]:
# pandas
pd.DataFrame(np.arange(0,9).reshape(3,3)) # wrap the ndarray with pd.DataFrame to build a DataFrame

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [31]:
type(pd.DataFrame(np.arange(0,9).reshape(3,3)))

pandas.core.frame.DataFrame

In [32]:
# you can change the name of index and columns

In [36]:
pd.DataFrame(np.arange(0,9).reshape(3,3),index=["a",'b','c'],columns=['x','y','z'])
# the index and columns arguments set the row and column labels

Unnamed: 0,x,y,z
a,0,1,2
b,3,4,5
c,6,7,8


In [37]:
# list, ndarray -> pandas 
# dictionary -> pandas

In [42]:
# Yearly sales figures; each keyword below becomes a key of the dict.
table = dict(
    year=[2015, 2016, 2017, 2018],
    sales=[200, 250, 450, 300],
)

In [43]:
type(table)

dict

In [45]:
pd.DataFrame(table)
# the dict keys (year, sales) become the column names

Unnamed: 0,year,sales
0,2015,200
1,2016,250
2,2017,450
3,2018,300


### operation

In [46]:
s1 = pd.Series([1,2,3,4,5])
s2=pd.Series([10,20,30,40,50])

In [47]:
s1+s2

0    11
1    22
2    33
3    44
4    55
dtype: int64

In [48]:
s1-s2

0    -9
1   -18
2   -27
3   -36
4   -45
dtype: int64

In [49]:
s1/s2

0    0.1
1    0.1
2    0.1
3    0.1
4    0.1
dtype: float64

In [50]:
# Note the structure of the dictionary: each key maps to a list of values.
table_2 = dict(
    A=[1, 2, 3],
    B=[10, 20, 30],
    C=[100, 200, 300],
)

In [54]:
df2=pd.DataFrame(table_2)

In [52]:
# Same keys as table_2, but only two values per key.
table_3 = dict(
    A=[6, 7],
    B=[60, 70],
    C=[600, 700],
)

In [55]:
df3=pd.DataFrame(table_3)

In [56]:
df2.shape

(3, 3)

In [57]:
df3.shape

(2, 3)

In [58]:
df2+df3

Unnamed: 0,A,B,C
0,7.0,70.0,700.0
1,9.0,90.0,900.0
2,,,


In [59]:
df2-df3

Unnamed: 0,A,B,C
0,-5.0,-50.0,-500.0
1,-5.0,-50.0,-500.0
2,,,


In [60]:
df2/df3

Unnamed: 0,A,B,C
0,0.166667,0.166667,0.166667
1,0.285714,0.285714,0.285714
2,,,


In [61]:
# Arithmetic aligns on index labels, not on shape: a label missing from either DataFrame (row 2 here) produces missing values (NaN).

### descriptive statistics

In [3]:
import pandas as pd

In [11]:
# Four values per season, keyed by season name.
table_4 = dict(
    Spring=[256, 264, 215, 312],
    Summer=[770, 567, 599, 387],
    Fall=[363, 231, 293, 247],
    Winter=[139, 59, 76, 109],
)

In [12]:
df_5=pd.DataFrame(table_4,index=['2012','2013','2014','2015'])
# precipitation data, indexed by year

In [13]:
df_5

Unnamed: 0,Spring,Summer,Fall,Winter
2012,256,770,363,139
2013,264,567,231,59
2014,215,599,293,76
2015,312,387,247,109


In [14]:
# sum for each column

df_5.sum()

Spring    1047
Summer    2323
Fall      1134
Winter     383
dtype: int64

In [17]:
# sum for each index
df_5.sum(axis=1) 

2012    1528
2013    1121
2014    1183
2015    1055
dtype: int64

In [18]:
# mean for each column (axis=0); use axis=1 to get the mean for each index
df_5.mean(axis=0)

Spring    261.75
Summer    580.75
Fall      283.50
Winter     95.75
dtype: float64

In [19]:
# std for each index
df_5.std(axis=1)

2012    274.365936
2013    211.239793
2014    221.192789
2015    117.964331
dtype: float64

In [20]:
# min for each index
df_5.min(axis=1)

2012    139
2013     59
2014     76
2015    109
dtype: int64

In [21]:
# max for each index
df_5.max(axis=1)

2012    770
2013    567
2014    599
2015    387
dtype: int64

In [76]:
# descriptive statistics
df_5.describe() # compute the descriptive (summary) statistics of each column

Unnamed: 0,Spring,Summer,Fall,Winter
count,4.0,4.0,4.0,4.0
mean,261.75,580.75,283.5,95.75
std,39.785885,156.925407,59.157981,35.528158
min,215.0,387.0,231.0,59.0
25%,245.75,522.0,243.0,71.75
50%,260.0,583.0,270.0,92.5
75%,276.0,641.75,310.5,116.5
max,312.0,770.0,363.0,139.0


In [77]:
type(df_5.describe())

pandas.core.frame.DataFrame

In [78]:
df_5.describe().shape

(8, 4)

## 팀 프로젝트

1) Project: download from GitHub
    1. Interpret the pre-existing code line by line
    2. Modify (수정) the code
    -> Summarize the tasks in a document and upload it
github code: https://github.com/eduardosm7/knn-python/blob/master/Classification/survival-knn.ipynb