# Python Data Analysis Library - Pandas
## Pandas
: 구조화된 데이터의 처리를 지원하는 Python Library, Python계의 excel
- Panel data -> pandas (Tabular : table type data)
- 고성능 array 계산 library인 numpy와 통합하여 강력한 스프레드시트 처리 기능 제공
- indexing, 연산용 함수, 전처리 함수 등 제공
- data 처리 및 통계 분석을 위해 사용
- Excel 맨위 : attribute, field, feature, column
- Excel data table, Sample
- Row : instance, tuple, row
- Column : Feature vector
- one cell : data
### - Pandas 설치
conda create -n ml python=3.10 # 가상환경 생성 <br>
conda activate ml # 가상환경 실행 <br>
conda install pandas # pandas 설치 <br>
jupyter notebook # 주피터 실행하기

## 1. Pandas Overview
### 1.1 데이터 로딩

In [1]:
import pandas as pd

In [5]:
# Data URL (housing data hosted on the UCI archive)
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
# Separator: the delimiter used to split each line into fields.
# sep=r"\s+" is a regular expression: \s matches one whitespace character
# and + means "one or more", so any run of whitespace separates two fields.
# The raw-string prefix keeps "\s" from being parsed as an (invalid) string escape.
# Load the file CSV-style; header=None because no row is used as column names.
df_data = pd.read_csv(data_url, sep=r"\s+", header=None)

In [6]:
# head() : 처음 다섯줄 출력
df_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [7]:
# Assign the column header names (one label per column, in order)
df_data.columns = ["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"]
df_data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [8]:
df_data.values

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 3.9690e+02, 4.9800e+00,
        2.4000e+01],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 3.9690e+02, 9.1400e+00,
        2.1600e+01],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 3.9283e+02, 4.0300e+00,
        3.4700e+01],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 5.6400e+00,
        2.3900e+01],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 3.9345e+02, 6.4800e+00,
        2.2000e+01],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 7.8800e+00,
        1.1900e+01]])

## 2. Series
### 2.1 Pandas의 구성
- Series : DataFrame 중 하나의 Column에 해당하는 Data의 모음 object
- DataFrame : DataTable 전체를 포함하는 object
### 2.2 일반적인 pandas 활용
- 위와 같이 기존 데이터를 불러와서 DataFrame 생성
### 2.3 Series
- Column Vector를 표현하는 object
- Backed by a numpy.ndarray (`.values` returns one); since pandas 0.13 Series is no longer a subclass of numpy.ndarray
- Data : any type
- Index labels need not be ordered
- Duplicates are possible (but result in reduced functionality)

In [9]:
from pandas import Series, DataFrame
import pandas as pd 

In [15]:
# Sample values plus a list of labels (the labels are not used in this cell)
list_data = [1, 2, 3, 4, 5]
name = ["a", "b", "c", "d", "e"]
# No index is passed, so the Series gets the default 0..n-1 index
example_obj = Series(list_data)
example_obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [25]:
list_data = [1, 2, 3, 4, 5]
name = ["a", "b", "c", "d", "e"]
# Pass the labels as the index so each value is addressed by name
example_obj = Series(list_data, index=name)
example_obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

- Index List만 출력

In [26]:
example_obj.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

- 값 List만 출력

In [27]:
example_obj.values

array([1, 2, 3, 4, 5], dtype=int64)

- Data에 대한 Information 저장

In [28]:
# Attach descriptive metadata: a name for the Series itself ...
example_obj.name = "number"
# ... and a name for its index; both show up when the Series is displayed
example_obj.index.name = "alphabet"
example_obj

alphabet
a    1
b    2
c    3
d    4
e    5
Name: number, dtype: int64

In [19]:
type(example_obj.values)

numpy.ndarray

In [11]:
import numpy as np

In [20]:
# Build a Series from a dict: the keys become the index labels
dict_data = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
# Set the element dtype and the Series name at construction time
example_obj = Series(data=dict_data, name="example_data", dtype=np.float32)
example_obj

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

- data index에 접근하기

In [21]:
example_obj["a"]

1.0

- data index에 값 할당하기

In [22]:
# Overwrite the value stored under the label "a"
example_obj["a"]=3.2
example_obj

a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

Python `float`으로 변환하면 기본 dtype은 float64

In [23]:
example_obj = example_obj.astype(float)
# dtype changes from float32 to float64 (see the dtype in the output below)
example_obj

a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float64

In [24]:
example_obj = example_obj.astype(int)
# float64 -> integer dtype; fractional parts are dropped (3.2 became 3).
# NOTE(review): the recorded output shows int32 rather than int64 — presumably
# the platform's default integer width for this run; confirm on your setup.
example_obj

a    3
b    2
c    3
d    4
e    5
Name: example_data, dtype: int32

- Index 값을 기준으로 Series 생성 => 인덱스가 있으면 Value가 없어도 생성됨

In [29]:
# Only the labels "a".."e" have values in the dict
dict_data_1 = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
# The requested index is longer than the dict; labels without a value become NaN
indexes = list("abcdefgh")
series_obj_1 = Series(data=dict_data_1, index=indexes)
series_obj_1

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
dtype: float64

## 3. DataFrame
### 3.1 DataFrame Memory
- Numpy array-like
- Each column can have a different type
- Row and column index
- Size mutable : insert and delete columns

### 3.2 DataFrame Overview
- Series를 모아서 만든 Data Table = 기본 2차원
- dict로 생성할 때 key 값이 column 이름이 됨
### 3.3 DataFrame 생성

In [30]:
# Column-oriented records: each dict key maps to that column's values
raw_data = {
    "first_name": ["Jason", "Molly", "Tina", "Jake", "Amy"],
    "last_name": ["Miller", "Jacobson", "Ali", "Milner", "Cooze"],
    "age": [42, 52, 36, 24, 73],
    "city": ["San Francisco", "Baltimore", "Miami", "Douglas", "Boston"],
}
# `columns` names the keys to use and the order in which they appear
df = pd.DataFrame(
    data=raw_data,
    columns=["first_name", "last_name", "age", "city"],
)
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [32]:
# dict type
raw_data

{'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
 'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
 'age': [42, 52, 36, 24, 73],
 'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']}

- column 선택

In [33]:
DataFrame(raw_data, columns=["age","city"])

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Miami
3,24,Douglas
4,73,Boston


- 새로운 column 추가

In [34]:
DataFrame(raw_data, columns=["first_name","last_name","age","city","debt"])

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


- column 선택 -> Series 추출

In [35]:
# Rebuild df with an extra "debt" column; raw_data has no such key, so the
# column is created empty (NaN), as the displayed frames show
df = DataFrame(raw_data, columns=["first_name","last_name","age","city","debt"])
# Attribute-style access to a single column; the result is a Series
df.first_name

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [60]:
# Series type
df["first_name"]

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [61]:
type(df["first_name"])

pandas.core.series.Series

In [64]:
# DataFrame으로 출력
df[["first_name"]]

Unnamed: 0,first_name
0,Jason
1,Molly
2,Tina
3,Jake
4,Amy


### 3.4 DataFrame Indexing

In [38]:
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


- loc : index label(이름) 기준으로 선택

In [39]:
df.loc[1]

first_name        Molly
last_name      Jacobson
age                  52
city          Baltimore
debt                NaN
Name: 1, dtype: object

- iloc : index position

In [40]:
df["age"].iloc[1:]

1    52
2    36
3    24
4    73
Name: age, dtype: int64

In [44]:
df.loc[:,["last_name"]]

Unnamed: 0,last_name
0,Miller
1,Jacobson
2,Ali
3,Milner
4,Cooze


- loc은 index 이름
- iloc은 index number

In [41]:
# Ten NaN values whose integer labels do not match their positions
s = pd.Series(data=np.nan, index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5])
# loc slices by label: everything up to and including the label 3
s.loc[:3]

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [43]:
# 3개까지만 추출
s.iloc[:3]

49   NaN
48   NaN
47   NaN
dtype: float64

### 3.5 DataFrame Handling
- column에 새로운 data 할당

In [45]:
# Store a boolean flag (age over 40) in the existing "debt" column.
# Bracket assignment is used instead of `df.debt = ...`: attribute-style
# assignment only updates a column that already exists and would otherwise
# just set a plain Python attribute on the object without adding a column.
df["debt"] = df["age"] > 40
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


In [46]:
# Labels 0, 1 and 3 only: there is no entry for 2 or 4
values = Series(["M", "F", "F"], index=[0, 1, 3])
values

0    M
1    F
3    F
dtype: object

In [47]:
# index가 0,1,3에 해당하는 data에만 할당, 아닌 곳은 Nan
df["sex"] = values
df

Unnamed: 0,first_name,last_name,age,city,debt,sex
0,Jason,Miller,42,San Francisco,True,M
1,Molly,Jacobson,52,Baltimore,True,F
2,Tina,Ali,36,Miami,False,
3,Jake,Milner,24,Douglas,False,F
4,Amy,Cooze,73,Boston,True,


- Transpose

In [48]:
df.T

Unnamed: 0,0,1,2,3,4
first_name,Jason,Molly,Tina,Jake,Amy
last_name,Miller,Jacobson,Ali,Milner,Cooze
age,42,52,36,24,73
city,San Francisco,Baltimore,Miami,Douglas,Boston
debt,True,True,False,False,True
sex,M,F,,F,


- All 값 출력

In [49]:
df.values

array([['Jason', 'Miller', 42, 'San Francisco', True, 'M'],
       ['Molly', 'Jacobson', 52, 'Baltimore', True, 'F'],
       ['Tina', 'Ali', 36, 'Miami', False, nan],
       ['Jake', 'Milner', 24, 'Douglas', False, 'F'],
       ['Amy', 'Cooze', 73, 'Boston', True, nan]], dtype=object)

- index값 출력

In [51]:
df.index

RangeIndex(start=0, stop=5, step=1)

- CSV 변환

In [50]:
df.to_csv()

',first_name,last_name,age,city,debt,sex\r\n0,Jason,Miller,42,San Francisco,True,M\r\n1,Molly,Jacobson,52,Baltimore,True,F\r\n2,Tina,Ali,36,Miami,False,\r\n3,Jake,Milner,24,Douglas,False,F\r\n4,Amy,Cooze,73,Boston,True,\r\n'

- column 삭제
    - del df["column name"]
    - df.drop("column name", axis =1)  -> df 자체는 변하지 않음

In [55]:
# Remove the "debt" column from df itself (in place)
del df["debt"]
df

Unnamed: 0,first_name,last_name,age,city,sex
0,Jason,Miller,42,San Francisco,M
1,Molly,Jacobson,52,Baltimore,F
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,F
4,Amy,Cooze,73,Boston,


In [56]:
# Nested dict: outer keys become the columns, inner keys become the row labels;
# a row label missing from one inner dict shows up as an empty (NaN) cell
pop = {
    "Nevada": {2001: 2.4, 2002: 2.9},
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
DataFrame(pop)

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5
