# 자료형 다루기
## 자료형 변환하기

In [1]:
import pandas as pd
import seaborn as sns

tips = sns.load_dataset("tips")

print(tips.shape)
print(tips.columns)

(244, 7)
Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')


- astype

In [3]:
#흡연여부 데이터를 문자열로 변환하여 저장
tips['smoker_str'] = tips['smoker'].astype(str)
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object


In [5]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,smoker_str
0,16.99,1.01,Female,No,Sun,Dinner,2,No
1,10.34,1.66,Male,No,Sun,Dinner,3,No
2,21.01,3.5,Male,No,Sun,Dinner,3,No
3,23.68,3.31,Male,No,Sun,Dinner,2,No
4,24.59,3.61,Female,No,Sun,Dinner,4,No


In [11]:
#전체 금액 데이터를 문자열로 변환
tips['total_bill'] = tips['total_bill'].astype(str)
tips.dtypes

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object

In [13]:
#전체 금액 데이터를 실수로 변환
tips['total_bill'] = tips['total_bill'].astype(float)
tips.dtypes

#category로 변환할때는 'category'

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object

- 잘못 입력한 데이터 처리하기
    - 숫자형태의 데이터에 문자열을 입력하면 object 형태로 자동 변경됨
    - astype() 으로 숫자 변환 시에는 문자 데이터 처리가 불가능함

In [17]:
tips.loc[[1,3,5,7],['total bill']] = 'missing'

In [19]:
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
total bill      object
dtype: object

- to_numeric
    - raise, coerce, ignore를 지정하여 제어 가능

In [23]:
tips_sub_miss = tips.head(10)
tips_sub_miss.loc[[1,3,5,7], 'total_bill'] = "missing"

tips_sub_miss.dtypes
tips_sub_miss

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tips_sub_miss.loc[[1,3,5,7], 'total_bill'] = "missing"


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,smoker_str,total bill
0,16.99,1.01,Female,No,Sun,Dinner,2,No,
1,missing,1.66,Male,No,Sun,Dinner,3,No,missing
2,21.01,3.5,Male,No,Sun,Dinner,3,No,
3,missing,3.31,Male,No,Sun,Dinner,2,No,missing
4,24.59,3.61,Female,No,Sun,Dinner,4,No,
5,missing,4.71,Male,No,Sun,Dinner,4,No,missing
6,8.77,2.0,Male,No,Sun,Dinner,2,No,
7,missing,3.12,Male,No,Sun,Dinner,4,No,missing
8,15.04,1.96,Male,No,Sun,Dinner,2,No,
9,14.78,3.23,Male,No,Sun,Dinner,2,No,


In [27]:
tips = sns.load_dataset('tips')
tips['total_bill'] = tips['total_bill'].astype(str)
tips.dtypes

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [29]:
tips['total_bill'] = pd.to_numeric(tips['total_bill'])
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [31]:
pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce')

0    16.99
1      NaN
2    21.01
3      NaN
4    24.59
5      NaN
6     8.77
7      NaN
8    15.04
9    14.78
Name: total_bill, dtype: float64

In [33]:
pd.to_numeric(tips_sub_miss['total_bill'], errors='ignore')

0      16.99
1    missing
2      21.01
3    missing
4      24.59
5    missing
6       8.77
7    missing
8      15.04
9      14.78
Name: total_bill, dtype: object

## 카테고리 자료형
- 용량과 속도 면에서 매우 효율적
주로 동일한 문자열이 반복되어 데이터를 구성하는 경우에 사용

In [35]:
tips['smoker'] = tips['smoker'].astype('str')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    object  
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(3), float64(2), int64(1), object(1)
memory usage: 8.9+ KB


In [36]:
tips['smoker'] = tips['smoker'].astype('category')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
