# **CH07** 깔끔한 데이터

## 2. 열 이름 관리하기

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
ebola = pd.read_csv('../data/country_timeseries.csv')
ebola.head()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,


In [5]:
ebola.columns

Index(['Date', 'Day', 'Cases_Guinea', 'Cases_Liberia', 'Cases_SierraLeone',
       'Cases_Nigeria', 'Cases_Senegal', 'Cases_UnitedStates', 'Cases_Spain',
       'Cases_Mali', 'Deaths_Guinea', 'Deaths_Liberia', 'Deaths_SierraLeone',
       'Deaths_Nigeria', 'Deaths_Senegal', 'Deaths_UnitedStates',
       'Deaths_Spain', 'Deaths_Mali'],
      dtype='object')

In [6]:
ebola_long = pd.melt(ebola, id_vars=['Date','Day'])
ebola_long.head()

Unnamed: 0,Date,Day,variable,value
0,1/5/2015,289,Cases_Guinea,2776.0
1,1/4/2015,288,Cases_Guinea,2775.0
2,1/3/2015,287,Cases_Guinea,2769.0
3,1/2/2015,286,Cases_Guinea,
4,12/31/2014,284,Cases_Guinea,2730.0


In [7]:
ebola_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1952 entries, 0 to 1951
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      1952 non-null   object 
 1   Day       1952 non-null   int64  
 2   variable  1952 non-null   object 
 3   value     738 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 61.1+ KB


### split 메서드로 열 이름 분리하기

In [8]:
ebola_long.variable

0       Cases_Guinea
1       Cases_Guinea
2       Cases_Guinea
3       Cases_Guinea
4       Cases_Guinea
            ...     
1947     Deaths_Mali
1948     Deaths_Mali
1949     Deaths_Mali
1950     Deaths_Mali
1951     Deaths_Mali
Name: variable, Length: 1952, dtype: object

In [9]:
ebola_long.variable.str

<pandas.core.strings.accessor.StringMethods at 0x216425293c0>

In [10]:
# ndarray=> object type으로 저장되어있기에 str로 변환 필요
#                 DataFrame.Column명
variable_split = ebola_long.variable.str.split('_')
# split return list

variable_split[:5]

0    [Cases, Guinea]
1    [Cases, Guinea]
2    [Cases, Guinea]
3    [Cases, Guinea]
4    [Cases, Guinea]
Name: variable, dtype: object

In [11]:
type(variable_split)

pandas.core.series.Series

In [12]:
type(variable_split[0])

list

#### 분리한 Series 기존 DataFrame에 추가하기

In [13]:
# ndarray=> object type으로 저장되어있기에 str로 변환 필요

status_values = variable_split.str.get(0)
country_values = variable_split.str.get(1)

status_values[:5], country_values[:5]

(0    Cases
 1    Cases
 2    Cases
 3    Cases
 4    Cases
 Name: variable, dtype: object,
 0    Guinea
 1    Guinea
 2    Guinea
 3    Guinea
 4    Guinea
 Name: variable, dtype: object)

In [14]:
status_values.value_counts(),country_values.value_counts()

(variable
 Cases     976
 Deaths    976
 Name: count, dtype: int64,
 variable
 Guinea          244
 Liberia         244
 SierraLeone     244
 Nigeria         244
 Senegal         244
 UnitedStates    244
 Spain           244
 Mali            244
 Name: count, dtype: int64)

In [15]:
ebola_long['status'] = status_values
ebola_long['country'] = country_values

ebola_long.head()

Unnamed: 0,Date,Day,variable,value,status,country
0,1/5/2015,289,Cases_Guinea,2776.0,Cases,Guinea
1,1/4/2015,288,Cases_Guinea,2775.0,Cases,Guinea
2,1/3/2015,287,Cases_Guinea,2769.0,Cases,Guinea
3,1/2/2015,286,Cases_Guinea,,Cases,Guinea
4,12/31/2014,284,Cases_Guinea,2730.0,Cases,Guinea


#### split method로 열 이름 분리하기 및</n> 분리한 Series 기존 Dataframe에 추가하기 
 (concat 사용)

##### ***split('',expand=True)***

In [16]:
variable_split_ex = ebola_long.variable.str.split('_', expand=True) 
# =>expand=True : return DataFrame

In [17]:
variable_split_ex.head()

Unnamed: 0,0,1
0,Cases,Guinea
1,Cases,Guinea
2,Cases,Guinea
3,Cases,Guinea
4,Cases,Guinea


In [18]:
variable_split_ex.columns = ['status','country']
ebola_parsed = pd.concat([ebola_long, variable_split_ex],axis=1)

In [19]:
ebola_parsed.head()

Unnamed: 0,Date,Day,variable,value,status,country,status.1,country.1
0,1/5/2015,289,Cases_Guinea,2776.0,Cases,Guinea,Cases,Guinea
1,1/4/2015,288,Cases_Guinea,2775.0,Cases,Guinea,Cases,Guinea
2,1/3/2015,287,Cases_Guinea,2769.0,Cases,Guinea,Cases,Guinea
3,1/2/2015,286,Cases_Guinea,,Cases,Guinea,Cases,Guinea
4,12/31/2014,284,Cases_Guinea,2730.0,Cases,Guinea,Cases,Guinea


## 3. 여러 열을 하나로 정리하기

#### melt, pivot_table method

In [20]:
weather = pd.read_csv('../data/weather.csv')
weather.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,


In [21]:
weather.columns

Index(['id', 'year', 'month', 'element', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6',
       'd7', 'd8', 'd9', 'd10', 'd11', 'd12', 'd13', 'd14', 'd15', 'd16',
       'd17', 'd18', 'd19', 'd20', 'd21', 'd22', 'd23', 'd24', 'd25', 'd26',
       'd27', 'd28', 'd29', 'd30', 'd31'],
      dtype='object')

##### ***melt()***

In [22]:
weather_melt = pd.melt(weather, 
                       id_vars=['id','year','month','element'],
                       var_name='day',
                       value_name='temp'
)
weather_melt.head()

Unnamed: 0,id,year,month,element,day,temp
0,MX17004,2010,1,tmax,d1,
1,MX17004,2010,1,tmin,d1,
2,MX17004,2010,2,tmax,d1,
3,MX17004,2010,2,tmin,d1,
4,MX17004,2010,3,tmax,d1,


##### ***pivot_table()***

In [None]:
#pd.pivot_table?

In [23]:
weather_tidy = weather_melt.pivot_table(
    index=['id','year','month','day'],  # key 역할
    columns='element',
    values='temp',
    dropna=False
)
weather_tidy

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,element,tmax,tmin
id,year,month,day,Unnamed: 4_level_1,Unnamed: 5_level_1
MX17004,2010,1,d1,,
MX17004,2010,1,d10,,
MX17004,2010,1,d11,,
MX17004,2010,1,d12,,
MX17004,2010,1,d13,,
MX17004,2010,...,...,...,...
MX17004,2010,12,d5,,
MX17004,2010,12,d6,27.8,10.5
MX17004,2010,12,d7,,
MX17004,2010,12,d8,,


In [32]:
weather_tidy.reset_index()
# element : index이름 아님, [id,year,month,day] table명

element,id,year,month,day,tmax,tmin
0,MX17004,2010,1,d1,,
1,MX17004,2010,1,d10,,
2,MX17004,2010,1,d11,,
3,MX17004,2010,1,d12,,
4,MX17004,2010,1,d13,,
...,...,...,...,...,...,...
336,MX17004,2010,12,d5,,
337,MX17004,2010,12,d6,27.8,10.5
338,MX17004,2010,12,d7,,
339,MX17004,2010,12,d8,,


## 4. 중복 데이터 처리하기

In [43]:
billboard = pd.read_csv('../data/billboard.csv')
billboard.columns

Index(['year', 'artist', 'track', 'time', 'date.entered', 'wk1', 'wk2', 'wk3',
       'wk4', 'wk5', 'wk6', 'wk7', 'wk8', 'wk9', 'wk10', 'wk11', 'wk12',
       'wk13', 'wk14', 'wk15', 'wk16', 'wk17', 'wk18', 'wk19', 'wk20', 'wk21',
       'wk22', 'wk23', 'wk24', 'wk25', 'wk26', 'wk27', 'wk28', 'wk29', 'wk30',
       'wk31', 'wk32', 'wk33', 'wk34', 'wk35', 'wk36', 'wk37', 'wk38', 'wk39',
       'wk40', 'wk41', 'wk42', 'wk43', 'wk44', 'wk45', 'wk46', 'wk47', 'wk48',
       'wk49', 'wk50', 'wk51', 'wk52', 'wk53', 'wk54', 'wk55', 'wk56', 'wk57',
       'wk58', 'wk59', 'wk60', 'wk61', 'wk62', 'wk63', 'wk64', 'wk65', 'wk66',
       'wk67', 'wk68', 'wk69', 'wk70', 'wk71', 'wk72', 'wk73', 'wk74', 'wk75',
       'wk76'],
      dtype='object')

In [46]:
billboard.shape

(317, 81)

In [34]:
billboard_long = pd.melt(
      billboard
    , id_vars=['year','artist','track','time','date.entered']
    , var_name='week'
    , value_name='rating'
)
billboard_long.head()

Unnamed: 0,year,artist,track,time,date.entered,week,rating
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,wk1,91.0
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,wk1,81.0
3,2000,3 Doors Down,Loser,4:24,2000-10-21,wk1,76.0
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,wk1,57.0


In [35]:
billboard_long.shape

(24092, 7)

In [47]:
#열 단위 추출
billboard_long[billboard_long.track == 'Loser'].shape

(76, 7)

In [48]:
billboard_songs = billboard_long[['year','artist','track','time']]
billboard_songs.shape

(24092, 4)

##### ***drop_duplicates***

In [50]:
billboard_songs = billboard_songs.drop_duplicates()
billboard_songs.shape

(317, 4)

In [51]:
billboard_songs['id'] = range(len(billboard_songs))
billboard_songs.head()

Unnamed: 0,year,artist,track,time,id
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,1
2,2000,3 Doors Down,Kryptonite,3:53,2
3,2000,3 Doors Down,Loser,4:24,3
4,2000,504 Boyz,Wobble Wobble,3:35,4


In [54]:
billboard_rating = billboard_long.merge(
    billboard_songs
    ,on=['year','artist','track','time']
)
billboard_rating.shape

(24092, 8)

In [56]:
billboard_rating.head()

Unnamed: 0,year,artist,track,time,date.entered,week,rating,id
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0,0
1,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk2,82.0,0
2,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk3,72.0,0
3,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk4,77.0,0
4,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk5,87.0,0


## 5. 대용량 데이터 처리하기

### 여러개로 나누어진 데이터 불러오기

In [63]:
import os
import urllib.request

In [61]:
# 뉴욕 택시 데이터

# 1번줄 url:https://s3.amazonaws.com/nyc-tlc/trip+data/fhv_tripdata_2015-01.csv

url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/fhv_tripdata_2015-01.csv '
url.split('/')[-1].strip()

'fhv_tripdata_2015-01.csv'

In [67]:
os.path.join('','../data/','fhv_tripdata_2015-01.csv')

'../data/fhv_tripdata_2015-01.csv'

In [229]:
os.path.join?

[1;31mSignature:[0m [0mos[0m[1;33m.[0m[0mpath[0m[1;33m.[0m[0mjoin[0m[1;33m([0m[0mpath[0m[1;33m,[0m [1;33m*[0m[0mpaths[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m <no docstring>
[1;31mFile:[0m      c:\users\user\anaconda3\envs\pandas-dev\lib\ntpath.py
[1;31mType:[0m      function

In [231]:
with open('../data/raw_data_urls.txt','r') as data_urls:
    for line, url in enumerate(data_urls):
        if line == 5:
            break
        fn = url.split('/')[-1].strip()
        fp = os.path.join('..','data',fn)
        print(url)
        print(fp)
        #urllib.request.urlretrieve(url,fp) => access권한 없음~에러

https://s3.amazonaws.com/nyc-tlc/trip+data/fhv_tripdata_2015-01.csv

..\data\fhv_tripdata_2015-01.csv
https://s3.amazonaws.com/nyc-tlc/trip+data/fhv_tripdata_2015-02.csv

..\data\fhv_tripdata_2015-02.csv
https://s3.amazonaws.com/nyc-tlc/trip+data/fhv_tripdata_2015-03.csv

..\data\fhv_tripdata_2015-03.csv
https://s3.amazonaws.com/nyc-tlc/trip+data/fhv_tripdata_2015-04.csv

..\data\fhv_tripdata_2015-04.csv
https://s3.amazonaws.com/nyc-tlc/trip+data/fhv_tripdata_2015-05.csv

..\data\fhv_tripdata_2015-05.csv


In [207]:
import glob

In [218]:
nyc_taxi_data = glob.glob('../data/2015*')
print(nyc_taxi_data)

['../data\\20150427AM_For_Hire_Vehicle_Trip_Data.csv', '../data\\20150427PM_For_Hire_Vehicle_Trip_Data.csv', '../data\\20150428AM_For_Hire_Vehicle_Trip_Data.csv', '../data\\20150428PM_For_Hire_Vehicle_Trip_Data.csv']


In [211]:
taxi1 = pd.read_csv(nyc_taxi_data[0])
taxi1.shape

(48384, 3)

In [233]:
taxi_list = []

for i in range(len(nyc_taxi_data)):
    filename = (nyc_taxi_data[i].split('/')[-1].split('\\')[-1])
    fp = os.path.join('..','data',filename)
    print(fp)
    taxi_list.append(pd.read_csv(fp))
    #taxi_list.append(pd.read_csv('../data/'+filename))
len(taxi_list)

..\data\20150427AM_For_Hire_Vehicle_Trip_Data.csv
..\data\20150427PM_For_Hire_Vehicle_Trip_Data.csv
..\data\20150428AM_For_Hire_Vehicle_Trip_Data.csv
..\data\20150428PM_For_Hire_Vehicle_Trip_Data.csv


4

In [226]:
taxi = pd.concat(taxi_list)

In [227]:
taxi.shape

(245581, 3)

In [212]:
list_taxi_df=[]

for csv_filename in nyc_taxi_data:
    df = pd.read_csv(csv_filename)
    list_taxi_df.append(df)

len(list_taxi_df)

4

In [213]:
type(list_taxi_df[0])

pandas.core.frame.DataFrame

In [214]:
list_taxi_df[0].head()

Unnamed: 0,Dispatching_base_num,Pickup_date,locationID
0,B00009,04/27/2015 12:00:00 AM,
1,B00013,04/27/2015 12:00:00 AM,
2,B00013,04/27/2015 12:00:00 AM,
3,B00221,04/27/2015 12:00:00 AM,243.0
4,B00248,04/27/2015 12:00:00 AM,


In [215]:
taxi_loop_concat = pd.concat(list_taxi_df)
taxi_loop_concat.shape

(245581, 3)

In [228]:
taxi.equals(taxi_loop_concat)

True

---

# **CH08** 판다스 자료형

## 1. 자료형 다루기

In [70]:
tips = sns.load_dataset('tips')

In [74]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   sex_str     244 non-null    object  
dtypes: category(4), float64(2), int64(1), object(1)
memory usage: 9.3+ KB


### 자료형 변환하기

In [71]:
tips['sex_str'] = tips['sex'].astype(str)

In [75]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   sex_str     244 non-null    object  
dtypes: category(4), float64(2), int64(1), object(1)
memory usage: 9.3+ KB


### 잘못입력한 데이터 처리하기

In [79]:
# 샘플 데이터 뽑아두기(원본훼손방지)

tips_sub_miss = tips.head(10)
tips_sub_miss.loc[[1,3,5,7],'total_bill'] = 'missing'
tips_sub_miss.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  10 non-null     object  
 1   tip         10 non-null     float64 
 2   sex         10 non-null     category
 3   smoker      10 non-null     category
 4   day         10 non-null     category
 5   time        10 non-null     category
 6   size        10 non-null     int64   
 7   sex_str     10 non-null     object  
dtypes: category(4), float64(1), int64(1), object(2)
memory usage: 1.0+ KB


**pd.to_numberic**(

        , errors = 'raise' or 'coerce' or 'ignore'
        , downcast = 'integer' or 'signed' or 'unsigned' or 'float' 등
)

errors
- raise  : 숫자로 변환할 수 없는 값이 있으면 오류 발생
- coerce : 숫자로 변환할 수 없는 값을 누락값으로 지정
- ignore : 아무작업도 하지 않음



downcast
- signed   : 부호 있는
- unsigned : 부호 없는

In [81]:
tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'],errors='ignore')
tips_sub_miss.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'],errors='ignore')


total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [82]:
# 'missing' => NaN 으로 바뀜

tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'],errors='coerce')
tips_sub_miss.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'],errors='coerce')


total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [83]:
# 다운캐스팅

tips_sub_miss['total_bill'] = pd.to_numeric(
    tips_sub_miss['total_bill']
    ,errors='coerce'
    ,downcast='float'    
)
tips_sub_miss.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tips_sub_miss['total_bill'] = pd.to_numeric(


total_bill     float32
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

## 2. 카테고리 자료형

: 적은 용량, 빠른 속도

-> 주로 범주형 데이터 구성에 사용

### 문자열을 카테고리로 변환하기

In [84]:
tips['sex'] = tips['sex'].astype('str')

0      Female
1        Male
2        Male
3        Male
4      Female
        ...  
239      Male
240    Female
241      Male
242      Male
243    Female
Name: sex, Length: 244, dtype: object

In [85]:
tips.info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   sex_str     244 non-null    object  
dtypes: category(4), float64(2), int64(1), object(1)
memory usage: 9.3+ KB


In [86]:
tips['sex'] = tips['sex'].astype ('category')

In [92]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   sex_str     244 non-null    object  
dtypes: category(4), float64(2), int64(1), object(1)
memory usage: 9.3+ KB


In [105]:
import sys

In [115]:
menus = pd.Series([
        '치즈라면','치즈라면'
        ,'김밥','김밥',
        '치즈라면','치즈라면'
        ,'김밥','김밥',
        '치즈라면','치즈라면'
        ,'김밥','김밥'
])

In [116]:
# 개별 메모리 용량
menus.apply(sys.getsizeof)

0     82
1     82
2     78
3     78
4     82
5     82
6     78
7     78
8     82
9     82
10    78
11    78
dtype: int64

In [117]:
# object 일 때 전체  메모리 용량
menus.memory_usage(index=False, deep=True)

1056

In [118]:
# category일 때 전체 메모리 용량
menus.astype('category').memory_usage(index=False, deep=True)

316

In [119]:
menus.astype('category').values.categories

Index(['김밥', '치즈라면'], dtype='object')

In [120]:
menus.astype('category').values.codes

array([1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0], dtype=int8)

In [234]:
taxi_df = pd.read_csv('../data/2015_For_Hire_Vehicle_Trip_Data.csv')
taxi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63867609 entries, 0 to 63867608
Data columns (total 3 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Dispatching_base_num  object 
 1   Pickup_date           object 
 2   locationID            float64
dtypes: float64(1), object(2)
memory usage: 1.4+ GB


---
# **CH09** 문자열 처리하기

## 1. 문자열 다루기

### 문자열 추출하기

In [122]:
word = 'hello world'

word[0:4]

'hell'

In [125]:
word[4:9]

'o wor'

In [130]:
word[-3:]

'rld'

In [133]:
word[:-1]

'hello worl'

In [131]:
word[:]

'hello world'

In [132]:
word[::2]

'hlowrd'

In [136]:
word[::-1]

'dlrow olleh'

## 2. 문자열 메서드

- capitalize    : 첫문자를 대문자로
- count         : 문자열의 개수
- startwith     : 문자열이 특정 문자로 시작하면 True
- endwith       : 문자열이 특정 문자로 끝나면 True      => 확장자 찾을 때 사용
- find          : 찾을 문자열의 첫번째 인덱스, 없으면 -1
- index         :
- isalpha       : 모든 문자가 알파벳이면 True
- isdecimal     : 모든 문자가 숫자면 True
- isalnum       : 모든 문자가 알파벳이거나 숫자면 True
- lower 
- upper
- replace
- strip         : 맨앞, 맨뒤 공백 제거
- split         : 구분자를 지정하여 문자열 나누고, 리스트 반환
- partition     
- center         
- zfill         : (zerofill) 문자열의 빈칸을 0으로 채움

### 1) join method

In [137]:
d1 = '40'
m1 = '46'
s1 = '52.837'
u1 = 'N'

d2 = '73'
m2 = '58'
s2 = '26.302'
u2 = 'W'

In [139]:
# 한 문자열로
'/'.join([d1,m2,s1,u1])

'40/58/52.837/N'

### 2) splitlines

In [140]:
multi_str = '''hello word
nice to meet you!
where are you from?
have a good day~
'''

In [141]:
multi_str.splitlines()

['hello word', 'nice to meet you!', 'where are you from?', 'have a good day~']

In [142]:
multi_str[::2]

'hlowr\niet etyu\nhr r o rm\naeago a~'

### 3) replace

In [143]:
multi_str.replace('you','')

'hello word\nnice to meet !\nwhere are  from?\nhave a good day~\n'

## 3. 문자열 포매팅

- %
- format
- f

In [145]:
var = 'flesh wound'
s = "It's just a {}"

s.format(var)

"It's just a flesh wound"

In [146]:
print("In 2005, Lu Chao of China recited {:,} digits of pi".format(67890))

In 2005, Lu Chao of China recited 67,890 digits of pi


In [148]:
print("I remember {0:.4} or {0:.4%} of what Lu Chao Recited".format(7/67890))

I remember 0.0001031 or 0.0103% of what Lu Chao Recited


In [151]:
print("My ID number is {0:05d}".format(42))

My ID number is 00042


## 4. 정규식

https://regex101.com


In [152]:
import re

In [165]:
tele_num = '1234567890'

In [166]:
pat = re.compile('\d\d\d\d\d\d\d\d\d\d')
pat.match(tele_num)

<re.Match object; span=(0, 10), match='1234567890'>

In [167]:
m = re.match(pattern='\d\d\d\d\d\d\d\d\d\d', string=tele_num)
m

<re.Match object; span=(0, 10), match='1234567890'>

In [160]:
bool(m)

True

In [181]:
tele_num = '01011112222' #11자리
m = re.search('\d{11}', tele_num)

if m : 
    print(m)
    print(m.start())
    print(m.end())
    print(m.span())
    print(m.group())

else :
    print(m)

<re.Match object; span=(0, 11), match='01011112222'>
0
11
(0, 11)
01011112222


In [185]:
tele_num_spaces = '123 456 7890'
m = re.match(pattern='\d{10}',string=tele_num_spaces)
print(m)    #공백때문에 매칭x

None


In [188]:
p = '\d{3}\s?\d{3}\s?\d{4}'
m = re.match(pattern=p,string=tele_num_spaces)

print(m)

<re.Match object; span=(0, 12), match='123 456 7890'>


In [201]:
tele_num_space_paren_dash = '(123) 456-7890'
#p = '\(?\d{3}\)?\s?\d{3}\s?-?\d{4}'
p = '\(\d{3}\)\s\d{3}\s?-\d{4}'
m = re.match(p, tele_num_space_paren_dash)
print(m)

<re.Match object; span=(0, 14), match='(123) 456-7890'>


In [205]:
cnty_tele_num_space_paren_dash = '+1 (123) 456-7890'

p = '\+1\s\(\d{3}\)\s\d{3}\s?-\d{4}'
m = re.match(p, cnty_tele_num_space_paren_dash)
print(m)

<re.Match object; span=(0, 17), match='+1 (123) 456-7890'>
