In [7]:
import pandas as pd
df = pd.read_csv('../data/gapminder.tsv', sep='\t')
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [8]:
# groupby 사용 - year열 기준, lifeExp 열의 편균값을 구함
avg_life_exp_by_year = df.groupby('year').lifeExp.mean()

print(avg_life_exp_by_year)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [9]:
# groupby 메서드에 열 이름을 전달하면 '분할' 작업이 일어남

year = df.year.unique()
print(year)

[1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007]


In [10]:
# 연도별 데이터 추출 예, '반영-1'

y1952 = df.loc[df.year == 1952, :]
print(y1952.head())

        country continent  year  lifeExp       pop    gdpPercap
0   Afghanistan      Asia  1952   28.801   8425333   779.445314
12      Albania    Europe  1952   55.230   1282697  1601.056136
24      Algeria    Africa  1952   43.077   9279525  2449.008185
36       Angola    Africa  1952   30.015   4232095  3520.610273
48    Argentina  Americas  1952   62.485  17876956  5911.315053


In [11]:
# 추출한 데이터의 편균을 구함 '반영-2'

y1952_mean = y1952.lifeExp.mean()
print(y1952_mean)

49.05761971830987


In [12]:
# 연도별 평균값을 구함 - '반영-3'

y1952 = df.loc[df.year == 1952, :]
y1952_mean = y1952.lifeExp.mean()
print(y1952_mean)

y1957 = df.loc[df.year == 1957, :]
y1957_mean = y1957.lifeExp.mean()
print(y1957_mean)

y1962 = df.loc[df.year == 1962, :]
y1962_mean = y1962.lifeExp.mean()
print(y1962_mean)

y2007 = df.loc[df.year == 2007, :]
y2007_mean = y2007.lifeExp.mean()
print(y1957_mean)



49.05761971830987
51.50740112676054
53.60924901408449
51.50740112676054


In [14]:
df2 = pd.DataFrame({'year':[1952,1957,1962,2007],
                   '':[ y1952_mean,y1957_mean, y1962_mean,y1957_mean ]})
df2

Unnamed: 0,year,Unnamed: 2
0,1952,49.05762
1,1957,51.507401
2,1962,53.609249
3,2007,51.507401


# 평균값을 구하는 사용자 함수와 groupby 메서드

In [28]:
def my_mean(values):
    n = len(values)
    #print('n -> %d' %n)
    
    sum = 0 # 그릇 준비 = sum
    
    for value in values: # value =  걍 이름 
        #print('value -> %d' %value)
        sum += value  # sum = sum + value
        #print('sum -> %d' %sum)
        
    return sum / n



In [25]:
my_mean([1,2,4,7]) # 들어온 값의 평균을 구함


n -> 4
value -> 1
sum -> 1
value -> 2
sum -> 3
value -> 4
sum -> 7
value -> 7
sum -> 14


3.5

In [26]:
my_mean([1,2,4,7,8,8])

n -> 6
value -> 1
sum -> 1
value -> 2
sum -> 3
value -> 4
sum -> 7
value -> 7
sum -> 14
value -> 8
sum -> 22
value -> 8
sum -> 30


5.0

In [45]:
# 사용자 정의 함수(my_mean)를 groupby 메서드와 조합하디 위해 agg 메서드 사용

agg_my_mean1 = df.groupby('year').lifeExp.agg(my_mean)
print(agg_my_mean1)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


# 두개의 인자값을 받아 처리하는 사용자 함수와 groupby 메서드

In [37]:
# 첫번째 인자로 받은 열의 평균값을 구하여 두번째 인자로 받은 값과 차이를 계산하여 반환

def my_mean_diff(values,diff_value):
    n = len(values)
    #print('n -> %d' %n)
    
    sum = 0 # 그릇 준비 = sum
    
    for value in values: # value =  걍 이름 
        sum += value  # sum = sum + value
    mean = sum / n
        
    return mean - diff_value

    

In [35]:
my_mean_diff([1,2,3,4], 10) # 입력이 2개라서 함수의 받는것도 2개

-7.5

In [39]:
# 전체 평균 수명
global_mean = df.lifeExp.mean()
print(global_mean)

59.47443936619713


In [40]:
agg_mean_diff = df.groupby('year').lifeExp.agg(my_mean_diff, diff_value = global_mean)
print(agg_mean_diff)

year
1952   -10.416820
1957    -7.967038
1962    -5.865190
1967    -3.796150
1972    -1.827053
1977     0.095718
1982     2.058758
1987     3.738173
1992     4.685899
1997     5.540237
2002     6.220483
2007     7.532983
Name: lifeExp, dtype: float64


## 문제. 위의 결과를 알아보기 쉽게 나타내보세요
 * year 연도별 평균 수명 | 전체 평균 수명 | 결과

In [46]:
agg_mean_diff = df.groupby('year').lifeExp.agg(my_mean_diff, diff_value=global_mean) 
a = pd.DataFrame(agg_my_mean1)
a['global_mean'] = 59.47443936619713
b = pd.DataFrame(agg_mean_diff)
result = pd.concat([a, b], axis = 1)
result.columns =['연도별 평균 수명', '전체 평균 수명','결과']
result = result.reset_index()
result


Unnamed: 0,year,연도별 평균 수명,전체 평균 수명,결과
0,1952,49.05762,59.474439,-10.41682
1,1957,51.507401,59.474439,-7.967038
2,1962,53.609249,59.474439,-5.86519
3,1967,55.67829,59.474439,-3.79615
4,1972,57.647386,59.474439,-1.827053
5,1977,59.570157,59.474439,0.095718
6,1982,61.533197,59.474439,2.058758
7,1987,63.212613,59.474439,3.738173
8,1992,64.160338,59.474439,4.685899
9,1997,65.014676,59.474439,5.540237


# 집계 메서드를 리스트, 딕셔너리에 담아 전달하기

In [47]:
# 집계메서드를 리스트에 담아 전달하기
# 연도별로 그룹화 한 lifeExp 열의 0이 아닌 값의 개수, 평균, 표준편차

import numpy as np

gdf = df.groupby('year').lifeExp.agg([np.count_nonzero, np.mean, np.std])
print(gdf)

      count_nonzero       mean        std
year                                     
1952          142.0  49.057620  12.225956
1957          142.0  51.507401  12.231286
1962          142.0  53.609249  12.097245
1967          142.0  55.678290  11.718858
1972          142.0  57.647386  11.381953
1977          142.0  59.570157  11.227229
1982          142.0  61.533197  10.770618
1987          142.0  63.212613  10.556285
1992          142.0  64.160338  11.227380
1997          142.0  65.014676  11.559439
2002          142.0  65.694923  12.279823
2007          142.0  67.007423  12.073021


In [52]:
# 집계 메서드를 딕셔너리에 담아 agg 메서드에 전달
# 평균, 중강값(median)

gdf_dict = df.groupby('year').agg({'lifeExp': 'mean',
                                   'pop': 'median', 
                                   'gdpPercap':'median'})
print(gdf_dict)

        lifeExp         pop    gdpPercap
year                                    
1952  49.057620   3943953.0  1968.528344
1957  51.507401   4282942.0  2173.220291
1962  53.609249   4686039.5  2335.439533
1967  55.678290   5170175.5  2678.334741
1972  57.647386   5877996.5  3339.129407
1977  59.570157   6404036.5  3798.609244
1982  61.533197   7007320.0  4216.228428
1987  63.212613   7774861.5  4280.300366
1992  64.160338   8688686.5  4386.085502
1997  65.014676   9735063.5  4781.825478
2002  65.694923  10372918.5  5319.804524
2007  67.007423  10517531.0  6124.371109


# 표준 점수 계산하기

In [54]:
# 통계 : 표준점수 = 데이터평균 - 표준편차
# 표준점수 - 통계에서 데이터의 평균과 표준편차의 차이를 표준점수

def my_zscore(x):
    return (x - x.mean()) / x.std()

In [56]:
# transform 데이터 변환 메서드
# 데이터와 메서드를 일대일로 대응시켜 계산
# 데이터의 양이 줄어들지 않는다 - 데이터 변환하는데 사용
# 데이터 변환에만 사용 = transform

transform_z = df.groupby('year').lifeExp.transform(my_zscore)
print(transform_z.head())

0   -1.656854
1   -1.731249
2   -1.786543
3   -1.848157
4   -1.894173
Name: lifeExp, dtype: float64


In [57]:
# 데이터의 양이 줄어들디 않음을 확인
print(df.shape)

(1704, 6)


In [58]:
# 데이터의 양이 줄어들디 않음을 확인(1704호 원본데이터와 동일)
print(transform_z.shape)

(1704,)


In [60]:
a = df.head(2)
print(a)
print('\n-----------------------------------------------------')

b = df.groupby('year').lifeExp.mean().head(2)
print(b)
print('\n-----------------------------------------------------')

c = df.groupby('year').lifeExp.std().head(2)
print(c)
print('\n-----------------------------------------------------')



       country continent  year  lifeExp      pop   gdpPercap
0  Afghanistan      Asia  1952   28.801  8425333  779.445314
1  Afghanistan      Asia  1957   30.332  9240934  820.853030

-----------------------------------------------------
year
1952    49.057620
1957    51.507401
Name: lifeExp, dtype: float64

-----------------------------------------------------
year
1952    12.225956
1957    12.231286
Name: lifeExp, dtype: float64

-----------------------------------------------------


# 누락값을 평균값으로 처리

In [61]:
import seaborn as sns
import numpy as np

# sample()을 활용한 무작위 샘플 데이터 만들기
# 무작위 표본추출하는데'np.random.permutation' 함수를 사용해 순열을 무작위로 석은 후에 n개만큼 indexing해 오는 방법을 사용

# total_bill 열의 값 4개를임의로 선택해 누락갑ㄱ으로 바꿈 

np.random.seed(42)
tips_10 = sns.load_dataset('tips').sample(10)
tips_10.loc[np.random.permutation(tips_10.index)[:4], 'total_bill'] = np.NaN

print(tips_10)

     total_bill   tip     sex smoker   day    time  size
24        19.82  3.18    Male     No   Sat  Dinner     2
6          8.77  2.00    Male     No   Sun  Dinner     2
153         NaN  2.00    Male     No   Sun  Dinner     4
211         NaN  5.16    Male    Yes   Sat  Dinner     4
198         NaN  2.00  Female    Yes  Thur   Lunch     2
176         NaN  2.00    Male    Yes   Sun  Dinner     2
192       28.44  2.56    Male    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
9         14.78  3.23    Male     No   Sun  Dinner     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2


In [62]:
# total_bill 4개 누락값이므로 6개만 나옴, 다른 값은 다 10개 나옴 
# 누락값 - 여성1명, 남성3명

count_sex = tips_10.groupby('sex').count()
print(count_sex)

        total_bill  tip  smoker  day  time  size
sex                                             
Male             4    7       7    7     7     7
Female           2    3       3    3     3     3


In [63]:
def fill_na_mean(x):
    avg = x.mean()
    return x.fillna(avg)

In [65]:
# 성별을 구분하여 total_bill 열의 평균을구하는 함수를 적용
# tips_10에 새로운 열 (fill_total_bill)
# 남성과 여성의 누락값을 고려하여 계산한 평균값으로 채움

total_bill_group_mean = tips_10.groupby('sex').total_bill.transform(fill_na_mean)
tips_10['fill_total_bill'] = total_bill_group_mean
print(tips_10)

     total_bill   tip     sex smoker   day    time  size  fill_total_bill
24        19.82  3.18    Male     No   Sat  Dinner     2          19.8200
6          8.77  2.00    Male     No   Sun  Dinner     2           8.7700
153         NaN  2.00    Male     No   Sun  Dinner     4          17.9525
211         NaN  5.16    Male    Yes   Sat  Dinner     4          17.9525
198         NaN  2.00  Female    Yes  Thur   Lunch     2          13.9300
176         NaN  2.00    Male    Yes   Sun  Dinner     2          17.9525
192       28.44  2.56    Male    Yes  Thur   Lunch     2          28.4400
124       12.48  2.52  Female     No  Thur   Lunch     2          12.4800
9         14.78  3.23    Male     No   Sun  Dinner     2          14.7800
101       15.38  3.00  Female    Yes   Fri  Dinner     2          15.3800


# 데이터 필터링 사용하기 - filter 메서드

In [66]:
import seaborn as sns

tips = sns.load_dataset('tips')
print(tips.shape)

(244, 7)


In [67]:
# size 열의 데이터 수를 확인
# 5,6,1 테이블의 주문이 매우 적다

print(tips['size'].value_counts())


2    156
3     38
4     37
5      5
6      4
1      4
Name: size, dtype: int64


In [69]:
# 주문이 30번 이상 있는 테이블만 필터링
# lambda = 

tips_filtered = tips.groupby('size').filter(lambda x: x['size'].count() >= 30)

In [70]:
# 222-1, 216-5 주문이 적은 1,5번 테이블을 제외하고 가져옴 

tips_filtered.tail(30)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
212,48.33,9.0,Male,No,Sat,Dinner,4
213,13.27,2.5,Female,Yes,Sat,Dinner,2
214,28.17,6.5,Female,Yes,Sat,Dinner,3
215,12.9,1.1,Female,Yes,Sat,Dinner,2
217,11.59,1.5,Male,Yes,Sat,Dinner,2
218,7.74,1.44,Male,Yes,Sat,Dinner,2
219,30.14,3.09,Female,Yes,Sat,Dinner,4
220,12.16,2.2,Male,Yes,Fri,Lunch,2
221,13.42,3.48,Female,Yes,Fri,Lunch,2
223,15.98,3.0,Female,No,Fri,Lunch,3


In [71]:
print(tips_filtered['size'].value_counts())

2    156
3     38
4     37
Name: size, dtype: int64


In [75]:
f = lambda x: x+1 # lambda = 인자 : 표현식 
f(4)

5

In [79]:
def hap(x, y)
    return x + y # 위의 lambda 와 같은 함수 

SyntaxError: invalid syntax (<ipython-input-79-5aa5c1328297>, line 1)

# 그룹 오브젝트 저장하여 살펴보기 

In [80]:
tips_10 = sns.load_dataset('tips').sample(10, random_state=42)
print(tips_10)

     total_bill   tip     sex smoker   day    time  size
24        19.82  3.18    Male     No   Sat  Dinner     2
6          8.77  2.00    Male     No   Sun  Dinner     2
153       24.55  2.00    Male     No   Sun  Dinner     4
211       25.89  5.16    Male    Yes   Sat  Dinner     4
198       13.00  2.00  Female    Yes  Thur   Lunch     2
176       17.89  2.00    Male    Yes   Sun  Dinner     2
192       28.44  2.56    Male    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
9         14.78  3.23    Male     No   Sun  Dinner     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2


In [83]:
#자료형이 그룹 오브젝트임을 확인

grouped = tips_10.groupby('sex')
print(grouped)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000146DB8C6E88>


In [87]:
# 그룹 오브젝트에 포함된 그룹을 보려면 groups 속성을 출력
print(grouped.groups)

{'Male': Int64Index([24, 6, 153, 211, 176, 192, 9], dtype='int64'), 'Female': Int64Index([198, 124, 101], dtype='int64')}


# 그룹 오브젝트의 평균 구하기

In [90]:
# 그룹 오브젝트를 이용하여 평균을 구함
# smoker, day, time 열은 그룹 연산에서 제외됨
# 파이썬은 그룹 연산에 적합한 열을 알아서 골라줌 

avgs = grouped.mean()

print(avgs)

        total_bill       tip      size
sex                                   
Male         20.02  2.875714  2.571429
Female       13.62  2.506667  2.000000


In [91]:
print(tips_10.columns)

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')


# 그룹 오브젝트에서 데이터 추출하고 반복하기

In [92]:
female = grouped.get_group('Female')
print(female)

     total_bill   tip     sex smoker   day    time  size
198       13.00  2.00  Female    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2


# 그룹 오브젝트 계산하고 살펴보기

In [93]:
bill_sex_time = tips_10.groupby(['sex', 'time'])
group_avg = bill_sex_time.mean()

print(group_avg)

               total_bill       tip      size
sex    time                                  
Male   Lunch    28.440000  2.560000  2.000000
       Dinner   18.616667  2.928333  2.666667
Female Lunch    12.740000  2.260000  2.000000
       Dinner   15.380000  3.000000  2.000000


In [94]:
print(type(group_avg))

<class 'pandas.core.frame.DataFrame'>


In [95]:
print(group_avg)

               total_bill       tip      size
sex    time                                  
Male   Lunch    28.440000  2.560000  2.000000
       Dinner   18.616667  2.928333  2.666667
Female Lunch    12.740000  2.260000  2.000000
       Dinner   15.380000  3.000000  2.000000


In [96]:
print(group_avg.columns)

Index(['total_bill', 'tip', 'size'], dtype='object')


In [97]:
print(group_avg.index)

MultiIndex([(  'Male',  'Lunch'),
            (  'Male', 'Dinner'),
            ('Female',  'Lunch'),
            ('Female', 'Dinner')],
           names=['sex', 'time'])


In [99]:
# reset_index 메서드 대신 as_index=False 를 설정해도 같은 결과가 나옴 

group_method = tips_10.groupby(['sex','time']).mean().reset_index()
print(group_method)

      sex    time  total_bill       tip      size
0    Male   Lunch   28.440000  2.560000  2.000000
1    Male  Dinner   18.616667  2.928333  2.666667
2  Female   Lunch   12.740000  2.260000  2.000000
3  Female  Dinner   15.380000  3.000000  2.000000


# --------------------------------------------문제---------------------------------------

EDA(Exploratory Data Analysis, 탐색적 데이터 분석)

1.chipotle.tsv 파일을 로드하세요.

In [100]:
import pandas as pd
ch = pd.read_csv('../data/chipotle.tsv', sep='\t')
ch

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75


2.행과 열을 확인하세요.

In [105]:
print(ch.shape)

(4622, 5)


3.그외 기본 정보 확인하세요.

In [130]:
print(ch.head())

   order_id  quantity                              item_name  \
0         1         1           Chips and Fresh Tomato Salsa   
1         1         1                                   Izze   
2         1         1                       Nantucket Nectar   
3         1         1  Chips and Tomatillo-Green Chili Salsa   
4         2         2                           Chicken Bowl   

                                  choice_description item_price order_id_str  
0                                                NaN     $2.39             1  
1                                       [Clementine]     $3.39             1  
2                                            [Apple]     $3.39             1  
3                                                NaN     $2.39             1  
4  [Tomatillo-Red Chili Salsa (Hot), [Black Beans...    $16.98             2  


4. 10개의 row 데이터를 보여줍니다.

In [108]:
print(ch.head(10))

   order_id  quantity                              item_name  \
0         1         1           Chips and Fresh Tomato Salsa   
1         1         1                                   Izze   
2         1         1                       Nantucket Nectar   
3         1         1  Chips and Tomatillo-Green Chili Salsa   
4         2         2                           Chicken Bowl   
5         3         1                           Chicken Bowl   
6         3         1                          Side of Chips   
7         4         1                          Steak Burrito   
8         4         1                       Steak Soft Tacos   
9         5         1                          Steak Burrito   

                                  choice_description item_price  
0                                                NaN     $2.39   
1                                       [Clementine]     $3.39   
2                                            [Apple]     $3.39   
3                              

5. 칼럼들 출력

In [109]:
print(ch.columns)

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')


6. index 출력

In [110]:
print(ch.index)

RangeIndex(start=0, stop=4622, step=1)


7. 기초통계량

In [111]:
import pandas as pd
import numpy as np

dfdf = ch.describe()
print(dfdf)

          order_id     quantity
count  4622.000000  4622.000000
mean    927.254868     1.075725
std     528.890796     0.410186
min       1.000000     1.000000
25%     477.250000     1.000000
50%     926.000000     1.000000
75%    1393.000000     1.000000
max    1834.000000    15.000000


8. order_id str로 변환

In [123]:
ch['order_id_str'] = ch['order_id'].astype(str)

print(ch.dtypes)

print('\n-----------------------------------------------------')

print(ch.head())

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
order_id_str          object
dtype: object

-----------------------------------------------------
   order_id  quantity                              item_name  \
0         1         1           Chips and Fresh Tomato Salsa   
1         1         1                                   Izze   
2         1         1                       Nantucket Nectar   
3         1         1  Chips and Tomatillo-Green Chili Salsa   
4         2         2                           Chicken Bowl   

                                  choice_description item_price order_id_str  
0                                                NaN     $2.39             1  
1                                       [Clementine]     $3.39             1  
2                                            [Apple]     $3.39             1  
3                                                NaN    

9. order_id의 개수를 출력

In [124]:
ch['order_id'].count()

4622

10. item_name의 개수를 출력

In [126]:
ch['item_name'].count()

4622

11. 가장 많이 주문한 item : top 5을 출력

In [131]:
ch['item_name'].value_counts().head(5)

Chicken Bowl           726
Chicken Burrito        553
Chips and Guacamole    479
Steak Burrito          368
Canned Soft Drink      301
Name: item_name, dtype: int64

12. item당 주문 개수를 출력

In [132]:
print(ch['item_name'].value_counts())

Chicken Bowl                             726
Chicken Burrito                          553
Chips and Guacamole                      479
Steak Burrito                            368
Canned Soft Drink                        301
Chips                                    211
Steak Bowl                               211
Bottled Water                            162
Chicken Soft Tacos                       115
Chicken Salad Bowl                       110
Chips and Fresh Tomato Salsa             110
Canned Soda                              104
Side of Chips                            101
Veggie Burrito                            95
Barbacoa Burrito                          91
Veggie Bowl                               85
Carnitas Bowl                             68
Barbacoa Bowl                             66
Carnitas Burrito                          59
Steak Soft Tacos                          55
6 Pack Soft Drink                         54
Chips and Tomatillo Red Chili Salsa       48
Chicken Cr

# ---------------------------------------문제2-----------------------------

전국민 카드 및 대출 이용통계 데이터 연령대/거주지/성별 구분에
따른 카드 및 대출 이용통계 데이터로, 구분 값 별 카드이용금액 비교,
대출금액 및 대출 특이사항 비교 등이 가능


In [134]:
import pandas as pd
ca = pd.read_csv('../data/credit_card_data.csv', sep='\t')
ca.head(10)

Unnamed: 0,"pop_cd,year,month,avg_score,avg_rat,city,sex,ages,population,num_opencard,num_usecard,monthly_card_spend,monthly_lc,monthly_loan,monthly_bk_loan,monthly_cd_loan,monthly_installments_loan,monthly_insurance_loan,monthly_sbk_loan,loan_commitment,inst_rep_loanb,ls_rep_loanb,credit_loan,mortgage_loan,credit_card_payment,credit_card_installments_payment"
0,"L011,2016,1,812,2,서울,,10대,55000,1.44,1.13,7000..."
1,"L011,2016,2,773,4,서울,,10대,380000,1.71,0.96,170..."
2,"L011,2016,3,771,4,서울,,10대,387000,1.75,0.95,190..."
3,"L011,2016,4,770,4,서울,,10대,412000,1.76,0.92,200..."
4,"L011,2016,5,771,4,서울,,10대,414000,1.75,0.94,170..."
5,"L011,2016,6,771,4,서울,,10대,410000,1.75,0.96,190..."
6,"L011,2016,7,772,4,서울,,10대,407000,1.75,0.97,180..."
7,"L011,2016,8,772,4,서울,,10대,403000,1.75,0.98,170..."
8,"L011,2016,9,772,4,서울,,10대,399000,1.74,0.99,190..."
9,"L011,2016,10,773,4,서울,,10대,395000,1.73,0.98,18..."


1. 연령대별 월 카드 이용 총 금액

2. 지역별 월 카드 이용 총 금액

3. 30대의 총 대출 금액은 얼마인가요?

4. 30대의 월 카드 이용 금액을 그래프로 나타내 보세요.