#### 데이터프레임 그룹 분석

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

##### pivot 메서드

In [2]:
# City (도시) population (인구) by year (연도); one tuple per observation.
# Years are kept as strings, populations as integers.
columns = ["도시", "연도", "인구"]
records = [
    ("서울", "2015", 9904312),
    ("서울", "2010", 9631482),
    ("서울", "2005", 9762546),
    ("부산", "2015", 3448737),
    ("부산", "2010", 3393191),
    ("부산", "2005", 3512547),
    ("인천", "2015", 2890451),
    ("인천", "2010", 2632035),
]

# Transpose the row records into the column-oriented mapping handed to pandas.
data = {label: list(values) for label, values in zip(columns, zip(*records))}

df1 = pd.DataFrame(data=data, columns=columns)
print(df1)

   도시    연도       인구
0  서울  2015  9904312
1  서울  2010  9631482
2  서울  2005  9762546
3  부산  2015  3448737
4  부산  2010  3393191
5  부산  2005  3512547
6  인천  2015  2890451
7  인천  2010  2632035


In [3]:
# Inspect the column labels of df1.
df1.columns

Index(['도시', '연도', '인구'], dtype='object')

In [4]:
def cityToMetro(c):
    """Return the region label for the city name ``c``.

    "서울" and "인천" map to "수도권"; "부산" maps to "경상권".
    Any other value yields ``None``.
    """
    if c in ("서울", "인천"):
        return "수도권"
    if c in ("부산",):
        return "경상권"
    return None

In [5]:
# Derive the region (지역) column from the city (도시) column.
# Only one column is needed, so map it element-wise instead of running
# DataFrame.apply(axis=1), which builds a full row Series for every record.
df1["지역"] = df1["도시"].map(cityToMetro)

In [6]:
# Display df1, which now carries the added region (지역) column.
df1

Unnamed: 0,도시,연도,인구,지역
0,서울,2015,9904312,수도권
1,서울,2010,9631482,수도권
2,서울,2005,9762546,수도권
3,부산,2015,3448737,경상권
4,부산,2010,3393191,경상권
5,부산,2005,3512547,경상권
6,인천,2015,2890451,수도권
7,인천,2010,2632035,수도권


In [7]:
# pivot arguments: row index, column index, cell values.
# City/year pairs that are absent from df1 show up as NaN in the result.
df1.pivot(
    index='도시',
    columns='연도',
    values='인구',
)

연도,2005,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
부산,3512547.0,3393191.0,3448737.0
서울,9762546.0,9631482.0,9904312.0
인천,,2632035.0,2890451.0


In [8]:
# Same reshape, but with a two-level row index: region (지역), then city (도시).
df1.pivot(
    index=['지역', '도시'],
    columns='연도',
    values='인구',
)

Unnamed: 0_level_0,연도,2005,2010,2015
지역,도시,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
경상권,부산,3512547.0,3393191.0,3448737.0
수도권,서울,9762546.0,9631482.0,9904312.0
수도권,인천,,2632035.0,2890451.0


##### 그룹 분석

In [11]:
# Load the iris example dataset through seaborn.
iris = sns.load_dataset('iris')
# Print the column names, non-null counts and dtypes.
iris.info()
# Display the frame itself as the cell result.
iris

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [15]:
# Mean of sepal_length, sepal_width, petal_length and petal_width per species,
# rounded to two decimals.
(
    iris
    .groupby(by='species')
    .mean()
    .round(2)
)

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.01,3.43,1.46,0.25
versicolor,5.94,2.77,4.26,1.33
virginica,6.59,2.97,5.55,2.03


In [19]:
# Mean, median and standard deviation of every attribute per species,
# rounded to two decimals.
(
    iris
    .groupby(by='species')
    .agg(func=['mean', 'median', 'std'])
    .round(2)
)

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_width,sepal_width,sepal_width,petal_length,petal_length,petal_length,petal_width,petal_width,petal_width
Unnamed: 0_level_1,mean,median,std,mean,median,std,mean,median,std,mean,median,std
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
setosa,5.01,5.0,0.35,3.43,3.4,0.38,1.46,1.5,0.17,0.25,0.2,0.11
versicolor,5.94,5.9,0.52,2.77,2.8,0.31,4.26,4.35,0.47,1.33,1.3,0.2
virginica,6.59,6.5,0.64,2.97,3.0,0.32,5.55,5.55,0.55,2.03,2.0,0.27


##### bt_df.csv 불러와서 groupby 이용해서 분석하기

In [20]:
# Location of the CSV file, given relative to the current working directory.
filepath = '../bt_df.csv'
# NOTE: this rebinds df1, replacing the city-population frame built earlier.
df1 = pd.read_csv(filepath)
# Print the column names, non-null counts and dtypes of the loaded frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          100 non-null    object 
 1   sex           100 non-null    object 
 2   age           78 non-null     float64
 3   blood         93 non-null     object 
 4   country       93 non-null     object 
 5   math          88 non-null     float64
 6   verbal        86 non-null     float64
 7   python        85 non-null     float64
 8   target        100 non-null    object 
 9   passed        100 non-null    int64  
 10  review        100 non-null    object 
 11  review_score  100 non-null    float64
dtypes: float64(5), int64(1), object(6)
memory usage: 9.5+ KB


In [22]:
# Keep a subset of the loaded columns; name, target and review are left out.
cols = [
    'sex', 'age', 'blood', 'country',
    'math', 'verbal', 'python',
    'passed', 'review_score',
]
df1 = df1[cols]

# Show the last rows of the reduced frame.
df1.tail()

Unnamed: 0,sex,age,blood,country,math,verbal,python,passed,review_score
95,male,,B,USA,83.0,,,0,6.9
96,male,71.0,A,USA,,70.0,62.25,0,1.5
97,male,23.0,A,Korea,84.0,63.0,74.51,0,4.0
98,female,34.0,O,USA,55.0,58.0,91.67,0,2.4
99,male,34.0,A,,88.0,91.0,87.75,0,3.4


In [23]:
# Mean, median and standard deviation of the math, verbal and python scores
# per country, rounded to two decimals.
(
    df1
    .groupby(by='country')[['math', 'verbal', 'python']]
    .agg(func=['mean', 'median', 'std'])
    .round(2)
)

Unnamed: 0_level_0,math,math,math,verbal,verbal,verbal,python,python,python
Unnamed: 0_level_1,mean,median,std,mean,median,std,mean,median,std
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Japan,68.92,73.0,16.95,71.42,72.5,13.55,67.6,70.59,16.18
Korea,66.42,60.0,18.83,76.0,80.0,11.61,68.24,68.38,14.07
USA,72.08,72.0,16.13,70.2,68.0,14.33,64.4,58.33,15.0


In [24]:
# Small example frame: two grouping keys (key1, key2) and two numeric columns.
df2 = pd.DataFrame(
    data={
        'key1': ["A", "A", "B", "B", "A"],
        'key2': ["one", "two", "one", "two", "one"],
        'data1': [1, 2, 3, 4, 5],
        'data2': [10, 20, 30, 40, 50],
    }
)
df2

Unnamed: 0,key1,key2,data1,data2
0,A,one,1,10
1,A,two,2,20
2,B,one,3,30
3,B,two,4,40
4,A,one,5,50


In [25]:
# Sum the numeric columns (data1, data2) within each key1 group.
# numeric_only=True keeps the string column key2 out of the result; without it,
# pandas >= 2.0 would concatenate the key2 strings instead of dropping them.
df2.groupby(by='key1').sum(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,8,80
B,7,70


In [26]:
# Sum of data2 within each key1 group; selecting one column yields a Series.
(
    df2
    .groupby(by='key1')['data2']
    .sum()
)

key1
A    80
B    70
Name: data2, dtype: int64

In [27]:
# Same aggregation, starting from the data2 Series and grouping it by the
# key1 Series.
df2['data2'].groupby(df2['key1']).sum()

key1
A    80
B    70
Name: data2, dtype: int64