#### 데이터프레임 그룹 분석

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

##### pivot 메서드

In [2]:
# Population by city and census year. Years are stored as strings.
columns = ["도시", "연도", "인구"]
data = dict(
    zip(
        columns,
        (
            ["서울", "서울", "서울", "부산", "부산", "부산", "인천", "인천"],
            ["2015", "2010", "2005", "2015", "2010", "2005", "2015", "2010"],
            [9904312, 9631482, 9762546, 3448737, 3393191, 3512547, 2890451, 2632035],
        ),
    )
)

# Passing columns explicitly fixes the column order of the frame.
df1 = pd.DataFrame(data, columns=columns)
print(df1)

   도시    연도       인구
0  서울  2015  9904312
1  서울  2010  9631482
2  서울  2005  9762546
3  부산  2015  3448737
4  부산  2010  3393191
5  부산  2005  3512547
6  인천  2015  2890451
7  인천  2010  2632035


In [3]:
# Show the column labels of df1.
df1.columns

Index(['도시', '연도', '인구'], dtype='object')

In [4]:
def cityToMetro(c):
    """Return the region label for the city name ``c``.

    "서울" and "인천" map to "수도권" and "부산" maps to "경상권".
    Any other value yields ``None``.
    """
    capital_area = ("서울", "인천")
    gyeongsang_area = ("부산",)

    if c in capital_area:
        return "수도권"
    if c in gyeongsang_area:
        return "경상권"
    # Unknown cities have no region assigned.
    return None

In [5]:
# Derive each row's region from its city name. Series.map calls cityToMetro
# once per value of the single column it needs, instead of materialising a
# row object for every record through DataFrame.apply(axis=1).
df1["지역"] = df1["도시"].map(cityToMetro)

In [6]:
# Display df1, which now carries the 지역 column.
df1

Unnamed: 0,도시,연도,인구,지역
0,서울,2015,9904312,수도권
1,서울,2010,9631482,수도권
2,서울,2005,9762546,수도권
3,부산,2015,3448737,경상권
4,부산,2010,3393191,경상권
5,부산,2005,3512547,경상권
6,인천,2015,2890451,수도권
7,인천,2010,2632035,수도권


In [7]:
# Arguments: row index, column index, cell values.
# City/year pairs missing from df1 (e.g. 인천 in 2005) show up as NaN.
df1.pivot(index='도시', columns='연도', values='인구')

연도,2005,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
부산,3512547.0,3393191.0,3448737.0
서울,9762546.0,9631482.0,9904312.0
인천,,2632035.0,2890451.0


In [8]:
# Same pivot, but with a two-level row index (지역, 도시).
df1.pivot(index=['지역','도시'], columns='연도', values='인구')

Unnamed: 0_level_0,연도,2005,2010,2015
지역,도시,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
경상권,부산,3512547.0,3393191.0,3448737.0
수도권,서울,9762546.0,9631482.0,9904312.0
수도권,인천,,2632035.0,2890451.0


##### 그룹 분석

In [9]:
# Load the iris dataset through seaborn, print its column summary, then
# display the frame itself as the cell output.
iris = sns.load_dataset('iris')
iris.info()
iris

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [10]:
# Per-species mean of sepal_length, sepal_width, petal_length and petal_width,
# rounded to two decimals.
iris.groupby(by='species').mean().round(2)

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.01,3.43,1.46,0.25
versicolor,5.94,2.77,4.26,1.33
virginica,6.59,2.97,5.55,2.03


In [11]:
# Per-species mean, median and standard deviation of every attribute,
# rounded to two decimals.
iris.groupby(by='species').agg(func=['mean', 'median', 'std']).round(2)

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_width,sepal_width,sepal_width,petal_length,petal_length,petal_length,petal_width,petal_width,petal_width
Unnamed: 0_level_1,mean,median,std,mean,median,std,mean,median,std,mean,median,std
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
setosa,5.01,5.0,0.35,3.43,3.4,0.38,1.46,1.5,0.17,0.25,0.2,0.11
versicolor,5.94,5.9,0.52,2.77,2.8,0.31,4.26,4.35,0.47,1.33,1.3,0.2
virginica,6.59,6.5,0.64,2.97,3.0,0.32,5.55,5.55,0.55,2.03,2.0,0.27


##### bt_df.csv 불러와서 groupby 이용해서 분석하기

In [12]:
# Path is relative to the directory the notebook runs from.
filepath = '../bt_df.csv'
# NOTE: this rebinds df1, replacing the city population frame built earlier.
df1 = pd.read_csv(filepath)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          100 non-null    object 
 1   sex           100 non-null    object 
 2   age           78 non-null     float64
 3   blood         93 non-null     object 
 4   country       93 non-null     object 
 5   math          88 non-null     float64
 6   verbal        86 non-null     float64
 7   python        85 non-null     float64
 8   target        100 non-null    object 
 9   passed        100 non-null    int64  
 10  review        100 non-null    object 
 11  review_score  100 non-null    float64
dtypes: float64(5), int64(1), object(6)
memory usage: 9.5+ KB


In [13]:
# Keep only the columns listed here; name, target and review are left out.
cols = [
    'sex', 'age', 'blood', 'country',
    'math', 'verbal', 'python',
    'passed', 'review_score',
]
df1 = df1.loc[:, cols]
df1.tail()

Unnamed: 0,sex,age,blood,country,math,verbal,python,passed,review_score
95,male,,B,USA,83.0,,,0,6.9
96,male,71.0,A,USA,,70.0,62.25,0,1.5
97,male,23.0,A,Korea,84.0,63.0,74.51,0,4.0
98,female,34.0,O,USA,55.0,58.0,91.67,0,2.4
99,male,34.0,A,,88.0,91.0,87.75,0,3.4


In [14]:
# Per-country mean, median and standard deviation of the math, verbal and
# python scores, rounded to two decimals.
df1.groupby(by='country')[['math', 'verbal', 'python']].agg(func=['mean','median','std']).round(2)

Unnamed: 0_level_0,math,math,math,verbal,verbal,verbal,python,python,python
Unnamed: 0_level_1,mean,median,std,mean,median,std,mean,median,std
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Japan,68.92,73.0,16.95,71.42,72.5,13.55,67.6,70.59,16.18
Korea,66.42,60.0,18.83,76.0,80.0,11.61,68.24,68.38,14.07
USA,72.08,72.0,16.13,70.2,68.0,14.33,64.4,58.33,15.0


In [15]:
# Small demo frame: two key columns (key1, key2) and two integer columns.
df2 = pd.DataFrame(
    data={
        'key1': list("AABBA"),
        'key2': ["one", "two"] * 2 + ["one"],
        'data1': list(range(1, 6)),
        'data2': list(range(10, 60, 10)),
    }
)
df2

Unnamed: 0,key1,key2,data1,data2
0,A,one,1,10
1,A,two,2,20
2,B,one,3,30
3,B,two,4,40
4,A,one,5,50


In [16]:
# Sum the numeric columns within each key1 group.
# numeric_only=True keeps the string column key2 out of the result; since
# pandas 2.0 the default would otherwise concatenate its strings per group.
df2.groupby(by='key1').sum(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,8,80
B,7,70


In [17]:
# Select data2 from the grouped frame before summing; the result is a Series.
df2.groupby(by='key1')['data2'].sum()

key1
A    80
B    70
Name: data2, dtype: int64

In [18]:
# Equivalent form: group the data2 Series directly by the key1 Series.
df2.data2.groupby(df2.key1).sum()

key1
A    80
B    70
Name: data2, dtype: int64

##### 내가 만든 함수 적용하기

In [19]:
def peak_to_peak_ratio(x):
    """Return the maximum of ``x`` divided by its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods. The result is
    whatever dividing those two reductions produces.
    """
    largest = x.max()
    smallest = x.min()
    return largest / smallest

In [20]:
# Pass the user-defined function to agg: max/min ratio of each column per species.
iris.groupby(by='species').agg(func=peak_to_peak_ratio).round(2)

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,1.35,1.91,1.9,6.0
versicolor,1.43,1.7,1.7,1.8
virginica,1.61,1.73,1.53,1.79


In [21]:
# Max/min ratio of the math, verbal and python scores per country.
df1.groupby(by='country')[['math','verbal','python']].agg(func=peak_to_peak_ratio).round(2)

Unnamed: 0_level_0,math,verbal,python
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2.45,1.88,2.1
Korea,2.38,1.77,2.01
USA,2.48,1.9,2.09


In [22]:
# Same max/min ratio, written inline as a lambda instead of a named function.
df1.groupby(by='country')[['math','verbal','python']].agg(func=lambda x:x.max()/x.min()).round(2)

Unnamed: 0_level_0,math,verbal,python
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2.45,1.88,2.1
Korea,2.38,1.77,2.01
USA,2.48,1.9,2.09


In [23]:
# Same max/min ratio through apply instead of agg; the recorded output matches
# the agg-based cells.
df1.groupby(by='country')[['math','verbal','python']].apply(lambda x : x.max()/x.min()).round(2)

Unnamed: 0_level_0,math,verbal,python
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2.45,1.88,2.1
Korea,2.38,1.77,2.01
USA,2.48,1.9,2.09


In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) per country.
df1.groupby(by='country').describe().round(2)

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,math,math,...,passed,passed,review_score,review_score,review_score,review_score,review_score,review_score,review_score,review_score
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Japan,22.0,24.11,10.05,2.0,20.25,25.5,28.88,49.0,24.0,68.92,...,0.0,0.0,26.0,5.72,2.08,1.0,4.22,6.05,6.95,10.0
Korea,18.0,24.83,16.74,4.0,14.25,22.0,30.5,65.0,19.0,66.42,...,1.0,1.0,23.0,4.74,2.64,0.2,3.3,4.8,6.7,9.8
USA,31.0,29.67,17.39,0.83,18.5,29.0,38.0,71.0,39.0,72.08,...,0.0,0.0,44.0,4.76,2.64,0.4,2.8,4.35,6.9,9.9


In [25]:
# Transpose the summary so countries become columns and the
# (column, statistic) pairs become rows.
df1.groupby(by='country').describe().round(2).T

Unnamed: 0,country,Japan,Korea,USA
age,count,22.0,18.0,31.0
age,mean,24.11,24.83,29.67
age,std,10.05,16.74,17.39
age,min,2.0,4.0,0.83
age,25%,20.25,14.25,18.5
age,50%,25.5,22.0,29.0
age,75%,28.88,30.5,38.0
age,max,49.0,65.0,71.0
math,count,24.0,19.0,39.0
math,mean,68.92,66.42,72.08


##### mpg 데이터셋

In [26]:
# Load the mpg dataset through seaborn and print its column summary.
mpg = sns.load_dataset('mpg')
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [40]:
# Show the first five rows.
# NOTE(review): the recorded output already includes a manufacturer column,
# which is only created in a later cell, so it looks like the notebook was
# executed out of order. Confirm before relying on the shown output.
mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,manufacturer
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,chevrolet
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,buick
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,plymouth
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,amc
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,ford


In [55]:
# Preview the model part of each name: drop the first space-separated token
# and re-join the remaining tokens.
mpg.name.apply(lambda x : ' '.join(x.split(' ')[1:]))

0      chevelle malibu
1          skylark 320
2            satellite
3            rebel sst
4               torino
            ...       
393         mustang gl
394             pickup
395            rampage
396             ranger
397               s-10
Name: name, Length: 398, dtype: object

In [67]:
# Split every name once at its first space: the text before it is the
# manufacturer and the text after it is the model. This replaces the
# per-row lambdas that split each name up to three times.
name_parts = mpg['name'].str.partition(' ')
mpg['manufacturer'] = name_parts[0]
# An empty separator column means the name had no space, so there is no
# model; those rows get NaN, as before.
mpg['model'] = name_parts[2].where(name_parts[1] != '', np.nan)

In [68]:
# Compare the original name with the derived manufacturer and model columns.
mpg[['name', 'manufacturer', 'model']]

Unnamed: 0,name,manufacturer,model
0,chevrolet chevelle malibu,chevrolet,chevelle malibu
1,buick skylark 320,buick,skylark 320
2,plymouth satellite,plymouth,satellite
3,amc rebel sst,amc,rebel sst
4,ford torino,ford,torino
...,...,...,...
393,ford mustang gl,ford,mustang gl
394,vw pickup,vw,pickup
395,dodge rampage,dodge,rampage
396,ford ranger,ford,ranger


In [69]:
# Re-check the column summary; model has fewer non-null entries than name
# because single-token names have no model.
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
 9   manufacturer  398 non-null    object 
 10  model         396 non-null    object 
dtypes: float64(4), int64(3), object(4)
memory usage: 34.3+ KB


In [70]:
def q3cut(s):
    """Bin ``s`` into three quantile-based groups labelled 소/중/대.

    The labels run from the lowest-valued bin ('소') to the highest ('대').
    The categorical result of ``pd.qcut`` is converted to plain strings
    before it is returned.
    """
    size_labels = ['소', '중', '대']
    binned = pd.qcut(x=s, q=len(size_labels), labels=size_labels)
    return binned.astype(str)

In [71]:
# Label each petal_length as 소/중/대 relative to the other rows of its species.
# group_keys=False keeps the original row index on the result. Since
# pandas 2.0 the default prepends the species key to the index, and the
# result then cannot be aligned with iris for the column assignment.
iris["petal_length_class"] = iris.groupby(by='species', group_keys=False)["petal_length"].apply(q3cut)

In [73]:
# Same per-species binning, this time using the transform method.
iris["sepal_width_class"] = iris.groupby(by='species')['sepal_width'].transform(q3cut)

In [74]:
# Show the last ten rows: each measurement next to its per-species size class.
iris[['petal_length', 'petal_length_class', 'sepal_width', 'sepal_width_class']].tail(10)

Unnamed: 0,petal_length,petal_length_class,sepal_width,sepal_width_class
140,5.6,중,3.1,대
141,5.1,소,3.1,대
142,5.1,소,2.7,소
143,5.9,대,3.2,대
144,5.7,중,3.3,대
145,5.2,소,3.0,중
146,5.0,소,2.5,소
147,5.2,소,3.0,중
148,5.4,중,3.4,대
149,5.1,소,3.0,중
