## 판다스 통계요약

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [2]:
df.describe() # 숫자형 컬럼에 대해서만 통계량이 나옴

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [3]:
df.describe(include='object') # 객체형에 대해서 (숫자형이 아닌 컬럼) 나옴 

Unnamed: 0,sex,embarked,who,embark_town,alive
count,891,889,891,889,891
unique,2,3,3,3,2
top,male,S,man,Southampton,no
freq,577,644,537,644,549


- count

In [4]:
df.count() # 컬럼별 개수가 나옴 / defalt: axis = 0 / null 값은 카운트 안됌

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [5]:
df['age'].count()

714

In [6]:
# count 함수는 notnull 인 값들의 개수
df['age'].notnull().sum()

714

- mean(평균)

In [8]:
df.mean() # defalt: axis = 0

  df.mean() # defalt: axis = 0


survived       0.383838
pclass         2.308642
age           29.699118
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [9]:
df['age'].mean()

29.69911764705882

In [11]:
# 성인 남자의 나이 평균
df.loc[df['adult_male'] == True, 'age'].mean()

33.17312348668281

In [12]:
# skipna 파라미터의 디폴트 값이 트루 ( 널값 제외하고 계산 )
df.mean(numeric_only=True, skipna=False)

survived       0.383838
pclass         2.308642
age                 NaN
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [13]:
# 참고: 결측치를 0으로 채우고 평균 구하기
df['age'].fillna(0).mean()

23.79929292929293

In [17]:
#아이 who  나이의 평균
df.loc[df['who'] == 'child', 'age'].mean()

6.369518072289156

- median (중앙값, 제 2 사분위 수)

In [18]:
df.median() # df.median(numeric_only = True)

  df.median()


survived       0.0000
pclass         3.0000
age           28.0000
sibsp          0.0000
parch          0.0000
fare          14.4542
adult_male     1.0000
alone          1.0000
dtype: float64

In [20]:
# 원소의 개수가 홀수일 때
pd.Series([1,2,3,4,5]).median()

3.0

In [23]:
# 원소의 개수가 짝수일 때
pd.Series([1,2,3,4,5,6]).median() # 개수가 짝수이면 (3+4)/2 가운데 두 개의 값의 평균이랑 같다

3.5

In [24]:
print(df['age'].mean())
print(df['age'].median())
# 평균과 중앙값은 항상 동일하지 않다.

29.69911764705882
28.0


- sum

In [25]:
# 여러 컬럼을 기준으로 계산
df.sum()

  df.sum()


survived                                                    342
pclass                                                     2057
sex           malefemalefemalefemalemalemalemalemalefemalefe...
age                                                    21205.17
sibsp                                                       466
parch                                                       340
fare                                                 28693.9493
who           manwomanwomanwomanmanmanmanchildwomanchildchil...
adult_male                                                  537
alive         noyesyesyesnonononoyesyesyesyesnononoyesnoyesn...
alone                                                       537
dtype: object

In [26]:
# 두 컬럼의 합계 계산
df.loc[:, ['survived', 'alone']].sum()

survived    342
alone       537
dtype: int64

In [27]:
# 성인 남자의 생존자 수와 혼자 있었던 사람 수
df.loc[df['who'] == 'man', ['survived', 'alone']].sum()

survived     88
alone       410
dtype: int64

In [28]:
# 성인 여자의 생존자 수와 혼자 있었던 사람 수
df.loc[df['who'] == 'woman', ['survived', 'alone']].sum()

survived    205
alone       121
dtype: int64

In [29]:
# 아이의 생존자 수와 혼자 있었던 사람 수
df.loc[df['who'] == 'child', ['survived', 'alone']].sum()

survived    49
alone        6
dtype: int64

- cumcum (누적 합)

In [30]:
# 계속 앞에 있는 값을 더함 => 누적합
df['survived'].cumsum()

0        0
1        1
2        2
3        3
4        3
      ... 
886    340
887    341
888    341
889    342
890    342
Name: survived, Length: 891, dtype: int64

In [32]:
pd.concat([df['survived'], df['survived'].cumsum()], axis=1)

Unnamed: 0,survived,survived.1
0,0,0
1,1,1
2,1,2
3,1,3
4,0,3
...,...,...
886,0,340
887,1,341
888,0,341
889,1,342


- 분산

df['']

In [33]:
df['fare'].var()

2469.436845743117

In [34]:
# 분산 계산 식
((df['fare'] - df['fare'].mean()) ** 2).sum() / (df['fare'].count() - 1)

2469.436845743116

- std (표준편차)

In [35]:
df['fare'].std()

49.693428597180905

In [37]:
# 루트 계산
np.sqrt(df['fare'].var())

49.693428597180905

- min, max

In [38]:
df['age'].min()

0.42

In [39]:
df['age'].max()

80.0

In [40]:
df.loc[:, ['age', 'fare']].min()

age     0.42
fare    0.00
dtype: float64

In [41]:
df.loc[:, ['age', 'fare']].max()

age      80.0000
fare    512.3292
dtype: float64

- agg ( 여러 통계값 한 번에 계산)

In [42]:
df['age'].agg(['min', 'median', 'mean', 'max'])

min        0.420000
median    28.000000
mean      29.699118
max       80.000000
Name: age, dtype: float64

In [43]:
df[['age', 'fare']].agg(['count','min', 'median', 'mean', 'max', 'std'])

Unnamed: 0,age,fare
count,714.0,891.0
min,0.42,0.0
median,28.0,14.4542
mean,29.699118,32.204208
max,80.0,512.3292
std,14.526497,49.693429


In [44]:
# 성인 남자에 대해서만 계산
df.loc[df['who'] == 'man', ['age', 'fare']].agg(['count','min', 'median', 'mean', 'max', 'std'])

Unnamed: 0,age,fare
count,413.0,537.0
min,16.0,0.0
median,30.0,9.5
mean,33.173123,24.864182
max,80.0,512.3292
std,12.906908,44.021339


- quantile (0.25: 1 사분위 수, 0.5: 2 사분위 수, 0.75: 3 사분위 수)

In [45]:
df['age'].quantile(0)

0.42

In [46]:
df['age'].quantile(0.5)

28.0

In [48]:
df['age'].median()

28.0

In [47]:
df['age'].quantile(0.75)

38.0

In [49]:
# IQR
iqr_age = abs(df['age'].quantile(0.25) - df['age'].quantile(0.75)) # abs 함수 절대값 
iqr_age

17.875

In [50]:
# 3 사분위 + 1.5 IQR
upper_bound_age = df['age'].quantile(0.75) + 1.5 * iqr_age
upper_bound_age

64.8125

In [52]:
lower_bound_age = df['age'].quantile(0.25) - 1.5 * iqr_age
lower_bound_age # 0보다 작은 값이라 무의미 하긴 하다

-6.6875

In [56]:
# 문제: age 컬럼에서 아웃라이어 값 출력
df.loc[(df['age'] > upper_bound_age) | (df['age'] < lower_bound_age) , 'age']

33     66.0
54     65.0
96     71.0
116    70.5
280    65.0
456    65.0
493    71.0
630    80.0
672    70.0
745    70.0
851    74.0
Name: age, dtype: float64

In [59]:
# fare 아웃라이어
iqr_fare = abs(df['fare'].quantile(0.25) - df['fare'].quantile(0.75))
upper_bound_age = df['fare'].quantile(0.75) + 1.5 * iqr_fare
lower_bound_age = df['fare'].quantile(0.25) - 1.5 * iqr_fare
df.loc[(df['fare'] > upper_bound_age) | (df['fare'] < lower_bound_age) , 'fare']

1       71.2833
27     263.0000
31     146.5208
34      82.1708
52      76.7292
         ...   
846     69.5500
849     89.1042
856    164.8667
863     69.5500
879     83.1583
Name: fare, Length: 116, dtype: float64

- unique

In [70]:
df['who'].unique()

array(['man', 'woman', 'child'], dtype=object)

In [75]:
df['who'].nunique()

3

- 최빈값

In [71]:
# 최빈값: 가장 빈도수가 높은값
df['who'].mode()

0    man
Name: who, dtype: object

In [72]:
df['deck'].mode()

0    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

- 상관관계
* corr이 0.3 정도면 어느정도 상관관계가 있다고 보고 0.7 정도 되면 상관관계가 크다고 본다.

In [73]:
df.corr()

  df.corr()


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,-0.55708,-0.203367
pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,0.094035,0.135207
age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,0.280328,0.19827
sibsp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,-0.253586,-0.584471
parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,-0.349943,-0.583398
fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,-0.182024,-0.271832
adult_male,-0.55708,0.094035,0.280328,-0.253586,-0.349943,-0.182024,1.0,0.404744
alone,-0.203367,0.135207,0.19827,-0.584471,-0.583398,-0.271832,0.404744,1.0


In [76]:
df['survived'].corr(df['adult_male']) # 음의 상관관계

-0.5570800422053258

In [78]:
df.corr()['survived'] # pclass가 안좋아질 수록 생존확률이 적음, 남성일 수록 생존확률이 적음

  df.corr()['survived']


survived      1.000000
pclass       -0.338481
age          -0.077221
sibsp        -0.035322
parch         0.081629
fare          0.257307
adult_male   -0.557080
alone        -0.203367
Name: survived, dtype: float64

In [83]:
df.corr().columns

  df.corr().columns


Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male',
       'alone'],
      dtype='object')

In [101]:
# 강사님
df_corr = df.corr()

for col in df_corr.columns:
    print(col)
    print(df_corr[col][(df_corr[col] > 0.3) & (df_corr[col] < 0.9999)])
    print('-'*50)

survived
Series([], Name: survived, dtype: float64)
--------------------------------------------------
pclass
Series([], Name: pclass, dtype: float64)
--------------------------------------------------
age
Series([], Name: age, dtype: float64)
--------------------------------------------------
sibsp
parch    0.414838
Name: sibsp, dtype: float64
--------------------------------------------------
parch
sibsp    0.414838
Name: parch, dtype: float64
--------------------------------------------------
fare
Series([], Name: fare, dtype: float64)
--------------------------------------------------
adult_male
alone    0.404744
Name: adult_male, dtype: float64
--------------------------------------------------
alone
adult_male    0.404744
Name: alone, dtype: float64
--------------------------------------------------


  df_corr = df.corr()


In [None]:
# 내가 한거 
cols = list(df.corr().columns)
for col1 in cols:
    print(col1)
    for col2 in cols:
        if df.corr()[col1][col2] == 1:
            continue
        elif df.corr()[col1][col2] > 0.3:

            print(round(df.corr()[col1][col2], 6))
        else:
            continue
    print('-------------------------')

In [104]:
# 음의 상관관계
df_corr = df.corr()

for col in df_corr.columns:
    print(col)
    print(df_corr.loc[df_corr[col] < -0.3, col])
    print('-'*50)

survived
pclass       -0.338481
adult_male   -0.557080
Name: survived, dtype: float64
--------------------------------------------------
pclass
survived   -0.338481
age        -0.369226
fare       -0.549500
Name: pclass, dtype: float64
--------------------------------------------------
age
pclass   -0.369226
sibsp    -0.308247
Name: age, dtype: float64
--------------------------------------------------
sibsp
age     -0.308247
alone   -0.584471
Name: sibsp, dtype: float64
--------------------------------------------------
parch
adult_male   -0.349943
alone        -0.583398
Name: parch, dtype: float64
--------------------------------------------------
fare
pclass   -0.5495
Name: fare, dtype: float64
--------------------------------------------------
adult_male
survived   -0.557080
parch      -0.349943
Name: adult_male, dtype: float64
--------------------------------------------------
alone
sibsp   -0.584471
parch   -0.583398
Name: alone, dtype: float64
-----------------------------------

  df_corr = df.corr()


- groupby

In [105]:
# who 컬럼을 기준으로 그룹바이 -> 평균
df.groupby('who').mean()

  df.groupby('who').mean()


Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
child,0.590361,2.626506,6.369518,1.73494,1.26506,32.785795,0.0,0.072289
man,0.163873,2.372439,33.173123,0.296089,0.1527,24.864182,1.0,0.763501
woman,0.756458,2.084871,32.0,0.601476,0.564576,46.570711,0.0,0.446494


In [107]:
df.groupby('who')['survived'].mean()

who
child    0.590361
man      0.163873
woman    0.756458
Name: survived, dtype: float64

In [108]:
df.groupby('who').mean()['survived'] # 위의 코드와 동일하지만 위에 코드가 더 빠르다.

  df.groupby('who').mean()['survived']


who
child    0.590361
man      0.163873
woman    0.756458
Name: survived, dtype: float64

In [109]:
df.groupby('who')['survived', 'age', 'pclass'].mean()

  df.groupby('who')['survived', 'age', 'pclass'].mean()


Unnamed: 0_level_0,survived,age,pclass
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
child,0.590361,6.369518,2.626506
man,0.163873,33.173123,2.372439
woman,0.756458,32.0,2.084871


In [111]:
df.groupby(['who','pclass']).mean()

  df.groupby(['who','pclass']).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,survived,age,sibsp,parch,fare,adult_male,alone
who,pclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
child,1,0.833333,7.82,0.666667,1.833333,139.382633,0.0,0.0
child,2,1.0,4.543684,0.789474,1.263158,28.323905,0.0,0.0
child,3,0.431034,6.817586,2.155172,1.206897,23.22019,0.0,0.103448
man,1,0.352941,42.382653,0.302521,0.235294,65.951086,1.0,0.630252
man,2,0.080808,33.588889,0.292929,0.131313,19.054124,1.0,0.727273
man,3,0.119122,28.995556,0.294671,0.128527,11.340213,1.0,0.824451
woman,1,0.978022,35.5,0.549451,0.417582,104.317995,0.0,0.373626
woman,2,0.909091,32.179688,0.454545,0.5,20.868624,0.0,0.484848
woman,3,0.491228,27.854167,0.72807,0.719298,15.354351,0.0,0.482456


In [113]:
df.groupby(['who','pclass'])[['survived']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
who,pclass,Unnamed: 2_level_1
child,1,0.833333
child,2,1.0
child,3,0.431034
man,1,0.352941
man,2,0.080808
man,3,0.119122
woman,1,0.978022
woman,2,0.909091
woman,3,0.491228


- pivot table

In [114]:
# 행 기준(index): who, values: survived, agg: mean(디폴트)
df.pivot_table(index='who', values='survived')
# == df.groupby('who')['survived'].mean()

Unnamed: 0_level_0,survived
who,Unnamed: 1_level_1
child,0.590361
man,0.163873
woman,0.756458


In [115]:
# 열 기준(columns): who, values: survived, agg: mean(디폴트)
df.pivot_table(columns='who', values='survived')

who,child,man,woman
survived,0.590361,0.163873,0.756458


In [116]:
df.pivot_table(index='who', columns='pclass', values='survived')

pclass,1,2,3
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
child,0.833333,1.0,0.431034
man,0.352941,0.080808,0.119122
woman,0.978022,0.909091,0.491228


In [117]:
df.pivot_table(index=['who','pclass'], values='survived')

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
who,pclass,Unnamed: 2_level_1
child,1,0.833333
child,2,1.0
child,3,0.431034
man,1,0.352941
man,2,0.080808
man,3,0.119122
woman,1,0.978022
woman,2,0.909091
woman,3,0.491228


In [120]:
# 다중 통계 적용
df.pivot_table(index='who', columns='pclass', values='survived', aggfunc=['mean', 'sum'])

Unnamed: 0_level_0,mean,mean,mean,sum,sum,sum
pclass,1,2,3,1,2,3
who,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
child,0.833333,1.0,0.431034,5,19,25
man,0.352941,0.080808,0.119122,42,8,38
woman,0.978022,0.909091,0.491228,89,60,56


In [121]:
# 다중 통계 적용
df.pivot_table(index='who', columns='pclass', values='survived', aggfunc=['mean', 'sum', 'count'])

# 아이이면서 3등급 칸에 탄 58 명 중 25 명이 살았다

Unnamed: 0_level_0,mean,mean,mean,sum,sum,sum,count,count,count
pclass,1,2,3,1,2,3,1,2,3
who,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
child,0.833333,1.0,0.431034,5,19,25,6,19,58
man,0.352941,0.080808,0.119122,42,8,38,119,99,319
woman,0.978022,0.909091,0.491228,89,60,56,91,66,114
