# 통계관련 (최대값, 최소값, ...)

In [2]:
import pandas as pd
import seaborn as sns

# Load seaborn's 'titanic' example dataset and preview the first rows.
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## describe()

In [3]:
df.describe()
# Summary statistics at a glance; only the numeric columns are reported

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# Summarize the object-dtype (string) columns instead: count, unique, top, freq
df.describe(include='object')

Unnamed: 0,sex,embarked,who,embark_town,alive
count,891,889,891,889,891
unique,2,3,3,3,2
top,male,S,man,Southampton,no
freq,577,644,537,644,549


### count()

In [6]:
df.count()
# Missing values (null / NaN) are left out of the count

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [7]:
# Number of non-missing values in the 'age' column
df['age'].count()

np.int64(714)

### mean()
- 평균을 계산해준다

In [10]:
df['age'].mean() # the column must contain only numeric values

np.float64(29.69911764705882)

In [11]:
# The frame also holds text columns, so ask for the mean of the numeric columns only
df.mean(numeric_only=True)

survived       0.383838
pclass         2.308642
age           29.699118
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

- 참고: describe()로도 숫자형 컬럼의 평균(mean)은 확인할 수 있다. 다만 bool 컬럼(adult_male, alone)은 describe() 기본 출력에는 포함되지 않는다.

In [12]:
# Mean fare paid by adult male passengers.
# 'adult_male' is already a boolean column, so it serves as the row mask
# directly; comparing it with `== True` is redundant.
cond = df['adult_male']
df.loc[cond, 'fare'].mean()

np.float64(24.864181750465548)

In [23]:
# Mean age of first-class passengers (pclass == 1) whose fare is between
# 30 and 40, both bounds included
cond1 = df['fare'] <= 40
cond2 = df['fare'] >= 30
cond3 = df['pclass'] == 1
df.loc[cond1 & cond2 & cond3, 'age'].mean()

# NOTE: mean(skipna=True) excludes missing values, but it gave the same result here, so it was left out. (skipna=True is the documented pandas default.)

np.float64(44.095238095238095)

### median()
- 중앙값 : 데이터를 크기순으로 정렬했을 때 정중앙에 오는 값 (예: 데이터가 5개이면 정렬 후 3번째 값이 중앙값이다)

In [26]:
# Mean of five values; the large values (100, 1000) pull it far above the middle one
pd.Series([1, 2, 10, 100, 1000]).mean()

np.float64(222.6)

In [27]:
# Median of five values: the middle (third) value in sorted order
pd.Series([1, 2, 10, 100, 1000]).median()

np.float64(10.0)

In [29]:
pd.Series([1, 2, 10, 11, 100, 1000]).median() # with an even number of values, the mean of the two middle ones (10 and 11) is returned

np.float64(10.5)

In [33]:
# Compare the median and the mean of 'age'; both are printed on one line,
# mean first, separated by two spaces.
median_value = df['age'].median()
mean_value = df['age'].mean()
print(mean_value, median_value, sep='  ')

29.69911764705882  28.0


### sum()

In [34]:
# Column-wise totals, restricted to the numeric (and boolean) columns
df.sum(numeric_only=True)

survived        342.0000
pclass         2057.0000
age           21205.1700
sibsp           466.0000
parch           340.0000
fare          28693.9493
adult_male      537.0000
alone           537.0000
dtype: float64

In [35]:
# Total of all fares
df['fare'].sum()

np.float64(28693.9493)

In [38]:
df['fare'].cumsum() # running (cumulative) total; rarely needed. cumprod is the running-product counterpart

0          7.2500
1         78.5333
2         86.4583
3        139.5583
4        147.6083
          ...    
886    28602.7493
887    28632.7493
888    28656.1993
889    28686.1993
890    28693.9493
Name: fare, Length: 891, dtype: float64

### Var(분산)

In [39]:
# Mean fare, stored so the manual variance calculation can reuse it
fare_mean = df['fare'].mean()
fare_mean

np.float64(32.204207968574636)

In [40]:
# Display the raw 'fare' column
df['fare']

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: fare, Length: 891, dtype: float64

In [52]:
# Sample variance of 'fare' computed by hand: the sum of squared deviations
# from the mean, divided by (n - 1).
total = ((df['fare'] - fare_mean) ** 2).sum()
# Despite its name, this holds the divisor n - 1, not the number of rows.
total_count = df['fare'].count() - 1

# pandas' Series.var() uses the same n - 1 divisor by default (ddof=1).
my_var = total / total_count
print(my_var)

2469.436845743116


### std()

In [3]:
import numpy as np

In [61]:
# Standard deviation obtained as the square root of the variance
np.sqrt(df['fare'].var())

np.float64(49.6934285971809)

In [55]:
# std() returns the standard deviation directly
df['fare'].std()

np.float64(49.6934285971809)

### min(), max()

In [56]:
# Smallest value in the 'age' column
df['age'].min()

np.float64(0.42)

In [57]:
# Largest value in the 'age' column
df['age'].max()

np.float64(80.0)

## agg() aggregation

In [62]:
# Apply several aggregations to one column in a single call
df['age'].agg(['max', 'min', 'count', 'mean'])

max       80.000000
min        0.420000
count    714.000000
mean      29.699118
Name: age, dtype: float64

In [63]:
# Apply the same aggregations to several columns at once
df[['age', 'fare']].agg(['max', 'min'])

Unnamed: 0,age,fare
max,80.0,512.3292
min,0.42,0.0


## quantile()
- 몇분위에 있는가

In [64]:
df['age'].quantile(0.1) # 10% from the bottom: the age at the boundary of the youngest 10%

np.float64(14.0)

In [66]:
df['age'].quantile(0.9) # 90th percentile: the age at the boundary of the oldest 10%

np.float64(50.0)

In [67]:
# Median age
df['age'].median()

np.float64(28.0)

In [68]:
# The 0.5 quantile is the median
df['age'].quantile(0.5)

np.float64(28.0)

## unique()

In [70]:
df['who'].unique() # useful when the column is expected to hold a limited set of values

array(['man', 'woman', 'child'], dtype=object)

In [72]:
df['who'].nunique() # number of distinct values

3

## mode()
- 최빈값

In [74]:
# Most frequent value of 'who'
df['who'].mode()

0    man
Name: who, dtype: object

In [75]:
# Most frequent value of the categorical 'deck' column
df['deck'].mode()

0    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

## corr()
- 각 컬럼끼리의 상관관계를 -1 ~ 1 까지로 표현
- 1에 가까울수록 양의 상관관계가 강하다 (한 값이 커지면 다른 값도 커지는 경향)
- -1에 가까울수록 음의 상관관계가 강하다 (한 값이 커지면 다른 값은 작아지는 경향)
- 0에 가까우면 선형 상관관계가 거의 없다

In [81]:
# Pairwise correlation matrix of the numeric (and boolean) columns
df.corr(numeric_only=True)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,-0.55708,-0.203367
pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,0.094035,0.135207
age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,0.280328,0.19827
sibsp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,-0.253586,-0.584471
parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,-0.349943,-0.583398
fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,-0.182024,-0.271832
adult_male,-0.55708,0.094035,0.280328,-0.253586,-0.349943,-0.182024,1.0,0.404744
alone,-0.203367,0.135207,0.19827,-0.584471,-0.583398,-0.271832,0.404744,1.0


In [4]:
# Correlation of every numeric column with 'survived'
df.corr(numeric_only=True)['survived']

survived      1.000000
pclass       -0.338481
age          -0.077221
sibsp        -0.035322
parch         0.081629
fare          0.257307
adult_male   -0.557080
alone        -0.203367
Name: survived, dtype: float64