In [1]:
from bs4 import BeautifulSoup as bs
from numpy import nan
import seaborn as sns
import pandas as pd
import requests
from time import sleep
import matplotlib.pyplot as plt
import warnings
import folium
import datetime

#그래프 한글 오류 해결
from matplotlib import font_manager, rc

font_path = 'malgun.ttf'
font_name = font_manager.FontProperties(fname = font_path).get_name()
rc('font', family = font_name)

#경고 메시지 무시
warnings.filterwarnings("ignore")

#마이너스 부호 유니코드
plt.style.use('ggplot')
plt.rcParams['axes.unicode_minus']=False

In [2]:
titanic = sns.load_dataset('titanic')
titanic.head(2)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


In [13]:
df = titanic.loc[:,['age','sex','class','fare','survived']]
df.head(3)

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1


In [18]:
list(df.groupby('sex')) #해당 열의 값들을 종목화하여 만든다. 리스트 함수를 이용하여로 리스트로 만들수있다

# numeric_only = False(기본값) : True할 시 수치를 가진 값들만 계산하여 평균값을 도출한다.
grouped = df.groupby('class')
grouped.mean(numeric_only = True)

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,38.233441,84.154687,0.62963
Second,29.87763,20.662183,0.472826
Third,25.14062,13.67555,0.242363


In [16]:
for key, group in grouped:    #for문 변수 2개 입력방법
    print('*key:',key)
    print('*number:',len(group))
    print(group.head(3))

*key: First
*number: 216
    age     sex  class     fare  survived
1  38.0  female  First  71.2833         1
3  35.0  female  First  53.1000         1
6  54.0    male  First  51.8625         0
*key: Second
*number: 184
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
*key: Third
*number: 491
    age     sex  class   fare  survived
0  22.0    male  Third  7.250         0
2  26.0  female  Third  7.925         1
4  35.0    male  Third  8.050         0


In [20]:
mask1 = df['class'] == 'First'
mask2 = df['class'] == 'Second'
mask3 = df['class'] == 'Third'

df1 = df[mask1].mean(numeric_only = True)
df2 = df[mask2].mean(numeric_only = True)
df3 = df[mask3].mean(numeric_only = True)

pd.DataFrame({'First':df1, 'Second':df2, 'Third':df3})
pd.DataFrame([df1,df2,df3], index = ['First','Second','Third'])

Unnamed: 0,First,Second,Third
age,38.233441,29.87763,25.14062
fare,84.154687,20.662183,13.67555
survived,0.62963,0.472826,0.242363


In [22]:
grouped2 = df.groupby(['class','sex'])
for key, group in grouped2:
    print('*key:',key)
    print('*number:',len(group))
    print(group.head(3))

*key: ('First', 'female')
*number: 94
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
11  58.0  female  First  26.5500         1
*key: ('First', 'male')
*number: 122
     age   sex  class      fare  survived
6   54.0  male  First   51.8625         0
23  28.0  male  First   35.5000         1
27  19.0  male  First  263.0000         0
*key: ('Second', 'female')
*number: 76
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
41  27.0  female  Second  21.0000         0
*key: ('Second', 'male')
*number: 108
     age   sex   class  fare  survived
17   NaN  male  Second  13.0         1
20  35.0  male  Second  26.0         0
21  34.0  male  Second  13.0         1
*key: ('Third', 'female')
*number: 144
     age     sex  class     fare  survived
2   26.0  female  Third   7.9250         1
8   27.0  female  Third  11.1333         1
10   4.0

In [24]:
grouped2.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,female,34.611765,106.125798,0.968085
First,male,41.281386,67.226127,0.368852
Second,female,28.722973,21.970121,0.921053
Second,male,30.740707,19.741782,0.157407
Third,female,21.75,16.11881,0.5
Third,male,26.507589,12.661633,0.135447


In [26]:
grouped = df.groupby(['class'])
grouped.std(numeric_only=True)  #std 표준편차 집계

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,14.802856,78.380373,0.484026
Second,14.001077,13.417399,0.500623
Third,12.495398,11.778142,0.428949


In [27]:
grouped[['age','fare']].std()

Unnamed: 0_level_0,age,fare
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,14.802856,78.380373
Second,14.001077,13.417399
Third,12.495398,11.778142


In [35]:
def min_max(x):
    return x.max() - x.min()

grouped[['age','fare']].agg(min_max)  
#agg 함수 : 여러개의 함수를 제공해준다.
# agg({'컬럼':'함수', '컬럼':'함수'})
# agg({'컬럼':[함수1, 함수2], '컬럼':[함수1, 함수2]})

Unnamed: 0_level_0,age,fare
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,79.08,512.3292
Second,69.33,73.5
Third,73.58,69.55


In [36]:
grouped[['age','fare']].agg([max,min,min_max])

Unnamed: 0_level_0,age,age,age,fare,fare,fare
Unnamed: 0_level_1,max,min,min_max,max,min,min_max
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
First,80.0,0.92,79.08,512.3292,0.0,512.3292
Second,70.0,0.67,69.33,73.5,0.0,73.5
Third,74.0,0.42,73.58,69.55,0.0,69.55


In [38]:
grouped[['age','fare']].agg({'age':['mean','std'],'fare':['min','max']})

Unnamed: 0_level_0,age,age,fare,fare
Unnamed: 0_level_1,mean,std,min,max
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
First,38.233441,14.802856,0.0,512.3292
Second,29.87763,14.001077,0.0,73.5
Third,25.14062,12.495398,0.0,69.55


In [49]:
grouped = df.groupby('class')
for key, group in grouped:
    print(key, len(group))

    
#그룹핑객체.filter(조건함수) : 조건에 부합되는 데이터 프레임들을 결합하여 1개의 데이터 프레임

grouped_filter = grouped.filter(lambda df: len(df) > 200)
grouped_filter

First 216
Second 184
Third 491


Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.2500,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.9250,1
3,35.0,female,First,53.1000,1
4,35.0,male,Third,8.0500,0
...,...,...,...,...,...
885,39.0,female,Third,29.1250,0
887,19.0,female,First,30.0000,1
888,,female,Third,23.4500,0
889,26.0,male,First,30.0000,1


In [50]:
#filter(함수)는 데이터프레임 객체를 전달받아 True / False로 반환해야한다.
def over_200(df):
    return len(df) > 200

grouped.filter(over_200)

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.2500,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.9250,1
3,35.0,female,First,53.1000,1
4,35.0,male,Third,8.0500,0
...,...,...,...,...,...
885,39.0,female,Third,29.1250,0
887,19.0,female,First,30.0000,1
888,,female,Third,23.4500,0
889,26.0,male,First,30.0000,1


In [53]:
grouped.apply(lambda df: df.describe())

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,count,186.0,216.0,216.0
First,mean,38.233441,84.154687,0.62963
First,std,14.802856,78.380373,0.484026
First,min,0.92,0.0,0.0
First,25%,27.0,30.92395,0.0
First,50%,37.0,60.2875,1.0
First,75%,49.0,93.5,1.0
First,max,80.0,512.3292,1.0
Second,count,173.0,184.0,184.0
Second,mean,29.87763,20.662183,0.472826


In [56]:
print(grouped['age'].mean())

grouped.filter(lambda df: df['age'].mean() < 30)

class
First     38.233441
Second    29.877630
Third     25.140620
Name: age, dtype: float64


Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.2500,0
2,26.0,female,Third,7.9250,1
4,35.0,male,Third,8.0500,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.0750,0
...,...,...,...,...,...
884,25.0,male,Third,7.0500,0
885,39.0,female,Third,29.1250,0
886,27.0,male,Second,13.0000,0
888,,female,Third,23.4500,0


In [63]:
age_filter = grouped.apply(lambda df: df['age'].mean() < 30)

df_list = []
for x in age_filter.index:
    if age_filter[x] == True:
        age_filter_df = grouped.get_group(x)
        #print('*key:', x)
        #print(age_filter_df.head(3))
        df_list.append(age_filter_df)
        #print()
        
pd.concat(df_list)

*key: Second
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1

*key: Third
    age     sex  class   fare  survived
0  22.0    male  Third  7.250         0
2  26.0  female  Third  7.925         1
4  35.0    male  Third  8.050         0



Unnamed: 0,age,sex,class,fare,survived
9,14.0,female,Second,30.0708,1
15,55.0,female,Second,16.0000,1
17,,male,Second,13.0000,1
20,35.0,male,Second,26.0000,0
21,34.0,male,Second,13.0000,1
...,...,...,...,...,...
882,22.0,female,Third,10.5167,0
884,25.0,male,Third,7.0500,0
885,39.0,female,Third,29.1250,0
888,,female,Third,23.4500,0


In [64]:
titanic2 = titanic.copy()
titanic2

print(len(titanic2) - titanic2.count())
print(titanic2.isnull().sum())

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [75]:
who_mean_age = titanic2.groupby('who')['age'].mean()




mask = titanic2['who'] == 'child'
df = titanic2[mask]
df['age'].fillna( who_mean_age['child'])

mask = titanic2['who'] == 'child'
df = titanic2[mask]
df['age'].fillna(who_mean_age['child'])

7       2.00
9      14.00
10      4.00
14     14.00
16      2.00
       ...  
831     0.83
850     4.00
852     9.00
869     4.00
875    15.00
Name: age, Length: 83, dtype: float64

In [85]:
df_list = []
for who in who_mean_age.index:
    mask = titanic2['who'] == who
    df = titanic2[mask]
    df['age'].fillna( who_mean_age[who])
    df_list.append(df)
dfs = pd.concat(df_list)
dfs.sort_index(inplace = True)
ori_age = titanic2['age']
dfs.insert(4, 'ori_age', ori_age)
dfs[20:50]

Unnamed: 0,survived,pclass,sex,age,ori_age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
20,0,2,male,35.0,35.0,0,0,26.0,S,Second,man,True,,Southampton,no,True
21,1,2,male,34.0,34.0,0,0,13.0,S,Second,man,True,D,Southampton,yes,True
22,1,3,female,15.0,15.0,0,0,8.0292,Q,Third,child,False,,Queenstown,yes,True
23,1,1,male,28.0,28.0,0,0,35.5,S,First,man,True,A,Southampton,yes,True
24,0,3,female,8.0,8.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
25,1,3,female,38.0,38.0,1,5,31.3875,S,Third,woman,False,,Southampton,yes,False
26,0,3,male,,,0,0,7.225,C,Third,man,True,,Cherbourg,no,True
27,0,1,male,19.0,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
28,1,3,female,,,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
29,0,3,male,,,0,0,7.8958,S,Third,man,True,,Southampton,no,True


In [81]:
titanic2[20:50]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
20,0,2,male,35.0,0,0,26.0,S,Second,man,True,,Southampton,no,True
21,1,2,male,34.0,0,0,13.0,S,Second,man,True,D,Southampton,yes,True
22,1,3,female,15.0,0,0,8.0292,Q,Third,child,False,,Queenstown,yes,True
23,1,1,male,28.0,0,0,35.5,S,First,man,True,A,Southampton,yes,True
24,0,3,female,8.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
25,1,3,female,38.0,1,5,31.3875,S,Third,woman,False,,Southampton,yes,False
26,0,3,male,,0,0,7.225,C,Third,man,True,,Cherbourg,no,True
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
28,1,3,female,,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
29,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
