In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_moons, make_circles

import math
import statistics
from collections import Counter

In [3]:
table = pd.read_csv('data_pregnant.csv')

# A. 임산부석은 비어있는가?

In [4]:
# 전체 데이터 개수: 833개
len(table)

834

In [5]:
table.head(1)

Unnamed: 0,date,time,line,start,stop,count_station,time.1,crowdedness,E,N,...,cellphone,thinking,sleep,earphone,talking,calling,reading,makeup,eating,writing
0,2019. 7. 31.,19:10:00,5.0,공덕,여의도,3.0,6.0,N,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### 임산부석에 사람이 앉아있는 경우의 수와 비율: 576, 69.15%
### 임산부석이 비어있는 경우의 수와 비율: 257, 30.85%

In [6]:
table['is_empty'].value_counts()

0.0    576
1.0    257
Name: is_empty, dtype: int64

In [7]:
len(table[table['is_empty']==0]) / len(table) * 100

69.06474820143885

In [8]:
len(table[table['is_empty']==1]) / len(table) * 100

30.815347721822544

### 임산부석 착석자가 임산부일 경우의 수와 비율: 26, 4.51% 

In [9]:
# full: 임산부석에 착석자가 있는 행만 뽑은 테이블
full = table[table['is_empty']==0]

print(len(full[full['pregnant']==1]))
print(len(full[full['pregnant']==1]) / len(full) * 100)

26
4.513888888888888


# 2. 임산부석 착석자 중 비임산부의 특징

In [10]:
# not_p: 임산부석 착석자가 비임산부인 행만 뽑은 테이블
not_p = full[full['pregnant']==0]

### 임산부석 착석자가 비임산부일 경우의 수와 비율: 550, 95.49%

In [11]:
len(not_p)

550

In [12]:
len(not_p)/len(full) * 100

95.48611111111111

## 1) 비임산부 착석자 성별

### 여성이 앉은 경우의 수와 비율: 422, 76.73%
### 남성이 앉은 경우의 수와 비율: 128, 23.28%

In [13]:
print(len(not_p[not_p['female']==1]))
print(len(not_p[not_p['female']==1])/len(not_p)*100)
print(len(not_p[not_p['male']==1]))
print(len(not_p[not_p['male']==1])/len(not_p)*100)

422
76.72727272727272
128
23.272727272727273


## 2) 비임산부 착석자 나이

In [14]:
not_p_age_num = not_p.groupby('age')['date'].count()
not_p_age_per =not_p_age_num.apply(lambda x: x / len(not_p) * 100).round(2)
not_p_age_df = pd.DataFrame([not_p_age_num, not_p_age_per]).T
not_p_age_df.columns = [['개수', '비율(%)']]
not_p_age_df['개수'] = not_p_age_df['개수'].astype(int)
not_p_age_df.index.name = '연령대'
not_p_age_df.reset_index()

TypeError: only integer scalar arrays can be converted to a scalar index

## 3) 비임산부 착석자가 무거운 짐을 갖고 있는가?

In [15]:
not_p_heavy_num = not_p['heavy'].value_counts()
not_p_heavy_per = not_p_heavy_num.apply(lambda x: x / len(not_p) * 100).round(2)
not_p_heavy_df = pd.DataFrame([not_p_heavy_num, not_p_heavy_per]).T
not_p_heavy_df.columns = [['개수', '비율(%)']]
not_p_heavy_df = not_p_heavy_df.set_index(pd.Series(['가볍다', '무겁다']))
not_p_heavy_df['개수'] = not_p_heavy_df['개수'].astype(int)
not_p_heavy_df.index.name = '짐'
not_p_heavy_df.reset_index()

TypeError: only integer scalar arrays can be converted to a scalar index

## 3-1) 나이와 짐 무게는 관계가 있을까?

### 연령층 별로 무거운 짐을 가지고 있는 비율

In [16]:
heavy_age_dict = {'노년':'old_age', '중년':'middle_age', '청년':'youth', '청소년':'teenage', '어린이':'child'}
for key in heavy_age_dict.keys():
    print(key, ":", round(len(not_p[(not_p[heavy_age_dict[key]]==1) & (not_p['heavy']==1)]) / len(not_p[(not_p[heavy_age_dict[key]]==1)])*100,2), "%")

노년 : 34.0 %
중년 : 33.33 %
청년 : 35.15 %
청소년 : 50.0 %
어린이 : 9.09 %


In [17]:
pd.DataFrame(not_p.groupby(['age','heavy'])['date'].size())

Unnamed: 0_level_0,Unnamed: 1_level_0,date
age,heavy,Unnamed: 2_level_1
노년,0.0,65
노년,1.0,34
어린이,0.0,10
어린이,1.0,1
중년,0.0,154
중년,1.0,77
청년,0.0,131
청년,1.0,71
청소년,0.0,3
청소년,1.0,3


## 4) 비임산부 착석자는 임산부석에 앉아서 뭘 할까?

In [18]:
# 행동 데이터가 기록되지 않은 19개의 행이 있다.
# 결측치 일단 제외
len(not_p[not_p['behavior1'].isnull()])

19

In [19]:
# 1가지 이상의 행동을 하고 있는 사람의 수: 531명
# 2가지 이상의 행동을 하고 있는 사람의 수: 59명
# 3가지 이상의 행동을 하고 있는 사람의 수: 1명
# 2가지 또는 3가지의 행동을 동시에 하고 있는 사람을 각각 2명 또는 3명으로 간주하고 60개의 행을 추가할 예정
print(len(not_p[~not_p['behavior1'].isnull()]))
print(len(not_p[~not_p['behavior2'].isnull()]))
print(len(not_p[~not_p['behavior3'].isnull()]))

531
59
1


In [20]:
not_p.columns

Index(['date', 'time', 'line', 'start', 'stop', 'count_station', 'time.1',
       'crowdedness', 'E', 'N', 'F', 'old_age_seat', 'is_empty', 'empty_count',
       'female', 'male', 'age', 'child', 'teenage', 'youth', 'middle_age',
       'old_age', 'badge', 'pregnant', 'heavy', 'behavior1', 'behavior2',
       'behavior3', 'cellphone', 'thinking', 'sleep', 'earphone', 'talking',
       'calling', 'reading', 'makeup', 'eating', 'writing'],
      dtype='object')

In [21]:
behavior1_df = not_p[~not_p['behavior1'].isnull()].loc[:,'date':'behavior1']
behavior1_df = behavior1_df.rename(columns = {'behavior1':'behavior'})
behavior2_df = not_p[~not_p['behavior2'].isnull()].loc[:,'date':'behavior2'].drop(['behavior1'], axis=1)
behavior2_df = behavior2_df.rename(columns = {'behavior2':'behavior'})
behavior3_df = not_p[~not_p['behavior3'].isnull()].loc[:,'date':'behavior3'].drop(['behavior1','behavior2'], axis=1)
behavior3_df = behavior3_df.rename(columns = {'behavior3':'behavior'})
behavior3_df

Unnamed: 0,date,time,line,start,stop,count_station,time.1,crowdedness,E,N,...,age,child,teenage,youth,middle_age,old_age,badge,pregnant,heavy,behavior
211,2019. 8. 5.,18:50:00,11.0,공덕,회기,10.0,20.0,F,0.0,0.0,...,중년,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,잠


In [22]:
behavior_df = behavior1_df.append(behavior2_df).append(behavior3_df)
behavior_df.tail()
#len(behavior_df)

Unnamed: 0,date,time,line,start,stop,count_station,time.1,crowdedness,E,N,...,age,child,teenage,youth,middle_age,old_age,badge,pregnant,heavy,behavior
778,2019. 8. 16.,9:25:00,5.0,신길,공덕,4.0,8.0,N,0.0,1.0,...,중년,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,먹기
779,2019. 8. 16.,9:25:00,5.0,신길,공덕,4.0,8.0,N,0.0,1.0,...,중년,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,이어폰
786,2019. 8. 16.,18:14:00,1.0,공덕,병점,19.0,38.0,N,0.0,1.0,...,청년,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,핸드폰
812,2019. 8. 16.,17:58:00,6.0,공덕,삼각지,2.0,4.0,N,0.0,1.0,...,중년,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,멍
211,2019. 8. 5.,18:50:00,11.0,공덕,회기,10.0,20.0,F,0.0,0.0,...,중년,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,잠


In [23]:
# 비율은 원래 전체 인원 수인 531로 나눠 구함 (total>100)
not_p_behavior_num = behavior_df.groupby('behavior')['date'].size().sort_values(ascending=False)
not_p_behavior_per = not_p_behavior_num.apply(lambda x: x / len(behavior1_df) *100).round(2).sort_values(ascending=False)
not_p_behavior_df = pd.DataFrame([not_p_behavior_num, not_p_behavior_per]).T
not_p_behavior_df.columns = [['개수', '비율(%)']]
not_p_behavior_df['개수'] = not_p_behavior_df['개수'].astype(int)
not_p_behavior_df.index.name = '행동'
not_p_behavior_df.reset_index()

TypeError: only integer scalar arrays can be converted to a scalar index

## 4-1) 나이에 따라 하는 행동이 다를까?

### 노년층의 행동

In [24]:
behavior_old_age = pd.DataFrame(behavior_df[behavior_df['old_age']==1].groupby('behavior')['date'].size()).rename(columns = {'date':'인원(명)'})
behavior_old_age['비율(%)'] = round(behavior_old_age['인원(명)']/len(behavior_df[behavior_df['old_age']==1]) * 100, 2)
behavior_old_age.index_name = '행동'
behavior_old_age.

SyntaxError: invalid syntax (<ipython-input-24-1d56a15f2395>, line 4)

### 중년층의 행동

In [25]:
behavior_middle_age = pd.DataFrame(behavior_df[behavior_df['middle_age']==1].groupby('behavior')['date'].size()).rename(columns = {'date':'인원(명)'})
behavior_middle_age['비율(%)'] = round(behavior_middle_age['인원(명)']/len(behavior_df[behavior_df['middle_age']==1]) * 100, 2)
behavior_middle_age.index_name = '행동'
behavior_middle_age.sort_values(by='인원(명)', ascending=False).reset_index()

Unnamed: 0,behavior,인원(명),비율(%)
0,핸드폰,123,50.41
1,멍,41,16.8
2,잠,37,15.16
3,이어폰,20,8.2
4,대화,14,5.74
5,독서,3,1.23
6,통화,3,1.23
7,먹기,2,0.82
8,필기,1,0.41


### 청년층의 행동

In [26]:
behavior_youth = pd.DataFrame(behavior_df[behavior_df['youth']==1].groupby('behavior')['date'].size()).rename(columns = {'date':'인원(명)'})
behavior_youth['비율(%)'] = round(behavior_youth['인원(명)']/len(behavior_df[behavior_df['youth']==1]) * 100, 2)
behavior_youth.index_name = '행동'
behavior_youth.sort_values(by='인원(명)', ascending=False).reset_index()

Unnamed: 0,behavior,인원(명),비율(%)
0,핸드폰,137,60.09
1,잠,32,14.04
2,이어폰,31,13.6
3,멍,12,5.26
4,대화,6,2.63
5,통화,3,1.32
6,화장,3,1.32
7,먹기,2,0.88
8,독서,1,0.44
9,필기,1,0.44


### 청소년의 행동

In [27]:
behavior_teenage = pd.DataFrame(behavior_df[behavior_df['teenage']==1].groupby('behavior')['date'].size()).rename(columns = {'date':'인원(명)'})
behavior_teenage['비율(%)'] = round(behavior_teenage['인원(명)']/len(behavior_df[behavior_df['teenage']==1]) * 100, 2)
behavior_teenage.index_name = '행동'
behavior_teenage.sort_values(by='인원(명)', ascending=False).reset_index()

Unnamed: 0,behavior,인원(명),비율(%)
0,핸드폰,5,83.33
1,대화,1,16.67


### 어린이의 행동

In [28]:
behavior_child = pd.DataFrame(behavior_df[behavior_df['child']==1].groupby('behavior')['date'].size()).rename(columns = {'date':'인원(명)'})
behavior_child['비율(%)'] = round(behavior_child['인원(명)']/len(behavior_df[behavior_df['child']==1]) * 100, 2)
behavior_child.index_name = '행동'
behavior_child.sort_values(by='인원(명)', ascending=False).reset_index()

Unnamed: 0,behavior,인원(명),비율(%)
0,대화,5,41.67
1,핸드폰,4,33.33
2,멍,2,16.67
3,이어폰,1,8.33


## 행동 컬럼의 missing data 채우기 : 연령대별 행동의 분포를 이용 

### 1] missing data가 있는 행의 연령대 데이터 파악
### - 총 19행 중 중년 10명, 노년 5명, 청년 4명

In [29]:
# 총 19행
behavior_missing = pd.DataFrame(not_p[not_p['behavior1'].isnull()])
len(behavior_missing)

19

In [30]:
behavior_missing['age'].value_counts()

중년    10
노년     5
청년     4
Name: age, dtype: int64

### 2] 중년층의 행동별 분포 비율 x 중년층의 행동 missing data 수 
### - 핸드폰: 5명, 멍: 2명, 잠: 2명, 이어폰: 1명

In [72]:
m_percentages = pd.DataFrame((behavior_middle_age['비율(%)']/100*10).sort_values(ascending=False))

In [76]:
m_percentages['비율(%)'] = round(m_percentages['비율(%)'])
m_percentages['비율(%)']

behavior
핸드폰    5.0
멍      2.0
잠      2.0
이어폰    1.0
대화     1.0
통화     0.0
독서     0.0
먹기     0.0
필기     0.0
Name: 비율(%), dtype: float64

In [75]:
behavior_missing_middle_age = pd.DataFrame(behavior_missing[behavior_missing['age']=='중년'])

In [33]:
behavior_missing_middle_age

Unnamed: 0,date,time,line,start,stop,count_station,time.1,crowdedness,E,N,...,cellphone,thinking,sleep,earphone,talking,calling,reading,makeup,eating,writing
6,2019. 7. 31.,18:30:00,6.0,공덕,화랑대,20.0,40.0,N,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2019. 7. 31.,18:30:00,6.0,공덕,화랑대,20.0,40.0,N,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131,2019. 8. 3.,10:32:00,5.0,왕십리,광화문,7.0,14.0,N,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132,2019. 8. 3.,10:32:00,5.0,왕십리,광화문,7.0,14.0,N,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133,2019. 8. 3.,18:06:00,5.0,광화문,왕십리,7.0,14.0,F,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135,2019. 8. 3.,18:11:00,5.0,광화문,왕십리,7.0,14.0,F,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,2019. 8. 5.,9:20:00,5.0,신길,공덕,4.0,8.0,N,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
408,2019. 8. 9.,8:50:00,6.0,화랑대,공덕,20.0,40.0,F,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
410,2019. 8. 9.,8:52:00,6.0,화랑대,공덕,20.0,40.0,F,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
761,2019. 8. 15.,12:38:00,2.0,선릉,역삼,1.0,2.0,N,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
behavior_missing_m_a_list = list(behavior_missing_middle_age.index)

In [37]:
# index list에서 값을 뽑아와서 각 평균값 에 반올림 한 값만큼 돌면서 

behavior_missing_m_a_list

[6, 9, 131, 132, 133, 135, 182, 408, 410, 761]

In [None]:
# table의 behavior column에 넣어주세요.
# 똑같이 노년층, 청년층도 해야해요.
