In [5]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_moons, make_circles

import math
import statistics
from collections import Counter

In [6]:
table = pd.read_csv('../practice_data/data_pregnant.csv')

### 전체 데이터 개수: 833개

In [7]:
len(table)

833

In [8]:
table['line'].sort_values().unique()

array([ 1,  2,  3,  5,  6,  7,  9, 10, 11])

In [9]:
table['date'].sort_values().unique()

array(['2019. 7. 31.', '2019. 8. 1.', '2019. 8. 10.', '2019. 8. 11.',
       '2019. 8. 12.', '2019. 8. 13.', '2019. 8. 14.', '2019. 8. 15.',
       '2019. 8. 16.', '2019. 8. 2.', '2019. 8. 3.', '2019. 8. 5.',
       '2019. 8. 7.', '2019. 8. 8.', '2019. 8. 9.'], dtype=object)

### 임산부석에 사람이 앉아있는 경우의 수와 비율: 576, 69.15%
### 임산부석이 비어있는 경우의 수와 비율: 257, 30.85%

In [10]:
table['is_empty'].value_counts()

0    576
1    257
Name: is_empty, dtype: int64

In [11]:
len(table[table['is_empty']==0]) / len(table) * 100

69.14765906362545

In [12]:
len(table[table['is_empty']==1]) / len(table) * 100

30.85234093637455

In [13]:
table.head(1)

Unnamed: 0,date,time,line,start,stop,crowdedness,E,N,F,old_age_seat,...,cellphone,thinking,sleep,earphone,talking,calling,reading,makeup,eating,writing
0,2019. 7. 31.,19:10:00,5,공덕,여의도,N,0,1,0,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
table.columns

Index(['date', 'time', 'line', 'start', 'stop', 'crowdedness', 'E', 'N', 'F',
       'old_age_seat', 'is_empty', 'empty_count', 'female', 'male', 'age',
       'child', 'teenage', 'youth', 'middle_age', 'old_age', 'badge',
       'pregnant', 'heavy', 'behavior1', 'behavior2', 'behavior3', 'cellphone',
       'thinking', 'sleep', 'earphone', 'talking', 'calling', 'reading',
       'makeup', 'eating', 'writing'],
      dtype='object')

#### 약 70%의 확률로 임산부석에 누군가가 앉아있다. 이 사람들이 전부 임산부일까?

### 임산부석 착석자가 임산부인 경우의 수와 비율: 26, 4.51% 

In [15]:
full = table[table['is_empty']==0]
print(len(full[full['pregnant']==1]))
print(len(full[full['pregnant']==1]) / len(full) * 100)

26
4.513888888888888


#### 심각한 수치임... 그럼 대체 어떤 사람들이 앉는지 좀 보자.

In [16]:
# 성별
print('여성이 앉은 경우의 수:', len(full[full['female']==1]))
print('여성이 앉은 비율:', (len(full[full['female']==1])/len(full)*100))
print('남성이 앉은 경우의 수:', len(full[full['male']==1]))
print('남성이 앉은 비율:', (len(full[full['male']==1])/len(full)*100))

여성이 앉은 경우의 수: 448
여성이 앉은 비율: 77.77777777777779
남성이 앉은 경우의 수: 128
남성이 앉은 비율: 22.22222222222222


In [17]:
# 연령대 (경우의 수)
full.groupby('age')['date'].count()

age
노년     100
어린이     11
중년     231
청년     228
청소년      6
Name: date, dtype: int64

In [18]:
# 연령대 (비율)
full.groupby('age')['date'].count().apply(lambda x: x / len(full) * 100)

age
노년     17.361111
어린이     1.909722
중년     40.104167
청년     39.583333
청소년     1.041667
Name: date, dtype: float64

In [19]:
# 무거운 짐을 갖고 있는 경우의 수
full['heavy'].value_counts()

0.0    383
1.0    192
Name: heavy, dtype: int64

In [20]:
# 무거운 짐을 갖고 있는 비율
full['heavy'].value_counts().apply(lambda x: x / len(full) * 100)

0.0    66.493056
1.0    33.333333
Name: heavy, dtype: float64

In [21]:
full.head(1)

Unnamed: 0,date,time,line,start,stop,crowdedness,E,N,F,old_age_seat,...,cellphone,thinking,sleep,earphone,talking,calling,reading,makeup,eating,writing
0,2019. 7. 31.,19:10:00,5,공덕,여의도,N,0,1,0,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# 행동
full.groupby('behavior1')['date'].size().apply(lambda x: x / len(full) *100)

behavior1
대화      5.208333
독서      1.041667
먹기      0.520833
멍      16.493056
이어폰     5.208333
잠      13.541667
통화      1.388889
필기      0.520833
핸드폰    52.083333
화장      0.520833
Name: date, dtype: float64

In [23]:
full.groupby('behavior2')['date'].size()

behavior2
대화      2
독서      1
먹기      1
멍       4
이어폰    30
잠      10
핸드폰    16
Name: date, dtype: int64

In [24]:
full.groupby('behavior3')['date'].size()

behavior3
잠    1
Name: date, dtype: int64

In [25]:
full.groupby('crowdedness')['date'].size()

crowdedness
E     73
F    266
N    237
Name: date, dtype: int64

In [26]:
table.groupby(['crowdedness','is_empty'])['date'].size()

crowdedness  is_empty
E            0            73
             1           118
F            0           266
             1            45
N            0           237
             1            94
Name: date, dtype: int64

In [27]:
table[table['E']==1].groupby('is_empty')['date'].size().apply(lambda x: x/len(table[table['E']==1])*100)

is_empty
0    38.219895
1    61.780105
Name: date, dtype: float64

In [28]:
table[table['N']==1].groupby('is_empty')['date'].size().apply(lambda x: x/len(table[table['N']==1])*100)

is_empty
0    71.601208
1    28.398792
Name: date, dtype: float64

In [29]:
table[table['F']==1].groupby('is_empty')['date'].size().apply(lambda x: x/len(table[table['F']==1])*100)

is_empty
0    85.530547
1    14.469453
Name: date, dtype: float64

노약자석 빈자리 여부와 임산부석 빈자리 여부 관계 정리
- 노약자석에 자리가 없을 경우, 21%로 임산부석에 누군가 앉아 있음
- 노약자석에 자리가 있을 경우, 38.7%로 임산부석에 누군가 앉아 있음

In [30]:
table[table['old_age_seat']==0].groupby('is_empty')['date'].size().apply(lambda x: x/len(table)*100)  

is_empty
0    30.492197
1    21.008403
Name: date, dtype: float64

In [31]:
table[table['old_age_seat']==1].groupby('is_empty')['date'].size().apply(lambda x: x/len(table)*100)  

is_empty
0    38.655462
1     9.843938
Name: date, dtype: float64

In [32]:
table[table['old_age_seat']==0].groupby('old_age')['date'].size() #여기에 퍼센테이지를 어떻게 구할까요..

old_age
0.0    206
1.0     48
Name: date, dtype: int64

## C. 호선별 열차칸 상황과 임산부석 관계	

1) 붐빔정도와 임산부석 착석 여부

2) 노약자석 빈자리 여부와 노년 이용자 착석 여부

In [33]:
table.tail()

Unnamed: 0,date,time,line,start,stop,crowdedness,E,N,F,old_age_seat,...,cellphone,thinking,sleep,earphone,talking,calling,reading,makeup,eating,writing
828,2019. 8. 16.,18:16:00,1,신길,온수,F,0,0,1,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
829,2019. 8. 16.,18:16:00,1,신길,온수,F,0,0,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
830,2019. 8. 16.,18:29:00,1,신길,온수,F,0,0,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
831,2019. 8. 16.,18:40:00,7,온수,신중동,F,0,0,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
832,2019. 8. 16.,18:40:00,7,온수,신중동,F,0,0,1,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
table.dtypes

date             object
time             object
line              int64
start            object
stop             object
crowdedness      object
E                 int64
N                 int64
F                 int64
old_age_seat      int64
is_empty          int64
empty_count     float64
female          float64
male            float64
age              object
child           float64
teenage         float64
youth           float64
middle_age      float64
old_age         float64
badge           float64
pregnant        float64
heavy           float64
behavior1        object
behavior2        object
behavior3        object
cellphone       float64
thinking        float64
sleep           float64
earphone        float64
talking         float64
calling         float64
reading         float64
makeup          float64
eating          float64
writing         float64
dtype: object

In [35]:
table['line'].sort_values().unique()

array([ 1,  2,  3,  5,  6,  7,  9, 10, 11])

In [36]:
table['line'].dtypes

dtype('int64')

### 임산부가 아닌 사람이 탄 테이블을 만든다

In [37]:
not_empty = table[table['is_empty']== 0]

In [38]:
not_pregnant = not_empty[not_empty['pregnant'] == 0]

In [39]:
not_pregnant['pregnant'].unique()

array([0.])

### C-1) 호선별 붐빔정도와 임산부석의 비임산부 착석여부

#### 1. 전체 숫자 확인

In [92]:
pd.DataFrame(not_pregnant.groupby(['line','crowdedness'])['date'].size()).rename(columns={'date':'count_people'})

Unnamed: 0_level_0,Unnamed: 1_level_0,count_people
line,crowdedness,Unnamed: 2_level_1
1,E,23
1,F,99
1,N,86
2,E,1
2,N,4
3,F,1
5,E,14
5,F,63
5,N,59
6,E,9


#### 각 호선별 붐빔정도에 따라 퍼센트를 구해줄 수 있는 함수 작성

In [93]:
def func_percentage(line):
    percentage = []
    for i in line['crowdedness'].unique():
        percentage.append(round(float(line[line['crowdedness'] == i]['count_people'])/line['count_people'].sum()*100,2))
    percentage
    return percentage

#### 라인별 착석자수로 이루어진 새로운 테이블 생성
퍼센트를 담을 list 생성

In [71]:
lines_total_counts = pd.DataFrame(not_pregnant.groupby(['line','crowdedness'])['date'].size()).reset_index().rename(columns={'date':'count_people'})
lines_total_counts

Unnamed: 0,line,crowdedness,count_people
0,1,E,23
1,1,F,99
2,1,N,86
3,2,E,1
4,2,N,4
5,3,F,1
6,5,E,14
7,5,F,63
8,5,N,59
9,6,E,9


In [85]:
lines_p_list = []

#### <1호선> 붐빔정도에 따른 임산부석 착석 확률

In [95]:
line1 = lines_total_counts[lines_total_counts['line'] == 1]
line1['people_percentage'] = func_percentage(line1)
line1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,line,crowdedness,count_people,people_percentage
0,1,E,23,11.06
1,1,F,99,47.6
2,1,N,86,41.35


#### <2호선> 붐빔정도에 따른 임산부석 착석 확률

In [96]:
line2 = lines_total_counts[lines_total_counts['line'] == 2]
line2['people_percentage'] = func_percentage(line2)
line2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,line,crowdedness,count_people,people_percentage
3,2,E,1,20.0
4,2,N,4,80.0


#### <3호선> 붐빔정도에 따른 임산부석 착석 확률

In [97]:
line3 = lines_total_counts[lines_total_counts['line'] == 3]
line3['people_percentage'] = func_percentage(line3)
line3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,line,crowdedness,count_people,people_percentage
5,3,F,1,100.0


#### <5호선> 붐빔정도에 따른 임산부석 착석 확률

In [60]:
line5 = lines_total_counts[lines_total_counts['line'] == 5]
line5['people_percentage'] = func_percentage(line5)
line5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,line,crowdedness,count_people,people_percentage
6,5,E,14,10.29
7,5,F,63,46.32
8,5,N,59,43.38


#### <6호선> 붐빔정도에 따른 임산부석 착석 확률

In [61]:
line6 = lines_total_counts[lines_total_counts['line'] == 6]
func_percentage(line6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,line,crowdedness,count_people,people_percentage
9,6,E,9,11.69
10,6,F,15,19.48
11,6,N,53,68.83


#### <7호선> 붐빔정도에 따른 임산부석 착석 확률

In [62]:
line7 = lines_total_counts[lines_total_counts['line'] == 7]
func_percentage(line7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,line,crowdedness,count_people,people_percentage
12,7,E,12,23.53
13,7,F,24,47.06
14,7,N,15,29.41


#### <9호선> 붐빔정도에 따른 임산부석 착석 확률

In [63]:
line9 = lines_total_counts[lines_total_counts['line'] == 9]
func_percentage(line9)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,line,crowdedness,count_people,people_percentage
15,9,E,1,12.5
16,9,F,5,62.5
17,9,N,2,25.0


#### <분당선> 붐빔정도에 따른 임산부석 착석 확률

In [64]:
line10 = lines_total_counts[lines_total_counts['line'] == 10]
func_percentage(line10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,line,crowdedness,count_people,people_percentage
18,10,E,6,9.84
19,10,F,44,72.13
20,10,N,11,18.03


#### <경의중앙선> 붐빔정도에 따른 확률 확인

In [65]:
line11 = lines_total_counts[lines_total_counts['line'] == 11]
func_percentage(line11)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,line,crowdedness,count_people,people_percentage
21,11,F,3,100.0


#### 각 호선별 퍼센테이지 컬럼 전체보기

In [90]:
for i in lines_total_counts['line'].unique():
    lines_p_list.append(func_percentage(lines_total_counts[lines_total_counts['line'] == i]))

    
lines_p_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


[   line crowdedness  count_people  people_percentage
 0     1           E            23              11.06
 1     1           F            99              47.60
 2     1           N            86              41.35,
    line crowdedness  count_people  people_percentage
 3     2           E             1               20.0
 4     2           N             4               80.0,
    line crowdedness  count_people  people_percentage
 5     3           F             1              100.0,
    line crowdedness  count_people  people_percentage
 6     5           E            14              10.29
 7     5           F            63              46.32
 8     5           N            59              43.38,
     line crowdedness  count_people  people_percentage
 9      6           E             9              11.69
 10     6           F            15              19.48
 11     6           N            53              68.83,
     line crowdedness  count_people  people_percentage
 12     7         

In [70]:
lines_status_sizes = pd.DataFrame(not_pregnant.groupby(['line'])['crowdedness'].count())
lines_status_sizes

Unnamed: 0_level_0,crowdedness
line,Unnamed: 1_level_1
1,208
2,5
3,1
5,136
6,77
7,51
9,8
10,61
11,3


#### 3. 호선별 비임산부 착석 수 퍼센테이지

In [71]:
total_set_lc['line']

NameError: name 'total_set_lc' is not defined