In [36]:
# 패키지 임포트
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [37]:
# Load the raw Seoul crime data from seoul-crime.csv, read as UTF-8.
seoul_crime = pd.read_csv("data-files/seoul-crime.csv", encoding="utf-8")

In [38]:
# Basic preprocessing, step 1: rename the columns.
# The new names are paired with the existing columns by position, so the
# order of this list must follow the column order of the loaded frame.
new_column_names = [
    "자치구", "발생합계", "검거합계",
    "살인발생", "살인검거",
    "강도발생", "강도검거",
    "강간발생", "강간검거",
    "절도발생", "절도검거",
    "폭력발생", "폭력검거",
]
column_map = dict(zip(seoul_crime.columns, new_column_names))
seoul_crime.rename(columns=column_map, inplace=True)

In [39]:
# Promote the 자치구 column to the row index (modifies seoul_crime in place).
seoul_crime.set_index("자치구", inplace=True)

In [40]:
# Preview the first rows after renaming the columns and setting the index.
seoul_crime.head()

Unnamed: 0_level_0,발생합계,검거합계,살인발생,살인검거,강도발생,강도검거,강간발생,강간검거,절도발생,절도검거,폭력발생,폭력검거
자치구,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
소계,80445,58012,124,112,122,116,4911,4446,33531,17950,41757,35388
종로구,2712,2755,6,3,4,4,156,757,1079,743,1467,1248
중구,2861,2072,6,6,6,6,161,97,1279,739,1409,1224
용산구,2381,1659,3,2,1,1,141,103,945,484,1291,1069
성동구,2112,1510,3,4,4,1,110,82,905,497,1090,926


In [41]:
# Add one arrest-rate column per crime type (살인, 강도, 강간, 절도, 폭력):
#   <crime>검거율 = <crime>검거 / <crime>발생 * 100
# Single-column equivalent, for reference:
#   seoul_crime["살인검거율"] = seoul_crime["살인검거"] / seoul_crime["살인발생"] * 100
# NOTE(review): the displayed output contains rates above 100 (e.g. 485.25641
# for 강간검거율 in 종로구) - confirm whether the source counts are correct or
# whether the rates should be capped before they are scaled later on.
for crime_type in ("살인", "강도", "강간", "절도", "폭력"):
    arrests = seoul_crime[crime_type + "검거"]
    occurrences = seoul_crime[crime_type + "발생"]
    seoul_crime[crime_type + "검거율"] = arrests / occurrences * 100

In [42]:
# Preview the first rows with the newly added arrest-rate columns.
seoul_crime.head()

Unnamed: 0_level_0,발생합계,검거합계,살인발생,살인검거,강도발생,강도검거,강간발생,강간검거,절도발생,절도검거,폭력발생,폭력검거,살인검거율,강도검거율,강간검거율,절도검거율,폭력검거율
자치구,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
소계,80445,58012,124,112,122,116,4911,4446,33531,17950,41757,35388,90.322581,95.081967,90.53146,53.532552,84.747467
종로구,2712,2755,6,3,4,4,156,757,1079,743,1467,1248,50.0,100.0,485.25641,68.860056,85.071575
중구,2861,2072,6,6,6,6,161,97,1279,739,1409,1224,100.0,100.0,60.248447,57.779515,86.870121
용산구,2381,1659,3,2,1,1,141,103,945,484,1291,1069,66.666667,100.0,73.049645,51.216931,82.804028
성동구,2112,1510,3,4,4,1,110,82,905,497,1090,926,133.333333,25.0,74.545455,54.917127,84.954128


In [43]:
# Drop the raw arrest-count columns (the ones named <crime>검거); the
# arrest-rate columns derived from them are kept.
arrest_count_columns = [
    crime_type + "검거" for crime_type in ("살인", "강도", "강간", "절도", "폭력")
]
seoul_crime.drop(columns=arrest_count_columns, inplace=True)

In [44]:
# Remove the 소계 (subtotal) row so that only the individual district rows
# remain in the frame.
seoul_crime.drop(index="소계", inplace=True)

In [45]:
# Preview the first rows after dropping the arrest-count columns and the 소계 row.
seoul_crime.head()

Unnamed: 0_level_0,발생합계,검거합계,살인발생,강도발생,강간발생,절도발생,폭력발생,살인검거율,강도검거율,강간검거율,절도검거율,폭력검거율
자치구,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
종로구,2712,2755,6,4,156,1079,1467,50.0,100.0,485.25641,68.860056,85.071575
중구,2861,2072,6,6,161,1279,1409,100.0,100.0,60.248447,57.779515,86.870121
용산구,2381,1659,3,1,141,945,1291,66.666667,100.0,73.049645,51.216931,82.804028
성동구,2112,1510,3,4,110,905,1090,133.333333,25.0,74.545455,54.917127,84.954128
광진구,3087,2202,5,4,208,1414,1456,60.0,100.0,77.403846,53.323904,87.912088


In [46]:
# Strip the "발생" suffix from the per-crime occurrence columns
# (e.g. 살인발생 -> 살인).
# Columns are picked by suffix instead of by position, and exactly the suffix
# is removed instead of keeping the first two characters, so the rename does
# not depend on column order or on the length of the crime name.
# 발생합계 is left untouched because it does not end with "발생".
occurrence_suffix = "발생"
rename_map = {
    c: c[:-len(occurrence_suffix)]
    for c in seoul_crime.columns
    if c.endswith(occurrence_suffix)
}
seoul_crime.rename(columns=rename_map, inplace=True)

In [47]:
# Preview the first rows after shortening the occurrence column names.
seoul_crime.head()

Unnamed: 0_level_0,발생합계,검거합계,살인,강도,강간,절도,폭력,살인검거율,강도검거율,강간검거율,절도검거율,폭력검거율
자치구,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
종로구,2712,2755,6,4,156,1079,1467,50.0,100.0,485.25641,68.860056,85.071575
중구,2861,2072,6,6,161,1279,1409,100.0,100.0,60.248447,57.779515,86.870121
용산구,2381,1659,3,1,141,945,1291,66.666667,100.0,73.049645,51.216931,82.804028
성동구,2112,1510,3,4,110,905,1090,133.333333,25.0,74.545455,54.917127,84.954128
광진구,3087,2202,5,4,208,1414,1456,60.0,100.0,77.403846,53.323904,87.912088


In [48]:
# Rank the districts by the value of a crime-count column.
def top_n_crime_gu(crime="발생합계", order="asc", n=5):
    """Return the first ``n`` rows of ``seoul_crime`` sorted by ``crime``.

    Args:
        crime: Name of the column to sort by.
        order: ``'asc'`` sorts ascending; any other value sorts descending.
        n: Number of rows to keep from the start of the sorted frame.

    Returns:
        A DataFrame holding the first ``n`` sorted rows with every column.

    Note:
        Reads the module-level ``seoul_crime`` frame. With the default
        ``order='asc'`` the smallest values come first; pass
        ``order='desc'`` to get the districts with the largest values.
    """
    ascending = order == 'asc'
    ranked = seoul_crime.sort_values(by=crime, ascending=ascending)
    return ranked.head(n)

In [49]:
# Districts sorted by the 강도 column in descending order (default n=5 rows).
top_n_crime_gu(crime="강도", order="desc")

Unnamed: 0_level_0,발생합계,검거합계,살인,강도,강간,절도,폭력,살인검거율,강도검거율,강간검거율,절도검거율,폭력검거율
자치구,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
강남구,6146,4397,12,25,578,2372,3159,91.666667,92.0,78.546713,48.903879,87.021209
영등포구,4179,2777,8,11,279,1712,2169,100.0,100.0,70.250896,48.53972,79.806362
송파구,4714,3266,3,7,247,2024,2433,100.0,85.714286,87.449393,49.160079,84.093711
동대문구,2959,2113,3,7,169,1240,1540,133.333333,114.285714,86.982249,53.145161,84.090909
관악구,4444,3134,8,7,321,1860,2248,50.0,85.714286,83.800623,49.946237,85.676157


In [50]:
# Rank the districts by the arrest rate of a crime type.
def top_n_arrest_gu(crime="살인", order="asc", n=5):
    """Return the first ``n`` rows of ``seoul_crime`` sorted by an arrest rate.

    Args:
        crime: Crime name; the sort column is ``crime + "검거율"``.
        order: ``'asc'`` sorts ascending; any other value sorts descending.
        n: Number of rows to keep from the start of the sorted frame.

    Returns:
        A DataFrame holding the first ``n`` sorted rows with every column.

    Note:
        Reads the module-level ``seoul_crime`` frame. With the default
        ``order='asc'`` the lowest rates come first; pass ``order='desc'``
        to get the districts with the highest arrest rates.
    """
    rate_column = crime + "검거율"
    ascending = order == 'asc'
    ranked = seoul_crime.sort_values(by=rate_column, ascending=ascending)
    return ranked.head(n)

In [51]:
# Districts sorted by the 강도검거율 column in descending order (default n=5 rows).
top_n_arrest_gu(crime='강도', order='desc')

Unnamed: 0_level_0,발생합계,검거합계,살인,강도,강간,절도,폭력,살인검거율,강도검거율,강간검거율,절도검거율,폭력검거율
자치구,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
금천구,2439,1741,7,3,134,1013,1282,85.714286,133.333333,76.865672,51.332675,86.427457
중랑구,3210,2405,4,6,141,1399,1660,100.0,116.666667,82.978723,59.68549,86.86747
동대문구,2959,2113,3,7,169,1240,1540,133.333333,114.285714,86.982249,53.145161,84.090909
종로구,2712,2755,6,4,156,1079,1467,50.0,100.0,485.25641,68.860056,85.071575
중구,2861,2072,6,6,161,1279,1409,100.0,100.0,60.248447,57.779515,86.870121


In [52]:
# Column-wise minimum and maximum, displayed as a (min, max) tuple of Series.
seoul_crime.min(axis=0), seoul_crime.max(axis=0)

(발생합계     1860.000000
 검거합계     1294.000000
 살인          1.000000
 강도          1.000000
 강간         72.000000
 절도        772.000000
 폭력       1012.000000
 살인검거율      50.000000
 강도검거율      25.000000
 강간검거율      52.222222
 절도검거율      45.974717
 폭력검거율      79.673512
 dtype: float64,
 발생합계     6146.000000
 검거합계     4397.000000
 살인         12.000000
 강도         25.000000
 강간        578.000000
 절도       2372.000000
 폭력       3159.000000
 살인검거율     150.000000
 강도검거율     133.333333
 강간검거율     485.256410
 절도검거율      69.111969
 폭력검거율      91.754051
 dtype: float64)

In [54]:
# Rescale the values of every column into the 0 ~ 1 range.
#
# Manual per-column alternative, kept for reference (it overwrites
# seoul_crime in place instead of building a new frame):
#   for col in seoul_crime.columns:
#       minv = seoul_crime[col].min()
#       maxv = seoul_crime[col].max()
#       seoul_crime[col] = (seoul_crime[col] - minv) / (maxv - minv)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
transformed_values = scaler.fit_transform(seoul_crime)  # returns a NumPy array
# Optional check of the resulting range:
#   np.min(transformed_values, axis=0), np.max(transformed_values, axis=0)

# Wrap the array in a DataFrame again, reusing the original row and column labels.
scaled_seoul_crime = pd.DataFrame(
    transformed_values,
    index=seoul_crime.index,
    columns=seoul_crime.columns,
)

In [55]:
# Summary statistics of the scaled frame; min and max of each column show the scaled range.
scaled_seoul_crime.describe()

Unnamed: 0,발생합계,검거합계,살인,강도,강간,절도,폭력,살인검거율,강도검거율,강간검거율,절도검거율,폭력검거율
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,0.316799,0.330802,0.36,0.161667,0.245929,0.355775,0.306605,0.449397,0.637925,0.096337,0.354887,0.417306
std,0.222214,0.215534,0.241209,0.201025,0.208324,0.243544,0.223996,0.245251,0.204873,0.189362,0.258612,0.27516
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.135091,0.172414,0.181818,0.041667,0.126482,0.159375,0.172799,0.277778,0.692308,0.051551,0.137672,0.228194
50%,0.286281,0.324847,0.272727,0.125,0.175889,0.368125,0.248719,0.5,0.692308,0.062554,0.309909,0.437118
75%,0.391974,0.37641,0.454545,0.208333,0.343874,0.46125,0.413135,0.5,0.692308,0.074674,0.510207,0.595719
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [58]:
# Write the scaled frame (including its index) to a UTF-8 CSV file.
scaled_seoul_crime.to_csv("data-files/processed-seoul-crime.csv", encoding="utf-8")