In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import re

In [2]:
pd_data = pd.read_csv('미세먼지_PM10__월별_도시별_대기오염도_20230322213853.csv', encoding='cp949', header=[1])

# '시점' '서울특별시' '부산광역시' '대전광역시' '제주특별자치도'

pd_data = pd_data.astype(str)

date = pd_data['시점']
seoul_data = pd_data['서울특별시'].str.replace(r'[*]+', repl= r'', regex=True)
busan_data = pd_data['부산광역시'].str.replace(r'[*]+', repl= r'', regex=True)
daejeon_data = pd_data['대전광역시'].str.replace(r'[*]+', repl= r'', regex=True)
jeju_data = pd_data['제주특별자치도'].str.replace(r'[*]+', repl= r'', regex=True)

pd_data = pd.concat((date, seoul_data,busan_data,daejeon_data,jeju_data), axis=1)
# print(pd_data)


np_data = pd_data.to_numpy()


np_data = np_data.astype(np.float64)

# print(np.where(np.isin(np_data, r'[^0-9]+')))
# print(np.where(np.isin(np_data, '*')))

print(np_data)

[[2011.01   44.     38.     39.     29.  ]
 [2011.02   75.     57.     63.     52.  ]
 [2011.03   65.     58.     59.     48.  ]
 [2011.04   56.     56.     52.     50.  ]
 [2011.05   72.     76.     67.     76.  ]
 [2011.06   44.     52.     43.     43.  ]
 [2011.07   28.     41.     28.     28.  ]
 [2011.08   27.     35.     25.     29.  ]
 [2011.09   29.     33.     28.     26.  ]
 [2011.1    42.     43.     42.     40.  ]
 [2011.11   38.     41.     40.     36.  ]
 [2011.12   46.     41.     43.     36.  ]
 [2012.01   60.     45.     51.     47.  ]
 [2012.02   50.     46.     45.     39.  ]
 [2012.03   46.     47.     46.     45.  ]
 [2012.04   51.     53.     49.     41.  ]
 [2012.05   52.     58.     53.     51.  ]
 [2012.06   40.     38.     37.     23.  ]
 [2012.07   28.     40.     27.     21.  ]
 [2012.08   22.     34.     19.     19.  ]
 [2012.09   27.     35.     28.     31.  ]
 [2012.1    33.     42.     33.     37.  ]
 [2012.11   42.     47.     44.     44.  ]
 [2012.12  

#### 2021년 시도별 평균 미세먼지 농도 차이 분석 --> ANOVA 사용(지역이 4개소이므로!)

In [3]:
# 평균값 구하기

seoul_dust = np.mean(np_data[:,1])
busan_dust = np.mean(np_data[:,2])
daejeon_dust = np.mean(np_data[:,3])
jeju_dust = np.mean(np_data[:,4])

print(f"서울 : {seoul_dust}\n부산 : {busan_dust}\n대전 : {daejeon_dust}\n제주 : {jeju_dust}\n")

서울 : 42.79545454545455
부산 : 41.84848484848485
대전 : 41.20454545454545
제주 : 39.70454545454545



In [4]:
# ANOVA
list_d = [np_data[:,x] for x in range(1,5)]

print(stats.f_oneway(*list_d))

# p-value > 0.05를 만족하므로 귀무가설 수용

F_onewayResult(statistic=1.4500073764107557, pvalue=0.2273752066976529)


#### 2011~2021년 봄(3~5월), 겨울(10~12월)의 평균 미세먼지 농도 차이 분석 --> TTEST 사용(그룹이 2개이므로!)

In [5]:
list_y = [float(x) for x in range(2011,2022)]
list_s = [0.03, 0.04, 0.05]
list_w = [0.1, 0.11, 0.12]


np_spring_data = np.array([])
for i in list_y :
  for k in list_s :
    filter_s = np.isin(np_data[:, 0], i+k)
    np_spring_data = np.append(np_spring_data, np_data[:,1:][filter_s])
    # print(np_data[:,1:][filter_s])

np_winter_data = np.array([])
for i in list_y :
  for k in list_w :
    filter_s = np.isin(np_data[:, 0], i+k)
    np_winter_data = np.append(np_winter_data, np_data[:,1:][filter_s])
    # print(np_data[:,1:][filter_s])

print(np_spring_data)
print(np_winter_data)


[65. 58. 59. 48. 56. 56. 52. 50. 72. 76. 67. 76. 46. 47. 46. 45. 51. 53.
 49. 41. 52. 58. 53. 51. 55. 66. 54. 56. 52. 49. 48. 46. 55. 62. 53. 58.
 60. 50. 50. 57. 58. 53. 45. 51. 63. 71. 59. 81. 71. 56. 68. 62. 45. 48.
 40. 48. 45. 56. 49. 42. 64. 52. 59. 53. 71. 60. 64. 61. 56. 53. 49. 56.
 60. 52. 54. 47. 56. 54. 58. 59. 63. 58. 63. 50. 52. 44. 52. 43. 52. 63.
 60. 56. 42. 44. 47. 38. 69. 51. 68. 56. 41. 40. 40. 43. 52. 47. 46. 47.
 45. 33. 40. 39. 44. 36. 41. 44. 35. 32. 32. 38. 67. 60. 63. 76. 42. 37.
 41. 37. 61. 36. 56. 44.]
[42. 43. 42. 40. 38. 41. 40. 36. 46. 41. 43. 36. 33. 42. 33. 37. 42. 47.
 44. 44. 41. 36. 38. 26. 29. 38. 30. 39. 42. 43. 39. 50. 55. 47. 46. 45.
 33. 39. 29. 38. 45. 47. 39. 40. 44. 42. 43. 49. 44. 43. 50. 51. 33. 33.
 37. 41. 48. 41. 49. 42. 38. 37. 36. 42. 52. 49. 51. 43. 48. 44. 47. 38.
 29. 32. 36. 30. 42. 50. 51. 45. 50. 46. 49. 37. 28. 32. 31. 31. 52. 47.
 63. 42. 43. 40. 51. 33. 31. 27. 29. 34. 40. 35. 43. 44. 42. 35. 40. 33.
 33. 30. 33. 34. 42. 31. 

In [6]:
def do_mean_comparison(g1, g2) :
  print("*******************************")
  print('g1 sharpiro : ', stats.shapiro(g1))
  print('g2 shapiro : ', stats.shapiro(g2))
  print('levene : ', stats.levene(g1, g2))
  print('ttest_ind : ', stats.ttest_ind(g1, g2))
  print("*******************************")

In [7]:
# 평균값 차이 비교 (By t-test)
do_mean_comparison(np_spring_data, np_winter_data)

*******************************
g1 sharpiro :  ShapiroResult(statistic=0.989180326461792, pvalue=0.39251869916915894)
g2 shapiro :  ShapiroResult(statistic=0.9878075122833252, pvalue=0.29399433732032776)
levene :  LeveneResult(statistic=11.01025439381632, pvalue=0.0010345437486875143)
ttest_ind :  Ttest_indResult(statistic=12.232958045607598, pvalue=1.6003353734189516e-27)
*******************************


In [8]:
def get_95ci(vals) :
  vals_mean = np.mean(vals)
  vals_std = np.std(vals)
  print('upper : ', vals_mean + 1.96*vals_std)
  print('lower : ', vals_mean - 1.96*vals_std)
  print(vals_mean - 1.96*vals_std, '~', vals_mean, '~', vals_mean + 1.96*vals_std)

In [9]:
get_95ci(np_spring_data)
print("**********************************************************")
get_95ci(np_winter_data)

upper :  72.33428120795399
lower :  33.01420364053085
33.01420364053085 ~ 52.67424242424242 ~ 72.33428120795399
**********************************************************
upper :  53.606421535552826
lower :  25.332972403841115
25.332972403841115 ~ 39.46969696969697 ~ 53.606421535552826
