In [50]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import tqdm

In [10]:
def average_angle_ignore_nan(degrees):
    """
    주어진 각도의 리스트에서 NaN 값을 무시하고 평균을 계산합니다.
    
    :param degrees: 각도의 리스트 (0-360도)
    :return: 평균 각도 (0-360도)
    """
    # NaN 값을 무시하고 유효한 각도만 선택
    valid_degrees = [deg for deg in degrees if not pd.isna(deg)]
    
    if not valid_degrees:
        return np.nan  # 유효한 각도가 없는 경우 NaN 반환
    
    # 유효한 각도를 라디안으로 변환
    radians = np.deg2rad(valid_degrees)
    
    # x, y 좌표 계산
    x_coords = np.cos(radians)
    y_coords = np.sin(radians)
    
    # x, y 좌표의 평균 계산
    x_mean = np.mean(x_coords)
    y_mean = np.mean(y_coords)
    
    # 평균 좌표를 각도로 변환
    mean_rad = np.arctan2(y_mean, x_mean)
    mean_deg = np.rad2deg(mean_rad)
    
    # 결과를 0-360도 사이의 값으로 변환
    mean_deg = mean_deg % 360
    
    return mean_deg

In [11]:
def horizontal_average(dataframe, datetime, column, threshold = 3, digits = 1):
    '''
    동일한 시간의 여러 지점 데이터를 확인한 후 해당 데이터들의 평균값을 반환하는 함수
    threshold 값을 입력하여 허용하는 NaN 값의 최대치를 설정할 수 있음
    '''  
    # 입력된 일시와 컬럼명에 해당하는 데이터 변수화
    data = dataframe.loc[datetime, column]
    
    # 임시로 nan값 입력
    mean = np.nan
    
    # data에 NaN값의 수가 threshold를 초과하는지 확인
    if data.isna().sum() <= threshold:
        # nan값을 제외한 값으로만 리스트 생성
        value_list = [i for i in data if not pd.isna(i)]
        
        # 평균값 산출 이후 반올림
        mean = np.mean(value_list)
        mean = round(mean, digits)
               
    return mean

In [12]:
# 연속된 True 값을 계산하는 함수
def calculate_consecutive_trues(series):
    n = len(series)
    result = [0] * n
    count = 0

    # 첫 번째 패스: 연속된 True의 그룹 길이를 계산
    for i in range(n):
        if series[i]:
            count += 1
        else:
            count = 0
        result[i] = count

    # 두 번째 패스: 그룹의 마지막 True 값에 그룹의 전체 길이를 설정
    final_result = [0] * n
    i = 0
    while i < n:
        if result[i] > 0:
            length = result[i]
            for j in range(length):
                final_result[i - j] = length
            i += length
        else:
            i += 1

    return final_result

In [90]:
cn_df = pd.read_csv("C:/Users/ITSC/Documents/2015~2024_중국기상데이터_전처리.csv")

In [91]:
cn_df.head()

Unnamed: 0.1,Unnamed: 0,지점,지점명,일시,습도,16 방위 풍향,풍속,강수량,현지기압,해면기압,변화량 기압,이슬점 온도,기온
0,0,59211.0,BAISE,2015-01-01 00:00,94.1,36.0,1.0,0.0,1005.5,1026.9,2.1,7.5,8.4
1,1,59211.0,BAISE,2015-01-01 03:00,70.2,14.0,1.0,0.0,1004.2,1028.3,1.5,10.1,15.5
2,2,59211.0,BAISE,2015-01-01 06:00,39.1,9.0,2.0,0.0,1002.9,1024.0,4.1,6.4,20.8
3,3,59211.0,BAISE,2015-01-01 09:00,37.0,14.0,2.0,0.0,1002.55,1022.3,1.7,5.5,20.7
4,4,59211.0,BAISE,2015-01-01 12:00,64.9,0.0,0.0,0.0,1002.2,1023.6,1.0,7.8,14.3


In [92]:
cn_df = cn_df.drop("Unnamed: 0", axis = 1)

In [93]:
keep_list = ['HAMI', '우한', 'YINCHUAN', 'MINQIN', '지난', 'YU ZHONG', 'YUSHU', 'YAN AN', '치치하르', 'JARUD QI', '우루무치', '베이징',
 '칭따오', '쑤조우', '난징', '상하이', 'DACHEN DAO', '푸조우', '광조우']

In [94]:
cn_df = cn_df[cn_df["지점명"].isin(keep_list)]

In [95]:
cn_df = cn_df.drop("지점명", axis = 1)

In [96]:
cn_df["일시"] = cn_df["일시"].map(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M"))

In [98]:
cn_df["지점"].value_counts().sort_values()

지점
54511.0    24576
58666.0    24715
58027.0    24729
58362.0    24770
58847.0    24774
58238.0    24777
59287.0    24779
54857.0    24785
51463.0    24792
54026.0    24792
50745.0    24794
53845.0    24795
56029.0    24796
52983.0    24798
54823.0    24799
52681.0    24801
53614.0    24804
57494.0    24807
52203.0    24814
Name: count, dtype: int64

In [52]:
cn_pivot = cn_df.pivot(index = "일시", columns = "지점")

In [53]:
cn_pivot.shape

(24968, 171)

In [54]:
cn_pivot.head()

Unnamed: 0_level_0,습도,습도,습도,습도,습도,습도,습도,습도,습도,습도,...,기온,기온,기온,기온,기온,기온,기온,기온,기온,기온
지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,...,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 00:00:00,78.8,87.0,75.1,58.6,62.3,60.8,58.1,47.9,34.0,38.1,...,-5.9,-3.6,-16.5,-3.5,-4.3,-3.2,-0.3,3.7,9.6,9.5
2015-01-01 03:00:00,54.1,83.0,58.0,44.5,34.1,40.2,33.9,37.1,18.0,16.0,...,1.0,-1.5,-7.9,6.3,3.0,3.4,3.2,4.5,11.5,16.7
2015-01-01 06:00:00,44.9,74.0,35.0,28.1,25.2,28.1,21.9,33.9,13.0,12.0,...,3.5,0.4,1.3,8.8,5.3,5.5,4.4,5.9,12.3,20.3
2015-01-01 09:00:00,65.0,75.1,31.1,25.1,19.1,22.1,22.0,33.9,16.0,14.0,...,1.8,-0.7,5.1,7.7,3.5,4.2,3.1,5.4,10.3,17.8
2015-01-01 12:00:00,56.2,83.7,47.2,32.2,39.1,36.9,31.0,38.0,21.1,17.9,...,1.4,-1.9,-3.4,-1.2,-0.8,-0.4,1.1,5.1,8.2,10.6


In [55]:
time_list = pd.date_range(start = "2015-01-01 00:00", end = "2024-05-19 21:00", freq = "3H")

In [56]:
time_df = pd.DataFrame(time_list)

In [57]:
time_df.columns = ["일시"]

In [58]:
time_df = time_df.set_index("일시")

In [59]:
time_df

2015-01-01 00:00:00
2015-01-01 03:00:00
2015-01-01 06:00:00
2015-01-01 09:00:00
2015-01-01 12:00:00
...
2024-05-19 09:00:00
2024-05-19 12:00:00
2024-05-19 15:00:00
2024-05-19 18:00:00
2024-05-19 21:00:00


In [62]:
for column in cn_pivot.columns:
    time_df[column] = np.nan

  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
  time_df[column] = np.nan
 

In [65]:
for time in cn_pivot.index:
    time_df.loc[time, :] = cn_pivot.loc[time, :]

In [102]:
na_count = [time_df.loc[i, :].isna().sum() for i in time_df.index]

In [104]:
na_count.index(171)

248

In [103]:
pd.Series(na_count).value_counts()

0      24112
171     2448
9        408
18       103
36        46
27        36
54        35
81        34
45        31
162       28
90        18
135       18
72        18
99        16
63        14
117       14
153       12
144       10
126       10
108        5
Name: count, dtype: int64

In [73]:
na_28 = [time_df.loc[i, :].isna().sum() > 28 for i in time_df.index]
na_57 = [time_df.loc[i, :].isna().sum() > 57 for i in time_df.index]

In [74]:
na_consec28 = calculate_consecutive_trues(na_28)
na_consec57 = calculate_consecutive_trues(na_57)

In [78]:
pd.Series(na_consec28).value_counts()

0       25204
1024     1024
1         656
128       256
2          64
16         64
32         64
4          52
8          32
Name: count, dtype: int64

In [99]:
na_consec.index(1024)

19604

In [105]:
time_df.iloc[248:268, :]

Unnamed: 0_level_0,"(습도, 50745.0)","(습도, 51463.0)","(습도, 52203.0)","(습도, 52681.0)","(습도, 52983.0)","(습도, 53614.0)","(습도, 53845.0)","(습도, 54026.0)","(습도, 54511.0)","(습도, 54823.0)",...,"(기온, 54823.0)","(기온, 54857.0)","(기온, 56029.0)","(기온, 57494.0)","(기온, 58027.0)","(기온, 58238.0)","(기온, 58362.0)","(기온, 58666.0)","(기온, 58847.0)","(기온, 59287.0)"
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2015-02-01 03:00:00,,,,,,,,,,,...,,,,,,,,,,
2015-02-01 06:00:00,,,,,,,,,,,...,,,,,,,,,,
2015-02-01 09:00:00,,,,,,,,,,,...,,,,,,,,,,
2015-02-01 12:00:00,,,,,,,,,,,...,,,,,,,,,,
2015-02-01 15:00:00,,,,,,,,,,,...,,,,,,,,,,
2015-02-01 18:00:00,,,,,,,,,,,...,,,,,,,,,,
2015-02-01 21:00:00,,,,,,,,,,,...,,,,,,,,,,
2015-02-02 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2015-02-02 03:00:00,,,,,,,,,,,...,,,,,,,,,,
