In [21]:
from prophet import Prophet
import pandas as pd
from sklearn.metrics import r2_score

class ProphetPredictor:
    def __init__(self, periods):
        self.periods = periods
        self.models = {}
        self.r2_scores = {}

    def fit(self, data):
        self.data_columns = data.columns
        for column in data.columns:
            df = data[[column]].reset_index()
            df.columns = ['ds', 'y']
            model = Prophet()
            model.fit(df)
            self.models[column] = model

    def predict(self):
        future_predictions = pd.DataFrame()
        
        for column, model in self.models.items():
            future = model.make_future_dataframe(periods=self.periods, freq='M')
            forecast = model.predict(future)
            future_values = forecast[['ds', 'yhat']].tail(self.periods).set_index('ds')
            future_predictions[column] = future_values['yhat']
        
        return future_predictions

    def evaluate(self, data):
        for column, model in self.models.items():
            df = data[[column]].reset_index()
            df.columns = ['ds', 'y']
            forecast = model.predict(df)
            
            y_true = df['y']
            y_pred = forecast['yhat']
            
            valid_index = y_true.notna() & y_pred.notna()
            y_true = y_true[valid_index]
            y_pred = y_pred[valid_index]
            
            if len(y_true) > 0:
                self.r2_scores[column] = r2_score(y_true, y_pred)
            else:
                self.r2_scores[column] = None  
        return self.r2_scores


In [6]:
df = pd.read_csv(r"C:\Users\choiy\OneDrive\바탕 화면\마스터_물질별식중독_A04_찐막.csv")

df_region = pd.read_csv(r"C:\Users\choiy\Downloads\Foodborne_Region_MasterTable.csv")
df_cause = pd.read_excel(r"C:\Users\choiy\Downloads\Foodborne_Cause_MasterTable4.xlsx")

In [4]:
# df = df[['OCCRNC_YEAR', 'OCCRNC_MM','CPI_전국','avg_temp', 'avg_high_temp', 'avg_low_temp',
#        'avg_precip', 'avg_windspeed', 'max_windspeed', 'avg_pressure',
#        'max_pressure', 'min_pressure', 'avg_sea_pressure', 'max_sea_pressure',
#        'min_sea_pressure', 'avg_relative_humidity', 'min_relative_humidity',
#        'sum_sunshine_hour', 'global_horizontal_irradiance', '인구', '초등학생인구',
#        '중학생인구', '고등학생인구', '60세이상인구수', '학생수총계']]

In [7]:
def date(row):
    if int(row['OCCRNC_MM']) <10:
        date = str(row['OCCRNC_YEAR']).split('.')[0]+'-0'+str(row['OCCRNC_MM']).split('.')[0]
        
    else:
        date = str(row['OCCRNC_YEAR']).split('.')[0]+'-'+str(row['OCCRNC_MM']).split('.')[0]
    
    return date
    

In [8]:
df_region['ds'] = df_region.apply(date, axis = 1)
df = df_region.set_index('ds')
# df = df.drop(columns = ['OCCRNC_YEAR', 'OCCRNC_MM'])

In [85]:
df_cause['ds'] = df_cause.apply(date, axis = 1)
df = df_cause.set_index('ds')
# df = df.drop(columns = ['OCCRNC_YEAR', 'OCCRNC_MM'])

In [9]:
df

Unnamed: 0_level_0,OCCRNC_YEAR,OCCRNC_MM,OCCRNC_REGN,OCCRNC_CNT,PATNT_CNT,OCCRNC_IND,HOL_IND,HOL_DUR,CPI_VALUE,WTHR_AVG_TEMP,...,POP_MID_CNT,POP_HIGH_CNT,POP_60P_CNT,POP_STU_PER_SCH,POP_ELM_RATIO,POP_MID_RATIO,POP_HIGH_RATIO,POP_60P_RATIO,POP_DENS,GMS_LIC_CNT
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12,2022,12,강원,0.0,0.0,0,0,0,110.730,-3.6,...,38360.00000,39568.00000,499873.0000,223.614221,0.046856,0.024966,0.025752,0.325333,91.294431,3
2022-11,2022,11,강원,0.0,0.0,0,0,0,110.480,8.2,...,38379.00000,39590.00000,498694.0000,223.614221,0.047022,0.024972,0.025760,0.324488,91.316119,2
2022-10,2022,10,강원,1.0,3.0,1,1,6,110.540,12.3,...,38419.00000,39469.00000,497248.0000,223.614221,0.047196,0.024991,0.025674,0.323447,91.344401,3
2022-09,2022,9,강원,5.0,77.0,1,1,4,110.210,19.4,...,38536.00000,39378.00000,495087.0000,223.614221,0.047382,0.025066,0.025613,0.322030,91.347788,3
2022-08,2022,8,강원,0.0,0.0,0,1,3,109.910,24.0,...,38619.00000,39186.00000,494428.0000,223.614221,0.047457,0.025097,0.025465,0.321307,91.431269,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002-05,2002,5,충북,0.0,0.0,0,0,0,68.125,16.8,...,58497.51577,62053.86564,214974.0128,,0.087676,0.039131,0.041510,0.143805,201.154615,7
2002-04,2002,4,충북,0.0,0.0,0,1,3,67.866,13.0,...,58532.11944,62258.52411,214312.8749,,0.087565,0.039146,0.041638,0.143332,201.197233,4
2002-03,2002,3,충북,1.0,25.0,1,1,3,67.351,6.9,...,58577.53256,62479.97672,213624.4780,,0.087445,0.039168,0.041777,0.142840,201.241619,6
2002-02,2002,2,충북,0.0,0.0,0,1,9,67.158,0.3,...,58626.98646,62688.71026,212998.1390,,0.087332,0.039193,0.041908,0.142393,201.282014,6


In [29]:
# data = data.drop(["OCCRNC_YEAR", 'OCCRNC_MM','OCCRNC_CNT','PATNT_CNT','OCCRNC_IND'], axis = 1)
data_reversed = data.iloc[::-1]
data_reversed

Unnamed: 0_level_0,HOL_IND,HOL_DUR,CPI_VALUE,WTHR_AVG_TEMP,WTHR_AVG_H_TEMP,WTHR_AVG_L_TEMP,WTHR_AVG_PRECIP,WTHR_AVG_WNDSPD,WTHR_MX_WNDSPD,WTHR_AVG_PRESS,...,POP_MID_CNT,POP_HIGH_CNT,POP_60P_CNT,POP_STU_PER_SCH,POP_ELM_RATIO,POP_MID_RATIO,POP_HIGH_RATIO,POP_60P_RATIO,POP_DENS,GMS_LIC_CNT
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2002-01,0,0,65.403,0.3,3.9,-2.9,37.4,2.3,7.9,1010.3,...,373486.5215,406132.6126,9.621662e+05,,0.076472,0.036410,0.039593,0.093798,16940.45895,16
2002-02,1,9,65.650,2.9,7.1,-0.8,2.4,2.0,7.4,1013.1,...,373341.7205,404184.9405,9.666336e+05,,0.076508,0.036415,0.039423,0.094284,16931.55437,24
2002-03,1,3,66.083,7.6,12.4,3.6,31.5,2.1,6.3,1006.4,...,373258.2672,402503.2587,9.706707e+05,,0.076534,0.036424,0.039278,0.094721,16923.71609,37
2002-04,1,3,66.454,13.6,18.6,8.8,155.1,2.3,11.7,1004.4,...,373218.2789,400727.2066,9.751426e+05,,0.076556,0.036438,0.039124,0.095205,16915.26449,14
2002-05,0,0,66.764,18.2,23.5,13.3,58.0,2.3,7.5,1001.7,...,373232.0157,399094.3035,9.794725e+05,,0.076571,0.036457,0.038983,0.095673,16907.31214,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08,1,3,107.650,25.7,28.9,23.1,564.8,2.3,8.6,996.8,...,218398.0000,220772.0000,2.366474e+06,596.36357,0.043838,0.023017,0.023267,0.249406,15678.00107,15
2022-09,1,4,107.780,22.4,27.2,18.2,201.5,2.4,8.0,1003.1,...,217606.0000,221216.0000,2.356583e+06,596.36357,0.043858,0.023025,0.023407,0.249354,15615.73158,12
2022-10,1,6,108.120,14.6,19.7,10.3,124.1,2.2,6.8,1011.1,...,217055.0000,221660.0000,2.364795e+06,596.36357,0.043667,0.022984,0.023472,0.250409,15604.08931,8
2022-11,0,0,108.000,10.0,15.4,5.6,84.5,2.1,6.5,1010.7,...,216878.0000,221915.0000,2.370901e+06,596.36357,0.043464,0.022982,0.023516,0.251239,15592.71140,9


In [30]:
# data = df[df['OCCRNC_REGN'] == '서울'].drop('OCCRNC_REGN', axis = 1)

data = data_reversed

predictor = ProphetPredictor(periods=12)

predictor.fit(data)

predictions = predictor.predict()

# print(predictions)

result = pd.DataFrame(predictions)

r2_scores = predictor.evaluate(data)
# print(r2_scores)

13:53:28 - cmdstanpy - INFO - Chain [1] start processing
13:53:28 - cmdstanpy - INFO - Chain [1] done processing
13:53:28 - cmdstanpy - INFO - Chain [1] start processing
13:53:28 - cmdstanpy - INFO - Chain [1] done processing
13:53:28 - cmdstanpy - INFO - Chain [1] start processing
13:53:28 - cmdstanpy - INFO - Chain [1] done processing
13:53:28 - cmdstanpy - INFO - Chain [1] start processing
13:53:29 - cmdstanpy - INFO - Chain [1] done processing
13:53:29 - cmdstanpy - INFO - Chain [1] start processing
13:53:29 - cmdstanpy - INFO - Chain [1] done processing
13:53:29 - cmdstanpy - INFO - Chain [1] start processing
13:53:29 - cmdstanpy - INFO - Chain [1] done processing
13:53:29 - cmdstanpy - INFO - Chain [1] start processing
13:53:29 - cmdstanpy - INFO - Chain [1] done processing
13:53:29 - cmdstanpy - INFO - Chain [1] start processing
13:53:29 - cmdstanpy - INFO - Chain [1] done processing
13:53:30 - cmdstanpy - INFO - Chain [1] start processing
13:53:30 - cmdstanpy - INFO - Chain [1]

In [31]:
print(r2_scores)

{'HOL_IND': 0.3143761122390898, 'HOL_DUR': 0.31609044255593, 'CPI_VALUE': 0.9968303293658348, 'WTHR_AVG_TEMP': 0.9821827952433344, 'WTHR_AVG_H_TEMP': 0.9795591555117962, 'WTHR_AVG_L_TEMP': 0.9825507086166968, 'WTHR_AVG_PRECIP': 0.5465743962263614, 'WTHR_AVG_WNDSPD': 0.5847426557251874, 'WTHR_MX_WNDSPD': 0.2044489502927992, 'WTHR_AVG_PRESS': 0.9556457549681707, 'WTHR_MX_PRESS': 0.9292209602283638, 'WTHR_MN_PRESS': 0.19186107911230765, 'WTHR_AVG_SEA_PRESS': 0.9600719677459414, 'WTHR_MX_SEA_PRESS': 0.9347028008386117, 'WTHR_MN_SEA_PRESS': 0.6781743383690886, 'WTHR_AVG_RHUM': 0.7560024493268285, 'WTHR_MN_RHUM': 0.7620884031196721, 'WTHR_SUM_SUNHR': 0.6000390320432596, 'FST_CNT': 0.4156307551663211, 'FST_IND': 0.4845986421191948, 'POP_GEN_CNT': 0.9967726860658308, 'POP_ELM_CNT': 0.9999578732244321, 'POP_MID_CNT': 0.9996954949492788, 'POP_HIGH_CNT': 0.9993666260963326, 'POP_60P_CNT': 0.9996592910478845, 'POP_STU_PER_SCH': 0.9995085925725405, 'POP_ELM_RATIO': 0.9998913564683222, 'POP_MID_RATI

In [None]:
df = pd.DataFrame(r2_scores)
print(df)

In [33]:
df = pd.DataFrame.from_dict(r2_scores, orient='index', columns=['R2'])

In [35]:
df.to_excel(r"C:\Users\choiy\Downloads\r2scores.xlsx")

In [67]:
city_predict = {}

cities = list(df_region.OCCRNC_REGN.unique())

for city in cities:

    data = df[df['OCCRNC_REGN'] == city].drop('OCCRNC_REGN', axis = 1)
    

    predictor = ProphetPredictor(periods=12)

    predictor.fit(data)

    predictions = predictor.predict()

    # print(predictions)

    result = pd.DataFrame(predictions)

    r2_scores = predictor.evaluate(data)
    # print(r2_scores)

    city_predict[city] = result

15:00:45 - cmdstanpy - INFO - Chain [1] start processing
15:00:45 - cmdstanpy - INFO - Chain [1] done processing
15:00:45 - cmdstanpy - INFO - Chain [1] start processing
15:00:46 - cmdstanpy - INFO - Chain [1] done processing
15:00:46 - cmdstanpy - INFO - Chain [1] start processing
15:00:46 - cmdstanpy - INFO - Chain [1] done processing
15:00:46 - cmdstanpy - INFO - Chain [1] start processing
15:00:46 - cmdstanpy - INFO - Chain [1] done processing
15:00:46 - cmdstanpy - INFO - Chain [1] start processing
15:00:46 - cmdstanpy - INFO - Chain [1] done processing
15:00:46 - cmdstanpy - INFO - Chain [1] start processing
15:00:46 - cmdstanpy - INFO - Chain [1] done processing
15:00:46 - cmdstanpy - INFO - Chain [1] start processing
15:00:47 - cmdstanpy - INFO - Chain [1] done processing
15:00:47 - cmdstanpy - INFO - Chain [1] start processing
15:00:47 - cmdstanpy - INFO - Chain [1] done processing
15:00:47 - cmdstanpy - INFO - Chain [1] start processing
15:00:47 - cmdstanpy - INFO - Chain [1]

In [3]:
data = df[df['OCCRNC_VIRS'] == '노로바이러스'].drop('OCCRNC_VIRS', axis = 1)

predictor = ProphetPredictor(periods=12)

predictor.fit(data)

predictions = predictor.predict()

# print(predictions)

result = pd.DataFrame(predictions)

NameError: name 'df' is not defined

In [2]:
r2_scores = predictor.evaluate(data)
print(r2_scores)

NameError: name 'predictor' is not defined

In [89]:
result.to_excel('./prediction/cause_prediction_12months.xlsx', index=False)

In [62]:
for city in city_predict:
    data = city_predict[city].drop(['OCCRNC_CNT','PATNT_CNT','OCCRNC_IND'], axis=1)
    # data.to_excel(f'./prediction/region_prediction_{city}.xlsx')
    data = data.insert(2, )

In [82]:

data = city_predict['서울'].drop(['OCCRNC_CNT','PATNT_CNT','OCCRNC_IND'], axis=1)

for idx, i in data.iterrows():
    print(idx)

2022-12-31 00:00:00
2023-01-31 00:00:00
2023-02-28 00:00:00
2023-03-31 00:00:00
2023-04-30 00:00:00
2023-05-31 00:00:00
2023-06-30 00:00:00
2023-07-31 00:00:00
2023-08-31 00:00:00
2023-09-30 00:00:00
2023-10-31 00:00:00
2023-11-30 00:00:00
