In [None]:
import sys
import os
import datetime as date
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.metrics import classification_report
from matplotlib import pyplot
from matplotlib import pylab

# Make modules located in the current working directory importable.
sys.path.append(os.getcwd())

In [1]:
# Presumably the number of preamble lines that precede the CSV header in a
# NOAA download -- TODO confirm against the raw files.
NOAA_header_line_offset = 3

# First and last year of the NOAA dataset query.
NOAA_dataset_query_year_start = 2014
NOAA_dataset_query_year_stop = 2022

# Zero-padded three-digit odd codes "001", "003", ..., "071" (36 entries).
# NOTE(review): looks positionally aligned with `county_list` below -- confirm.
NOAA_county_code_list = [str(code).zfill(3) for code in range(1, 72, 2)]

# County names, 36 entries in alphabetical order.
county_list = [
    "Baker", "Benton", "Clackamas", "Clatsop", "Columbia", "Coos",
    "Crook", "Curry", "Deschutes", "Douglas", "Gilliam", "Grant",
    "Harney", "Hood River", "Jackson", "Jefferson", "Josephine", "Klamath",
    "Lake", "Lane", "Lincoln", "Linn", "Malheur", "Marion",
    "Morrow", "Multnomah", "Polk", "Sherman", "Tillamook", "Umatilla",
    "Union", "Wallowa", "Wasco", "Washington", "Wheeler", "Yamhill",
]

# English month names, January first.
month_list = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

Since I was not able to find detailed archived daily precipitation and temperature data, I took the monthly mean temperature and precipitation as a simpler approach. The original climate data comes in two parts, temperature and precipitation, which I joined into one table by their corresponding county and date. My research direction is the correlation between the occurrence of wildfires and the climate of the month in which they occur.
Currently, the data from NOAA is stored separately in the folders Precipitation_By_County and Temperature_By_County. The first three functions turn these files into pure CSV files and store them together in a folder called Precipitation_Temperature for further use.

In [None]:
def delete_lines(fileName, head, targetFN):
    """Copy ``fileName`` to ``targetFN`` without its first ``head`` lines.

    Args:
        fileName: Path of the text file to read.
        head: Number of leading lines to drop.
        targetFN: Path of the file to write; created or overwritten.

    If ``head`` is at least the number of lines in the input, the target
    file is written empty.
    """
    # The input is read completely and closed before the target is opened,
    # so the same path may safely be used for both arguments.
    with open(fileName, 'r') as fin:
        lines = fin.readlines()
    # Context managers guarantee both handles are closed (and the output
    # flushed) even if an error occurs part-way through.
    with open(targetFN, 'w') as fout:
        fout.writelines(lines[head:])


def clean_CSV():
    """Strip the leading preamble lines from every county's raw NOAA CSV.

    For each name in ``county_list`` this reads
    ``./data/Precipitation_By_County/<county>_Prec.csv`` and
    ``./data/Temperature_By_County/<county>_Temp.csv``, drops the first
    ``NOAA_header_line_offset`` lines of each, and writes the results under
    the same file names into ``./data/Precipitation_Temperature/``.
    """
    prec_dir = "./data/Precipitation_By_County/"
    temp_dir = "./data/Temperature_By_County/"
    target_dir = "./data/Precipitation_Temperature/"
    temp_excep = "_Temp.csv"
    prec_excep = "_Prec.csv"

    # The output folder may not exist yet on a fresh checkout.
    os.makedirs(target_dir, exist_ok=True)

    # `county_list` and `NOAA_header_line_offset` are the notebook-level
    # constants; no `const` module is imported in this notebook.
    for county in county_list:
        delete_lines(prec_dir + county + prec_excep,
                     NOAA_header_line_offset,
                     target_dir + county + prec_excep)
        delete_lines(temp_dir + county + temp_excep,
                     NOAA_header_line_offset,
                     target_dir + county + temp_excep)

def merge_Data():
    """Combine each county's temperature and precipitation CSVs into one file.

    For every name in ``county_list`` the cleaned ``<county>_Temp.csv`` and
    ``<county>_Prec.csv`` are read from ``./data/Precipitation_Temperature/``.
    Their ``Value`` columns are renamed to ``Temperature`` and
    ``Precipitation``; the precipitation values, divided by 30, are attached
    to the temperature table, which is written (index included) to
    ``./Merged_Data/<county>_Climate.csv``.
    """
    source_dir = "./data/Precipitation_Temperature/"
    # NOTE(review): the readers use "./data/Merged_data/" (build_data) and
    # "./Merged_data/" (build_data_v2) -- confirm which location is intended.
    output_dir = "./Merged_Data/"
    prec_suffix = "_Prec.csv"
    temp_suffix = "_Temp.csv"
    climate_suffix = "_Climate.csv"

    for county in county_list:
        precipitation = pd.read_csv(source_dir + county + prec_suffix).rename(
            columns={'Value': 'Precipitation'})
        climate = pd.read_csv(source_dir + county + temp_suffix).rename(
            columns={'Value': "Temperature"})

        # Rows are paired by index label, not joined on a date column.
        # Presumably /30 turns a monthly total into a rough daily figure -- TODO confirm.
        climate['Precipitation'] = precipitation['Precipitation'] / 30
        climate.to_csv(output_dir + county + climate_suffix)

From NIFC, we obtained every wildfire incident in the US; since we only need the data for Oregon, we extract the Oregon records.

In [None]:
def get_OR_records():
    """Extract the Oregon records from the national incident file.

    Reads ``Wildland_Fire_Incident_Locations.csv`` from the working directory,
    keeps rows whose ``POOState`` is ``'US-OR'`` and whose
    ``IncidentTypeCategory`` is ``'WF'``, narrows them to the location, date,
    county, size and cause columns, and writes the result (index included) to
    ``./data/Oregon_Fire_Record.csv``.
    """
    incidents = pd.read_csv("Wildland_Fire_Incident_Locations.csv", low_memory=False)

    in_oregon = incidents['POOState'] == 'US-OR'
    is_wf = incidents['IncidentTypeCategory'] == 'WF'
    kept_columns = ['X', 'Y', 'FireDiscoveryDateTime', 'InitialLatitude',
                    'InitialLongitude', 'POOCounty', 'IncidentSize', 'FireCause']

    oregon_fires = incidents.loc[in_oregon & is_wf, kept_columns]
    oregon_fires.to_csv("./data/Oregon_Fire_Record.csv")

For each fire incident in Oregon, assign the temperature and precipitation data of the month in which it occurred. I use the average temperature and precipitation of each month. I don't think analyzing average precipitation together with wildfire occurrences is any more ambiguous than using daily climate data, because an occasional lack of precipitation doesn't necessarily lead directly to increased fire risk, whereas long periods of low precipitation tend to dry out the soil and make wildfires more likely.

In [None]:
def insert_climate(fire_record, climate_dir, climate_excep):
    """Attach the monthly temperature and precipitation to every fire record.

    Args:
        fire_record: Path of the fire-record CSV. It must have the columns
            ``FireDiscoveryDateTime`` and ``POOCounty``.
        climate_dir: Directory prefix of the per-county climate CSVs.
        climate_excep: File-name suffix of those CSVs; the file read for a
            record is ``climate_dir + county + climate_excep``.

    The result is written to ``vectors.csv`` in the working directory with two
    extra columns, ``Temperature`` and ``Precipitation``. Records whose
    discovery month has no row in the county's climate file keep the sentinel
    value ``-1.0``.

    Raises:
        FileNotFoundError: if a record's county has no climate file.
    """
    fire_df = pd.read_csv(fire_record)
    # Float sentinels: the columns later receive float climate values, and
    # writing floats into integer columns is deprecated in recent pandas.
    fire_df['Temperature'] = -1.0
    fire_df['Precipitation'] = -1.0

    # county -> {YYYYMM: (temperature, precipitation)}; each climate file is
    # read once instead of once per fire record.
    climate_by_county = {}

    for idx, record in fire_df.iterrows():
        discovered = pd.to_datetime(record['FireDiscoveryDateTime'])
        fire_date = discovered.year * 100 + discovered.month  # YYYYMM
        county = str(record['POOCounty'])

        if county not in climate_by_county:
            climate_df = pd.read_csv(climate_dir + county + climate_excep)
            # The YYYYMM date is taken from the second column (the first is
            # the saved index). The measurements are looked up by name, so an
            # extra column in the file cannot be mistaken for precipitation.
            climate_by_county[county] = {
                int(month): (temperature, precipitation)
                for month, temperature, precipitation in zip(
                    climate_df.iloc[:, 1],
                    climate_df['Temperature'],
                    climate_df['Precipitation'])
            }

        match = climate_by_county[county].get(fire_date)
        if match is not None:
            fire_df.at[idx, 'Temperature'] = match[0]
            fire_df.at[idx, 'Precipitation'] = match[1]

    # Drop the index column left over from a previous to_csv, if present.
    fire_df = fire_df.drop(columns='Unnamed: 0', errors='ignore')
    fire_df.to_csv("vectors.csv")

For each county and year/month, add a flag indicating whether the county had any incident in that month.

In [None]:
def build_data():
    """Build the county-by-month flag table and write it to ``flags.csv``.

    One row is produced per county in ``county_list`` and per month from
    2010-01 through 2023-01 inclusive. Each row carries the county name, the
    ``YYYYMM`` date string, the month's ``Temperature`` and ``Precipitation``
    from ``./data/Merged_data/<county>_Climate.csv``, ``Is_Burned = 0`` and
    empty-string placeholders for the fire-detail columns.

    The n-th month is paired with the n-th row of the climate file, so each
    file is assumed to start at 2010-01 with no gaps.
    NOTE(review): ``insert_flag`` reads ``./data/flags.csv`` while this writes
    ``flags.csv`` into the working directory -- confirm the intended location.

    Raises:
        ValueError: if a climate file has fewer rows than months generated.
    """
    columns = ['County', 'Date', 'Temperature', 'Precipitation', 'Is_Burned', 'X',
               'Y', 'FireDiscoveryDateTime', 'InitialLatitude', 'InitialLongitude',
               'IncidentSize', 'FireCause']

    # "201001" ... "202301"
    date_strs = [str(year * 100 + month)
                 for year in range(2010, 2024)
                 for month in range(1, 13)
                 if not (year == 2023 and month > 1)]
    n_months = len(date_strs)

    # One frame per county, concatenated once: appending rows one at a time
    # re-allocates the frame on every insert.
    frames = []
    for county in county_list:
        climate_df = pd.read_csv("./data/Merged_data/{}_Climate.csv".format(county))
        frames.append(pd.DataFrame({
            'County': county,
            'Date': date_strs,
            # Selected by name so extra columns in the file cannot shift them.
            'Temperature': climate_df['Temperature'].to_numpy()[:n_months],
            'Precipitation': climate_df['Precipitation'].to_numpy()[:n_months],
            'Is_Burned': 0,
        }))

    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns[:5])

    # Add the fire-detail columns, filled with empty strings.
    df = df.reindex(columns=columns, fill_value='')
    df.to_csv('flags.csv')

def insert_flag():
    """Mark burned county-months and attach the fire details to them.

    Reads the flag table from ``./data/flags.csv`` and the fire records from
    ``./data/vectors.csv``, then writes ``training.csv``:

    * a county-month without any fire is copied unchanged;
    * a county-month with fires yields one row per fire, each with
      ``Is_Burned = 1``, the month's climate values and that fire's details.
      Within such a group the first fire in file order comes last and the
      later fires precede it, most recent first.

    Fire records dated after 2023-01, with an unparseable date, or whose
    county/month has no row in the flag table are skipped.
    """
    flag_df = pd.read_csv('./data/flags.csv')
    flag_df = flag_df.drop(columns='Unnamed: 0', errors='ignore')
    fire_df = pd.read_csv('./data/vectors.csv')

    fire_columns = ['X', 'Y', 'FireDiscoveryDateTime', 'InitialLatitude',
                    'InitialLongitude', 'IncidentSize', 'FireCause']

    # (county, YYYYMM) -> fire details in file order.
    fires_by_key = {}
    for _, record in fire_df.iterrows():
        discovered = pd.to_datetime(record['FireDiscoveryDateTime'])
        if pd.isna(discovered):
            continue
        fire_date = discovered.year * 100 + discovered.month
        if fire_date > 202301:
            continue
        details = {column: record[column] for column in fire_columns}
        fires_by_key.setdefault((record['POOCounty'], fire_date), []).append(details)

    # Emit the output rows in one pass, instead of splicing rows into the
    # frame with pd.concat for every additional fire.
    rows = []
    for flag in flag_df.to_dict('records'):
        # pop(): only the first row of a county-month receives its fires.
        fires = fires_by_key.pop((flag['County'], flag['Date']), None)
        if not fires:
            rows.append(flag)
            continue
        # Later fires first (newest to oldest), then the first fire.
        for details in fires[:0:-1] + fires[:1]:
            row = dict(flag)
            row['Is_Burned'] = 1
            row.update(details)
            rows.append(row)

    training_df = pd.DataFrame(rows, columns=flag_df.columns)
    training_df.to_csv("training.csv")

Train a logistic regression model with temperature and precipitation.

In [None]:
def training(n_folds=10, fold_size=300):
    """Cross-validate a logistic regression on the current month's climate.

    Reads ``training.csv`` and predicts ``Is_Burned`` from ``Temperature`` and
    ``Precipitation``. Fold ``k`` tests on rows ``[k*fold_size, (k+1)*fold_size)``
    and trains on every other row; the mean accuracy over the evaluated folds
    is printed.

    Args:
        n_folds: Maximum number of folds to evaluate.
        fold_size: Number of consecutive rows held out per fold.

    NOTE(review): the folds are contiguous, unshuffled slices and only the
    first ``n_folds * fold_size`` rows are ever tested -- confirm this is the
    intended validation scheme.

    Raises:
        ValueError: if ``training.csv`` contains no rows to test on.
    """
    df = pd.read_csv("training.csv")
    df = df.drop(columns=['Unnamed: 0', 'County', 'Date'])

    feature_columns = ['Temperature', 'Precipitation']
    fold_accuracies = []

    for fold in range(n_folds):
        start = fold * fold_size
        stop = start + fold_size

        test_df = df.iloc[start:stop]
        if test_df.empty:
            # Ran out of rows: an empty fold has no accuracy to average.
            break
        train_df = pd.concat([df.iloc[:start], df.iloc[stop:]])

        X_train = train_df[feature_columns]
        Y_train = train_df['Is_Burned'].to_numpy()
        X_test = test_df[feature_columns]
        Y_test = test_df['Is_Burned'].to_numpy()

        lr = LogisticRegression(max_iter=3000)
        lr.fit(X_train, Y_train)
        Y_pred = lr.predict(X_test)

        # Fraction of test rows predicted correctly.
        fold_accuracies.append(float((Y_test == Y_pred).mean()))

    if not fold_accuracies:
        raise ValueError("training.csv contains no rows to evaluate")
    print(sum(fold_accuracies) / len(fold_accuracies))

The model using the average temperature and precipitation of the corresponding month achieves about 70% accuracy. We hypothesized that after several months, or a longer period, of low precipitation and high temperature, the potential risk of wildfire increases. So, building on the previous model, we append the climate data of the two preceding months to our training data as additional parameters, and we train a new logistic regression model on the temperature and precipitation of the current month, one month before, and two months before.

In [None]:
def build_data_v2():
    """Build the flag table with one- and two-month lagged climate features.

    For every county in ``county_list`` the monthly ``Temperature`` and
    ``Precipitation`` for 2010-01 through 2023-01 are read from
    ``./Merged_data/<county>_Climate.csv`` (the n-th month is paired with the
    n-th row of the file). ``Prev 1 ...`` and ``Prev 2 ...`` columns hold the
    same county's values one and two months earlier. Only rows dated 2014-01
    or later are kept, and the table is written to ``flags_v2.csv`` with
    ``Is_Burned`` set to 0 throughout.

    Raises:
        ValueError: if a climate file has fewer rows than months generated.
    """
    # "201001" ... "202301"
    date_strs = [str(year * 100 + month)
                 for year in range(2010, 2024)
                 for month in range(1, 13)
                 if not (year == 2023 and month > 1)]
    n_months = len(date_strs)

    # One frame per county, concatenated once, rather than appending rows.
    frames = []
    for county in county_list:
        climate_df = pd.read_csv("./Merged_data/{}_Climate.csv".format(county))
        frames.append(pd.DataFrame({
            'County': county,
            'Date': date_strs,
            # Selected by name so extra columns in the file cannot shift them.
            'Temperature': climate_df['Temperature'].to_numpy()[:n_months],
            'Precipitation': climate_df['Precipitation'].to_numpy()[:n_months],
            'Is_Burned': 0,
        }))
    df = pd.concat(frames, ignore_index=True)

    # Shifting within each county keeps one county's last months from leaking
    # into the next county's first rows. The first one or two months of a
    # county have no predecessor (NaN); they are removed by the date filter.
    for lag in (1, 2):
        df['Prev {} Temp'.format(lag)] = df.groupby('County', sort=False)['Temperature'].shift(lag)
        df['Prev {} Prec'.format(lag)] = df.groupby('County', sort=False)['Precipitation'].shift(lag)

    # A single boolean mask replaces dropping rows one by one.
    df = df[df['Date'].astype(int) >= 201401]

    df = df[['County', 'Date', 'Temperature', 'Prev 1 Temp', 'Prev 2 Temp',
             'Precipitation', 'Prev 1 Prec', 'Prev 2 Prec', 'Is_Burned']]

    df.to_csv('flags_v2.csv')

def insert_flag_v2():
    """Set ``Is_Burned`` for every county-month that had at least one fire.

    Reads ``flags_v2.csv`` and ``vectors.csv`` from the working directory. For
    each fire record dated 2023-01 or earlier, the first row of the flag table
    with the same county and ``YYYYMM`` date gets ``Is_Burned = 1``. Fire
    records with no matching row (for example a month outside the table's
    range or an unknown county) or with an unparseable date are skipped.
    The result is written to ``training_v2.csv``.
    """
    flag_df = pd.read_csv('flags_v2.csv')
    flag_df = flag_df.drop(columns='Unnamed: 0', errors='ignore')
    burned_col = flag_df.columns.get_loc('Is_Burned')

    # (county, YYYYMM) -> position of the first matching row, built once so
    # that the table is not rescanned for every fire record.
    first_row = {}
    for position, key in enumerate(zip(flag_df['County'], flag_df['Date'])):
        first_row.setdefault(key, position)

    fire_df = pd.read_csv('vectors.csv')
    for _, record in fire_df.iterrows():
        discovered = pd.to_datetime(record['FireDiscoveryDateTime'])
        if pd.isna(discovered):
            continue
        fire_date = discovered.year * 100 + discovered.month
        if fire_date > 202301:
            continue
        position = first_row.get((record['POOCounty'], fire_date))
        if position is not None:
            flag_df.iloc[position, burned_col] = 1

    flag_df.to_csv('training_v2.csv')

In [None]:
def log_reg_v2(n_folds=10, fold_size=300):
    """Cross-validate a logistic regression on current and lagged climate.

    Reads ``training_v2.csv`` and predicts ``Is_Burned`` from the temperature
    and precipitation of the current month and of the two preceding months.
    Fold ``k`` tests on rows ``[k*fold_size, (k+1)*fold_size)`` and trains on
    every other row; the mean accuracy over the evaluated folds is printed.

    Args:
        n_folds: Maximum number of folds to evaluate.
        fold_size: Number of consecutive rows held out per fold.

    NOTE(review): the folds are contiguous, unshuffled slices and only the
    first ``n_folds * fold_size`` rows are ever tested -- confirm this is the
    intended validation scheme.

    Raises:
        ValueError: if ``training_v2.csv`` contains no rows to test on.
    """
    df = pd.read_csv("training_v2.csv")
    df = df.drop(columns=['Unnamed: 0', 'County', 'Date'])

    feature_columns = ['Temperature', 'Prev 1 Temp', 'Prev 2 Temp',
                       'Precipitation', 'Prev 1 Prec', 'Prev 2 Prec']
    fold_accuracies = []

    for fold in range(n_folds):
        start = fold * fold_size
        stop = start + fold_size

        test_df = df.iloc[start:stop]
        if test_df.empty:
            # Ran out of rows: an empty fold has no accuracy to average.
            break
        train_df = pd.concat([df.iloc[:start], df.iloc[stop:]])

        X_train = train_df[feature_columns]
        Y_train = train_df['Is_Burned'].to_numpy()
        X_test = test_df[feature_columns]
        Y_test = test_df['Is_Burned'].to_numpy()

        lr = LogisticRegression(max_iter=3000)
        lr.fit(X_train, Y_train)
        Y_pred = lr.predict(X_test)

        # Fraction of test rows predicted correctly.
        fold_accuracies.append(float((Y_test == Y_pred).mean()))

    if not fold_accuracies:
        raise ValueError("training_v2.csv contains no rows to evaluate")
    print(sum(fold_accuracies) / len(fold_accuracies))

For the second model, I ran it twice: once with climate data from only the previous month, and once with climate data from the previous two months. The accuracy was about 73% in both cases. This is slightly better than the previous model, and it might be evidence that after one to two months of low precipitation and high temperature, the potential risk of wildfire increases. However, two months of historical climate data may already reach a point of overfitting, so there is no significant difference between using one month and two months.