In [1]:
import pandas as pd
import numpy as np

from ISLP import load_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression, Lasso

from mlxtend.evaluate import bias_variance_decomp

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('MSCI446_ Data - Sheet1.csv')

print(df)

           Fire Name  Year Season Started Locations Affected  \
0         Cagua Fire  2020         Winter             Aragua   
1      Vietnam Fires  2019         Summer            Da Nang   
2    Chile Wildfires  2024         Winter          O'Higgins   
3    Chile Wildfires  2024         Winter         Valparaíso   
4    Chile Wildfires  2023         Winter              Maule   
..               ...   ...            ...                ...   
300              NaN  2020         Summer                 MB   
301              NaN  2020         Summer                 MB   
302              NaN  2020           Fall                 MB   
303              NaN  2020         Summer                 NT   
304              NaN  2020         Summer                 NT   

                                      Temperature (F)  \
0    [75, 75, 73, 73, 72, 72, 71, 72, 72, 78, 82, 81]   
1    [79, 81, 81, 82, 84, 86, 88, 90, 91, 93, 95, 97]   
2    [78, 78, 78, 80, 82, 78, 77, 77, 77, 75, 73, 73]   
3  

In [2]:
df = df.drop(['Fire Name'], axis=1)
df = df.dropna()
df.isna().sum()

Year                  0
Season Started        0
Locations Affected    0
Temperature (F)       0
Dewpoint (F)          0
Humidity (%)          0
Wind Speed (mph)      0
Pressure (in)         0
Percipitation (in)    0
Condition             0
Type Of Location      0
Y-Value               0
dtype: int64

In [3]:
fire_map = {'Fire': 1, 'Fire\r\n': 1, 'No Fire': 0, 'Not Fire': 0}
df['Y-Value_encoded'] = df['Y-Value'].replace(fire_map)
df['Y-Value_encoded'].value_counts()

Y-Value_encoded
1    120
0    115
Name: count, dtype: int64

In [4]:
misspell_map = {'Wnter': 'Winter', 'Sring': 'Spring'}
df['Season Started'] = df['Season Started'].replace(misspell_map)
a = df['Season Started'].unique()
replace_dict = {k:v for v, k in enumerate(a)}
print(replace_dict)
df['Season Started_enc'] = df['Season Started'].replace(replace_dict)
df['Season Started_enc'].value_counts()

{'Winter': 0, 'Summer': 1, 'Spring': 2, 'Fall': 3}


Season Started_enc
1    89
2    53
3    47
0    46
Name: count, dtype: int64

In [5]:
b = df['Locations Affected'].unique()
replace_dict = {k:v for v, k in enumerate(b)}
print(replace_dict)
df['Locations Affected_enc'] = df['Locations Affected'].replace(replace_dict)
df['Locations Affected_enc'].value_counts()

{'Aragua': 0, 'Da Nang': 1, "O'Higgins": 2, 'Valparaíso': 3, 'Maule': 4, 'Bulgakovsky': 5, 'Patagonia': 6, 'Araucania': 7, 'Mato Grosso do Sul': 8, 'Amazonas': 9, 'Acre': 10, 'Rondônia': 11, 'Myagdi': 12, 'Pando': 13, 'Ulaanbaatar': 14, 'Santiago del Estero': 15, 'Córdoba': 16, 'New Mexico': 17, 'Nuevo León': 18, 'Scotland': 19, 'Kemerovo': 20, 'AB': 21, 'Guanajuato': 22, 'CA': 23, 'OR': 24, 'WY': 25, 'BC': 26, 'WA': 27, 'ON': 28, 'NY': 29, 'Meghalaya': 30, 'North Rhine-Westphalia': 31, 'South Holland': 32, 'MB': 33, 'Yakutsk': 34, 'Heilongjiang': 35, 'MN': 36, 'SK': 37, 'YT': 38, 'Luxembourg': 39, 'DC': 40, 'IL': 41, 'MA': 42, 'TX': 43, 'Lyon': 44, 'NT': 45, 'NF': 46, 'Ashland': 47, 'Challis': 48, 'Ravendale': 49, 'Black Forest': 50, 'Yarnell': 51, 'Jordan Valley': 52, 'Quebec': 53, 'Yosemite National Park': 54, 'Hailey': 55, 'Okanogan County': 56, 'Yellowknife': 57, 'Oklahoma': 58, 'Fort McMurray': 59, 'Tennessee': 60, 'Ashcroft': 61, 'Montana': 62, 'Santa Rosa': 63, 'Santa Barbara':

Locations Affected_enc
78    23
57    12
76    12
17     9
75     9
      ..
40     1
39     1
38     1
37     1
54     1
Name: count, Length: 109, dtype: int64

In [6]:
b = df['Type Of Location'].unique()
replace_dict = {k:v for v, k in enumerate(b)}
print(replace_dict)
df['Type Of Location_enc'] = df['Type Of Location'].replace(replace_dict)
df['Type Of Location_enc'].value_counts()

{'Mounatins': 0, 'Coastal Hills': 1, 'Mountains': 2, 'Costal Mountain': 3, 'Hills': 4, 'Costal Forest': 5, 'Forest': 6, 'Rainforest': 7, 'Coastal Lowland': 8, 'Desert': 9, 'Coastal Mountains': 10, 'Flat': 11, 'Coastal': 12, 'Grasslands': 13, 'Costal Mountains': 14, 'Coastal Forest': 15, 'Arctic': 16, 'Plains': 17, 'Swamp': 18}


Type Of Location_enc
6     77
2     59
4     16
16    11
15    10
10    10
7      8
11     8
13     6
3      6
17     6
9      4
18     3
1      3
5      3
8      2
12     1
14     1
0      1
Name: count, dtype: int64

In [11]:
def str_to_list(x):
    test = ''
    if type(x) == type(test):
        values = x.replace(',','').replace('[','').replace(']','').split(' ')
        values = list(filter(None, values))
    else:
        values = x
    # Remove empty strings
    l = [float(val) for val in values]
    return l

list_columns = ['Temperature (F)', 'Dewpoint (F)', 'Humidity (%)', 'Wind Speed (mph)', 'Pressure (in)', 'Percipitation (in)']

added_cols = []

for col in list_columns:
    if (df[col].dtype == 'object'):
        df[col] = df[col].apply(str_to_list)
    
    new_columns = [col + str(x) for x in range(12)]
    added_cols = added_cols + new_columns
    df[new_columns] = pd.DataFrame(df[col].tolist(), columns=new_columns)
    print(df[new_columns])

     Temperature (F)0  Temperature (F)1  Temperature (F)2  Temperature (F)3  \
0                75.0              75.0              73.0              73.0   
1                79.0              81.0              81.0              82.0   
2                78.0              78.0              78.0              80.0   
3                77.0              81.0              84.0              88.0   
4                72.0              73.0              73.0              73.0   
..                ...               ...               ...               ...   
230              47.0              46.0              45.0              44.0   
231              57.0              57.0              56.0              56.0   
232              65.0              66.0              65.0              63.0   
233              21.0              22.0              23.0              23.0   
234              13.0              13.0               8.0              10.0   

     Temperature (F)4  Temperature (F)5  Temperatur

In [12]:
from sklearn.model_selection import train_test_split

target_var = 'Y-Value_encoded'
feature_var = ['Season Started_enc', 'Locations Affected_enc', 'Type Of Location_enc'] + added_cols
X_train, X_test, y_train, y_test = train_test_split(df[feature_var], df[target_var], test_size=0.2, shuffle=True, random_state=101)

In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import RepeatedKFold

repeatkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)

loo = LeaveOneOut()

kfold = KFold(n_splits=10, shuffle=True, random_state=6969)

model = GaussianNB()
model.fit(X_train, y_train) 

prediction = model.predict(X_test)

print('CV accuracy scores are', cross_val_score(model, X_train, y_train, scoring='accuracy', cv= kfold, n_jobs=-1))

print('The average KFold scores is', np.mean(cross_val_score(model, X_train, y_train, scoring='accuracy', cv=kfold, n_jobs=-1)))
print('The average LOOCV score is', np.mean(cross_val_score(model, X_train, y_train, scoring='accuracy', cv=loo, n_jobs=-1)))
print('The average RepeatedKFold score is', np.mean(cross_val_score(model, X_train, y_train, cv=repeatkf, n_jobs=-1)))

CV accuracy scores are [0.73684211 0.84210526 0.84210526 0.68421053 0.68421053 0.73684211
 0.84210526 0.78947368 0.94444444 0.66666667]
The average KFold scores is 0.7769005847953216
The average LOOCV score is 0.776595744680851
The average RepeatedKFold score is 0.7713513513513515


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Set parameter to be searched in a range
params = {'n_neighbors': range(1,150)}

# Initiate the KNN model and GridSearchCV function
knn = KNeighborsClassifier()
grid_knn = GridSearchCV(estimator=knn, param_grid=params,
                        scoring='accuracy', cv=5)

# Fit the function to train set 
grid_knn.fit(X_train, y_train)

# Find the best parameter and see how well it performs on test set
print(grid_knn.best_params_)
print(grid_knn.score(X_test, y_test))
print(grid_knn.cv_results_['mean_test_score'])