In [50]:
import numpy as np
import pandas as pd
import ast #for strings
import re #regex

fname='MSCI446_ Data - Sheet1.csv'

def convert_to_numpy_array(string: str):
    # Verify string is written like a python list
    stripped = string.strip()
    pattern = r"\[(.*)\]"
    match = re.match(pattern, stripped)
    if not match:
        raise NotImplementedError(f"{stripped}")
    # Group 1 captures the content inside the square brackets
    contents = match.group(1)
    
    try:
        # Use ast.literal_eval to safely evaluate string literals
        evaluated = ast.literal_eval(stripped)
        if type(evaluated) is not list:
            raise NotImplementedError()
        # series = pd.Series(evaluated)
        return np.array(evaluated)
    except (SyntaxError, ValueError):  # this occurs with the "Condition" column
        # If parsing as a list fails, split the contents within the square brackets by comma, 
        # interpret each value as a string and strip whitespace
        return np.array([ item.strip() for item in contents.split(',') ])

# Read CSV file with the custom function
numerical_vector_column_labels = ['Temperature (F)', 'Dewpoint (F)', 'Humidity (%)', 'Wind Speed (mph)', 'Pressure (in)', 'Percipitation (in)']
categorical_vector_column_labels = ["Condition"]
vector_column_labels = numerical_vector_column_labels + categorical_vector_column_labels

# Create a converters dictionary mapping each column to the converter function
converters = {col: convert_to_numpy_array for col in vector_column_labels}
df = pd.read_csv(fname, converters=converters)
df.shape

(305, 13)

In [51]:
df.head()

Unnamed: 0,Fire Name,Year,Season Started,Locations Affected,Temperature (F),Dewpoint (F),Humidity (%),Wind Speed (mph),Pressure (in),Percipitation (in),Condition,Type Of Location,Y-Value
0,Cagua Fire,2020,Winter,Aragua,"[75, 75, 73, 73, 72, 72, 71, 72, 72, 78, 82, 81]","[75, 75, 76, 76, 76, 76, 76, 76, 75, 77, 77, 77]","[85, 86, 86, 88, 90, 90, 94, 89, 74, 62, 55, 53]","[2, 3, 3, 3, 4, 4, 2, 4, 2, 4, 6, 6]","[29.8, 29.83, 29.83, 29.86, 29.86, 29.8, 29.83...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[Fair, Fair, Partly Cloudy, Fair, Fair, Partly...",Mounatins,Fire
1,Vietnam Fires,2019,Summer,Da Nang,"[79, 81, 81, 82, 84, 86, 88, 90, 91, 93, 95, 97]","[72, 73, 73, 73, 73, 75, 75, 75, 75, 75, 77, 77]","[78, 79, 79, 74, 70, 70, 66, 62, 59, 56, 56, 53]","[5, 6, 2, 5, 7, 5, 5, 7, 5, 6, 5, 2]","[29.64, 29.64, 29.64, 29.64, 29.67, 29.67, 29....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[Fair, Fair, Fair, Fair, Windy, Windy, Windy, ...",Coastal Hills,Fire
2,Chile Wildfires,2024,Winter,O'Higgins,"[78, 78, 78, 80, 82, 78, 77, 77, 77, 75, 73, 73]","[22, 8, 12, 19, 20, 18, 16, 13, 7, 41, 38, 22]","[74, 79, 84, 79, 74, 79, 83, 83, 83, 94, 94, 89]","[18, 18, 20, 21, 20, 20, 20, 20, 17, 10, 11, 11]","[30.09, 30.09, 30.06, 30.06, 30.06, 30.06, 30....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[Fair, Partly Cloudy, Mostly Cloudy, Fair, Par...",Mountains,Fire
3,Chile Wildfires,2024,Winter,Valparaíso,"[77, 81, 84, 88, 90, 90, 88, 86, 84, 81, 79, 75]","[55, 54, 55, 54, 55, 55, 52, 54, 54, 54, 54, 54]","[47, 39, 37, 31, 31, 31, 31, 31, 35, 39, 42, 47]","[3, 6, 8, 12, 13, 15, 16, 18, 15, 15, 13, 9]","[28.31, 28.31, 28.28, 28.25, 28.25, 28.22, 28....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[Fair, Fair, Fair, Fair, Fair, Fair, Fair, Fai...",Coastal Mountain,Fire
4,Chile Wildfires,2023,Winter,Maule,"[72, 73, 73, 73, 75, 77, 75, 73, 70, 68, 64, 63]","[59, 59, 59, 59, 59, 59, 57, 55, 55, 54, 52, 52]","[64, 61, 61, 57, 57, 54, 53, 53, 60, 60, 64, 68]","[13, 17, 18, 17, 21, 23, 21, 23, 18, 20, 17, 15]","[29.87, 29.84, 29.84, 29.84, 29.81, 29.81, 29....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[Fair, Fair, Fair, Fair, Windy, Windy, Windy, ...",Costal Mountain,Fire


## Checking Data

### Categorical Encoding

In [52]:
categorical_column_labels = ["Season Started", "Locations Affected", "Condition", "Type Of Location", "Y-Value"]
categorical_scalar_column_labels = ["Season Started", "Locations Affected", "Type Of Location", "Y-Value"]

In [55]:
for cl in categorical_scalar_column_labels:
    print(df[cl].value_counts())
    print()

Summer    135
Spring     75
Fall       49
Winter     46
Name: Season Started, dtype: int64

California     23
MB             19
BC             19
AB             18
Yellowknife    12
               ..
MA              1
IL              1
DC              1
Luxembourg      1
Hailey          1
Name: Locations Affected, Length: 110, dtype: int64

Forest               77
Mountains            59
Hills                16
Arctic               11
Coastal Forest       10
Coastal Mountains    10
Flat                  8
Rainforest            8
Plains                6
Grasslands            6
Costal Mountain       5
Desert                4
Swamp                 3
Coastal Hills         3
Costal Forest         3
Coastal Lowland       2
Coastal               1
Costal Mountains      1
Coastal Mountain      1
Mounatins             1
Name: Type Of Location, dtype: int64

Fire        155
No Fire      93
Not Fire     57
Name: Y-Value, dtype: int64



### Adding Vectors

In [56]:
vector_columns = df[vector_column_labels]

In [57]:
# Verifying all are ndarray
# is_ndarray = vector_columns.map(lambda x: isinstance(x, np.ndarray))
# is_ndarray.all().all()
is_ndarray = vector_columns.applymap(lambda x: isinstance(x, np.ndarray))
is_ndarray.all().all()

True

In [58]:
desired_shape = (12,)

# shape = vector_columns.map(lambda x: x.shape)
# # shape == desired_shape  # INVALID SYNTAX
# is_desired_shape = shape.map(lambda x: x == desired_shape)

shape = vector_columns.applymap(lambda x: x.shape)
# shape == desired_shape  # INVALID SYNTAX
is_desired_shape = shape.applymap(lambda x: x == desired_shape)

# Shorter version
#is_desired_shape = vector_columns.map(lambda x: x.shape == desired_shape)
is_desired_shape = vector_columns.applymap(lambda x: x.shape == desired_shape)

# Boolean series containing whether all vectors in that row are of the desired shape
valid_shape_rows = is_desired_shape.all(axis=1)

# Whether all vectors are of desired shape
is_desired_shape.all().all()

True

## Generating Stats

Generate minimum, average, and maximum values for specified columns

In [49]:
# Columns to generate stats for
columns_to_process = numerical_vector_column_labels

# Function to compute min, avg, and max and return as a Series
def compute_stats(arr):
    return pd.Series([np.min(arr), np.mean(arr), np.max(arr)], index=['min', 'avg', 'max'])

# New DataFrame to store results
num_stats_df = pd.DataFrame()
array_stats_df = pd.DataFrame()

# Iterate over specified columns
for col in df.columns:
    # If the column contains numpy arrays and is in the columns to process
    if col in columns_to_process:  # and np.issubdtype(df[col].dtype, np.ndarray)
        # Compute statistics for each numpy array element in the column
        stats = df[col].apply(compute_stats)
        # Rename columns to include the statistics
        stats.columns = [f"{col}_min", f"{col}_avg", f"{col}_max"]
        # Concatenate the statistics columns with the original column and insert them into the new DataFrame
        #num_stats_df = pd.concat([num_stats_df, df[col], stats], axis=1)
        array_stats_df = pd.concat([df[col]], axis=1)
        num_stats_df = pd.concat([num_stats_df, stats], axis=1)
    else:
        # If not a numpy array column or not in columns to process, copy it to the new DataFrame
        num_stats_df[col] = df[col]

num_stats_df.head(10)
array_stats_df.head(10)


Unnamed: 0,Percipitation (in)
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


## Data Preparation

### Dealing with missing values

Checking element values

In [None]:
# Only select for columns where we care if values are missing (do not care if we are missing Fire Name)
isna = df.loc[:, "Year":"Y-Value"].isna()
isna.sum()[isna.any()]
print(df.isna().sum())

# print(df['Type Of Location'].dtype)
# df = df.fillna('NaaN')
# df.drop(df[df['Type Of Location'] == 'NaaN'].index, inplace = True)
# print(df['Type Of Location'])

: 

Checking contents for numpy arrays (numerical only)

In [None]:
not df[numerical_vector_column_labels].map(lambda x: np.any(np.isnan(x))).any().any()

: 

### Dealing with categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

: 

In [None]:
unencoded_df = df.copy(deep=True)

mapping_dict = {'BC': 'British Columbia', 'YT': 'Yukon', 'QC': 'Quebec', 'AB': 'Alberta', 'MB': 'Manitoba', 'ON': 'Ontario', 'CA': 'California'}
def pain(x) -> str:
    # create a temporary list to hold the replaced strings
    temp = []
    for word in str(x).split():
        temp.append(mapping_dict.get(word, word))
     
    # join the temporary list to create the final output string
    res = " ".join(temp)
    # print(temp, res)
    return str(res)

# Columns containing single categorical values
for cl in categorical_scalar_column_labels:
    if cl == 'Locations Affected':
        # print(unencoded_df['Locations Affected'].value_counts())
        df[cl] = df[cl].apply(pain)
        pass
    df[cl] = le.fit_transform(df[cl])

print(df['Condition'])

# Columns containing categorical vector values
for cl in categorical_vector_column_labels:
    le.fit(np.concatenate(df['Condition'].values))
    # le.fit_transform(df['Condition'])
    df[cl] = df[cl].apply(le.transform)
# print(df['Y-Value'].value_counts())
print(unencoded_df['Y-Value'].value_counts())
print(unencoded_df['Locations Affected'].value_counts())

: 

### Dealing with Outliers

TBD

### Partitioning a data set

Cross Validation

In [None]:
# Replace y value
with pd.option_context('future.no_silent_downcasting', True):
    fire_map = {'Fire': 1, 'Fire\r\n': 1, 'No Fire': 0, 'Not Fire': 0}
    coded_df = pd.DataFrame()
    coded_df['Y-Value_encoded'] = unencoded_df['Y-Value'].replace(fire_map).astype(int)
    coded_df['Y-Value_encoded'].value_counts()

: 

In [None]:
# Replace seasons
# a = unencoded_df['Season Started'].unique()
# replace_dict = {k:v for v, k in enumerate(a)}
# print(replace_dict)
# coded_df['Season Started_enc'] = unencoded_df['Season Started'].replace(replace_dict)
coded_df['Season Started_enc'] = df['Season Started']
coded_df['Season Started_enc'].value_counts()

: 

In [None]:
# Locations Affected
# print(df['Locations Affected'].value_counts())
b = df['Locations Affected'].unique()
print(len(b))
# replace_dict = {k:v for v, k in enumerate(b)}
# print(replace_dict)
# coded_df['Locations Affected_enc'] = unencoded_df['Locations Affected'].replace(replace_dict)
coded_df['Locations Affected_enc'] = df['Locations Affected']
coded_df['Locations Affected_enc'].value_counts()

: 

In [None]:
coded_df['Type Of Location_enc'] = df['Type Of Location']
coded_df['Type Of Location_enc'].value_counts()

: 

In [None]:
def str_to_list(x):
    test = ''
    if type(x) == type(test):
        values = x.replace(',','').replace('[','').replace(']','').split(' ')
        values = list(filter(None, values))
    else:
        values = x
    # Remove empty strings
    l = [float(val) for val in values]
    return l

def arr_to_intlist(x):
    l = [int(val) for val in x]
    return l

list_columns = numerical_vector_column_labels

added_cols = []

# First split numerical columns
for col in list_columns:
    if (df[col].dtype == 'object'):
        df[col] = df[col].apply(str_to_list if col != 'Condition' else arr_to_intlist)
    
    new_columns = [col + str(x) for x in range(12)]
    added_cols = added_cols + new_columns
    coded_df[new_columns] = pd.DataFrame(df[col].tolist(), columns=new_columns)
    print(coded_df[new_columns])


: 

In [None]:
from sklearn.model_selection import train_test_split

target_var = 'Y-Value_encoded'
feature_var = ['Season Started_enc', 'Locations Affected_enc', 'Type Of Location_enc'] + added_cols
X_train, X_test, y_train, y_test = train_test_split(coded_df[feature_var], coded_df[target_var], test_size=0.2, shuffle=True, random_state=101)

: 

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import RepeatedKFold

repeatkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)

loo = LeaveOneOut()

kfold = KFold(n_splits=10, shuffle=True, random_state=6969)

model = GaussianNB()
model.fit(X_train, y_train) 

prediction = model.predict(X_test)

print('CV accuracy scores are', cross_val_score(model, X_train, y_train, scoring='accuracy', cv= kfold, n_jobs=-1))

print('The average KFold scores is', np.mean(cross_val_score(model, X_train, y_train, scoring='accuracy', cv=kfold, n_jobs=-1)))
print('The average LOOCV score is', np.mean(cross_val_score(model, X_train, y_train, scoring='accuracy', cv=loo, n_jobs=-1)))
print('The average RepeatedKFold score is', np.mean(cross_val_score(model, X_train, y_train, cv=repeatkf, n_jobs=-1)))

: 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Set parameter to be searched in a range
params = {'n_neighbors': range(1,150)}

# Initiate the KNN model and GridSearchCV function
knn = KNeighborsClassifier()
grid_knn = GridSearchCV(estimator=knn, param_grid=params,
                        scoring='accuracy', cv=5)

# Fit the function to train set 
grid_knn.fit(X_train, y_train)

# Find the best parameter and see how well it performs on test set
print(grid_knn.best_params_)
print(grid_knn.score(X_test, y_test))
print(grid_knn.cv_results_['mean_test_score'])

: 

Bootstrapping

In [None]:
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

accuracy = []
n_iterations = 1000

for i in range(n_iterations):
    X_sparse, y_sparse = resample(X_test, y_test, replace=True)
    predict = model.predict(X_sparse)
    score = accuracy_score(y_sparse, predict)
    accuracy.append(score)

: 

In [None]:
# Find median
median = np.median(accuracy)

# Find lower and upper bounds
lower = np.percentile(accuracy, 2.5)
upper = np.percentile(accuracy, 97.5)

print(f'The median is {median:.2f} '
      f'with confidence intervals of [{lower:.2f}, {upper:.2f}].')

: 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(accuracy)
plt.title('1000 Bootstrap Samples of Test Set')
plt.xlabel('Accuracy of GaussianNB')
plt.axvline(median, 0, 16, linestyle="--")
plt.axvline(lower, 0, 16, linestyle="--", color="red")
plt.axvline(upper, 0, 16, linestyle="--", color="red")
plt.show()

: 

### Feature scaling

TBD

### Data visualization

TBD