In [None]:
import numpy as np
import pandas as pd
import ast
import re

# Path of the CSV export that the notebook loads (relative to the working directory)
fname = 'MSCI446_ Data - Sheet1.csv'

def convert_to_numpy_array(string: str):
    """Parse a CSV cell written like a Python list into a numpy array.

    The cell is first evaluated as a Python literal. When that fails (this
    occurs with the "Condition" column, whose items are unquoted words), the
    text between the outermost square brackets is split on commas and every
    piece is kept as a whitespace-stripped string.

    Args:
        string: Raw cell text, e.g. ``"[1, 2, 3]"`` or ``"[Fair, Cloudy]"``.

    Returns:
        A numpy array built from the parsed items.

    Raises:
        NotImplementedError: If the stripped text does not start with a
            bracketed list, or if it evaluates to something other than a list.
    """
    text = string.strip()

    # The text has to look like "[...]"; group 1 is whatever sits inside the brackets.
    bracketed = re.match(r"\[(.*)\]", text)
    if bracketed is None:
        raise NotImplementedError(f"{text}")
    inner = bracketed.group(1)

    try:
        # literal_eval only accepts Python literals, so it is safe on file contents.
        parsed = ast.literal_eval(text)
        if not isinstance(parsed, list) or type(parsed) is not list:
            raise NotImplementedError()
        result = np.array(parsed)
    except (SyntaxError, ValueError):
        # Not a valid literal list: treat every comma-separated piece as a string.
        result = np.array([piece.strip() for piece in inner.split(',')])
    return result

# Columns whose cells hold a bracketed list of values instead of a scalar.
# NOTE(review): 'Percipitation (in)' is presumably spelled as in the CSV header — confirm before renaming.
numerical_vector_column_labels = [
    'Temperature (F)',
    'Dewpoint (F)',
    'Humidity (%)',
    'Wind Speed (mph)',
    'Pressure (in)',
    'Percipitation (in)',
]
categorical_vector_column_labels = ["Condition"]
vector_column_labels = [*numerical_vector_column_labels, *categorical_vector_column_labels]

# Every vector column is parsed by the same converter while the CSV is being read
converters = dict.fromkeys(vector_column_labels, convert_to_numpy_array)
df = pd.read_csv(fname, converters=converters)
df.shape

## Verifying Data Quality

### Categorical Stuff

In [None]:
# All categorical columns, including the vector-valued "Condition" column
categorical_column_labels = [
    "Season Started",
    "Locations Affected",
    "Condition",
    "Type Of Location",
    "Y-Value",
]

# Categorical columns that hold a single value per row
categorical_scalar_column_labels = [
    "Season Started",
    "Locations Affected",
    "Type Of Location",
    "Y-Value",
]

In [None]:
# Print the frequency of every distinct value per scalar categorical column,
# leaving one blank line between columns
for cl in categorical_scalar_column_labels:
    counts = df[cl].value_counts()
    print(counts, end="\n\n")

### Vector Stuff

In [None]:
# Sub-frame holding only the columns that were parsed into numpy arrays on load
vector_columns = df[vector_column_labels]

In [None]:
# Flag, cell by cell, whether the value in each vector column is a numpy array
is_ndarray = vector_columns.map(lambda cell: isinstance(cell, np.ndarray))

# Collapse over rows first, then over columns: True only when every cell is an ndarray
all_columns_ok = is_ndarray.all()
all_columns_ok.all()

In [None]:
# Expected shape of every vector cell
desired_shape = (12,)

# Shape of each vector cell. Comparing this frame to the tuple directly
# (`shape == desired_shape`) does not work: pandas treats a tuple operand as
# list-like rather than as a single cell value, so the comparison is done
# cell by cell instead.
shape = vector_columns.map(lambda x: x.shape)
is_desired_shape = shape.map(lambda x: x == desired_shape)

# Boolean series containing whether all vectors in that row are of the desired shape
valid_shape_rows = is_desired_shape.all(axis=1)

# Whether all vectors are of desired shape
is_desired_shape.all().all()

## Generating Stats

Generate minimum, average, and maximum values for specified columns

In [None]:
# Columns to generate stats for.
# This is an alias of the numerical vector label list (same list object, not a copy).
columns_to_process = numerical_vector_column_labels

def compute_stats(arr):
    """Summarise one array as a Series with its minimum, mean and maximum.

    Args:
        arr: Array-like of values accepted by ``np.min``, ``np.mean`` and ``np.max``.

    Returns:
        A ``pd.Series`` indexed by ``'min'``, ``'avg'`` and ``'max'``.
    """
    labels = ['min', 'avg', 'max']
    values = [np.min(arr), np.mean(arr), np.max(arr)]
    return pd.Series(values, index=labels)

# Ordered mapping of output column label -> Series. Collecting the pieces here
# and concatenating once avoids re-copying the growing frame on every iteration.
# Assigning to a label that is already present replaces that entry in place,
# so re-running this cell on a frame that already carries the stats columns
# does not produce duplicate column labels.
new_columns = {}

for col in df.columns:
    # Every original column is carried over unchanged
    new_columns[col] = df[col]

    if col in columns_to_process:
        # One row of (min, avg, max) per numpy array stored in the column
        stats = df[col].apply(compute_stats)
        # Rename columns to include the source column label
        stats.columns = [f"{col}_min", f"{col}_avg", f"{col}_max"]
        # Place the statistics right after the column they describe
        new_columns.update(stats.items())

# With a mapping of Series and axis=1, the mapping keys become the column labels
new_df = pd.concat(new_columns, axis=1) if new_columns else pd.DataFrame()


In [None]:
# From here on `df` refers to the augmented frame: the original columns plus
# the *_min / *_avg / *_max columns added next to each numerical vector column.
df = new_df

## Data Preparation

### 1. Dealing with missing values

Checking the column values themselves for missing entries

In [None]:
# Restrict the check to the columns from "Year" through "Y-Value"; missing
# values outside that range (e.g. Fire Name) are not a concern
isna = df.loc[:, "Year":"Y-Value"].isna()

# Number of missing entries per column, shown only for columns that have any
missing_counts = isna.sum()
missing_counts[isna.any()]

Checking the contents of the numerical numpy-array columns for NaN entries

In [None]:
# Per cell: does the numerical vector contain at least one NaN?
has_nan = df[numerical_vector_column_labels].map(lambda x: np.any(np.isnan(x)))

# True when no numerical vector in any row or column contains a NaN
not has_nan.any().any()

### 2. Dealing with duplicates

TBD

### 3. Dealing with categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps categorical labels to integer codes
le = LabelEncoder()

In [None]:
# One fitted encoder is kept per column so that every integer code can later be
# mapped back to its label with `label_encoders[column].inverse_transform(...)`.
# Refitting a single shared encoder would keep only the last column's mapping.
label_encoders = {}

# Columns containing single categorical values
for cl in categorical_scalar_column_labels:
    le = LabelEncoder()
    df[cl] = le.fit_transform(df[cl])
    label_encoders[cl] = le

# Columns containing categorical vector values
for cl in categorical_vector_column_labels:
    le = LabelEncoder()
    # Fit on the labels of all rows of *this* column (not a hard-coded one),
    # flattened into a single 1-D array
    le.fit(np.concatenate(df[cl].tolist()))
    # Encode each row's vector of labels element-wise
    df[cl] = df[cl].apply(le.transform)
    label_encoders[cl] = le

### 4. Dealing with Outliers

TBD

### 5. Partitioning a data set

TBD

### 6. Feature scaling

TBD

### 7. Data visualization

TBD