In [1]:
import pandas as pd
import numpy as np 

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the two Kaggle CSV inputs. test_full is the frame that the feature-engineering
# cells below extend column by column.
# NOTE(review): full_submission is not referenced again in the visible cells — confirm it is still needed.
full_submission = pd.read_csv("Data/Kaggle/full_submission.csv")
test_full = pd.read_csv("Data/Kaggle/test-full.csv")


# Hydrology Engineering

In [3]:
# Straight-line distance to hydrology, combining the vertical and horizontal offsets.
# The column name's spelling is left untouched because it ends up in the exported data.
test_full['Euclidian_Distance'] = np.sqrt(
    test_full['Vertical_Distance_To_Hydrology'] ** 2
    + test_full['Horizontal_Distance_To_Hydrology'] ** 2
)

# Ratio of vertical to horizontal offset. 0 / 0 produces NaN, which is mapped to 0.
# The filled Series is assigned back to the frame: calling fillna(inplace=True) on a
# `.loc[:, col]` selection acts on a temporary object and is not guaranteed to update
# test_full (it never does under pandas Copy-on-Write).
# NOTE(review): non-zero / 0 still yields +/-inf, exactly as before — confirm that
# downstream consumers can handle infinite values.
test_full['Hydrology_Ratio'] = (
    test_full['Vertical_Distance_To_Hydrology'] / test_full['Horizontal_Distance_To_Hydrology']
).fillna(0)
# IDEA: "Horizontal_Distance_To_Hydrology" and "Vertical_Distance_To_Hydrology" are highly correlated --> Euclidean distance to hydrology.
# PRACTICE: Doesn't really seem to make sense

# Hillshade Engineering

In [4]:
import numpy as np

# Calculate mean Hillshade
def calculate_mean_hillshade(df):
    """Add a 'Mean_Hillshade' column: the row-wise mean of the three hillshade readings.

    The frame is modified in place and also returned so calls can be chained.
    """
    hillshade_cols = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    row_mean = df[hillshade_cols].mean(axis=1)
    df['Mean_Hillshade'] = row_mean
    return df

# Calculate median Hillshade
def calculate_median_hillshade(df):
    """Add a 'Median_Hillshade' column: the row-wise median of the three hillshade readings.

    The frame is modified in place and also returned so calls can be chained.
    """
    hillshade_cols = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    row_median = df[hillshade_cols].median(axis=1)
    df['Median_Hillshade'] = row_median
    return df

# Calculate standard deviation of Hillshade
def calculate_std_hillshade(df):
    """Add a 'Std_Hillshade' column: the row-wise standard deviation of the hillshade readings.

    Uses the pandas default for ``std`` (sample standard deviation). The frame is
    modified in place and also returned so calls can be chained.
    """
    hillshade_cols = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    row_std = df[hillshade_cols].std(axis=1)
    df['Std_Hillshade'] = row_std
    return df

# Calculate range of Hillshade
def calculate_range_hillshade(df):
    """Add a 'Range_Hillshade' column: row-wise max minus min of the hillshade readings.

    The frame is modified in place and also returned so calls can be chained.
    """
    hillshade = df[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']]
    brightest = hillshade.max(axis=1)
    darkest = hillshade.min(axis=1)
    df['Range_Hillshade'] = brightest - darkest
    return df

# Calculate differences between Hillshade indices
def calculate_hillshade_differences(df):
    """Add the three pairwise differences between the hillshade readings.

    Creates 'Diff_9am_Noon', 'Diff_Noon_3pm' and 'Diff_9am_3pm' (earlier reading
    minus later reading). The frame is modified in place and also returned.
    """
    morning = df['Hillshade_9am']
    noon = df['Hillshade_Noon']
    afternoon = df['Hillshade_3pm']

    df['Diff_9am_Noon'] = morning - noon
    df['Diff_Noon_3pm'] = noon - afternoon
    df['Diff_9am_3pm'] = morning - afternoon
    return df

# Run every hillshade feature builder over the test set, in the same order as before
# (mean, median, std, range, pairwise differences).
test_full = (
    test_full
    .pipe(calculate_mean_hillshade)
    .pipe(calculate_median_hillshade)
    .pipe(calculate_std_hillshade)
    .pipe(calculate_range_hillshade)
    .pipe(calculate_hillshade_differences)
)


# Soiltype Engineering

In [5]:
"""
First digit: climatic zone

1: lower montane dry: -
2: lower montane: 1-6
3: montane dry: 7, 8
4: montane: 9-13
5: montane dry and montane: 14, 15 # Should we add these two two the two previous ones each ? 
6: montane and subalpine: 16-18
7: subalpine: 19-34
8: alpine: 35-40
Second digit: geologic zones

1: alluvium: 14-17, 19-21
2: glacial: 9, 22, 23
3: shale: -
4: sandstone: -
5: mixed sedimentary: 7, 8
6: unspecified in the USFS ELU Survey: -
7: igneous and metamorphic: 1-6, 10-13, 18, 24-40
8: volcanic: -


The third and fourth ELU digits are unique to the mapping unit and have no special meaning to the climatic or geologic zones.

"""

'\nFirst digit: climatic zone\n\n1: lower montane dry: -\n2: lower montane: 1-6\n3: montane dry: 7, 8\n4: montane: 9-13\n5: montane dry and montane: 14, 15 # Should we add these two two the two previous ones each ? \n6: montane and subalpine: 16-18\n7: subalpine: 19-34\n8: alpine: 35-40\nSecond digit: geologic zones\n\n1: alluvium: 14-17, 19-21\n2: glacial: 9, 22, 23\n3: shale: -\n4: sandstone: -\n5: mixed sedimentary: 7, 8\n6: unspecified in the USFS ELU Survey: -\n7: igneous and metamorphic: 1-6, 10-13, 18, 24-40\n8: volcanic: -\n\n\nThe third and fourth ELU digits are unique to the mapping unit and have no special meaning to the climatic or geologic zones.\n\n'

In [6]:
# First digit: climatic zone
# Each zone column is the row-wise sum of the soil-type indicator columns whose
# names end in 'Type<N>' for the soil numbers listed for that zone.
climatic_zones = {
    'lower_montane': range(1, 7),
    'montane_dry': (7, 8),
    'montane': range(9, 14),
    'montane_dry_and_montane': (14, 15),
    'montane_and_subalpine': (16, 17, 18),
    'subalpine': range(19, 35),
    'alpine': range(35, 41),
}
for zone_name, soil_numbers in climatic_zones.items():
    zone_suffixes = tuple(f'Type{i}' for i in soil_numbers)
    zone_columns = [col for col in test_full if col.endswith(zone_suffixes)]
    test_full.loc[:, zone_name] = test_full.loc[:, zone_columns].sum(axis=1)


In [7]:
# Second digit: geologic zones
# Each zone column is the row-wise sum of the soil-type indicator columns whose
# names end in 'Type<N>' for the soil numbers listed for that zone.
# Fix: alluvium previously used range(14, 8) (empty) and range(19, 21) (stops at 20),
# so it only covered soil types 19 and 20 instead of 14-17 and 19-21.
geologic_zones = {
    'alluvium': [*range(14, 18), *range(19, 22)],
    'glacial': [9, 22, 23],
    'mixed_sedimentary': [7, 8],
    'igneous_and_metamorphic': [*range(1, 7), *range(10, 14), 18, *range(24, 41)],
}
for zone_name, soil_numbers in geologic_zones.items():
    zone_suffixes = tuple(f'Type{i}' for i in soil_numbers)
    zone_columns = [col for col in test_full if col.endswith(zone_suffixes)]
    test_full.loc[:, zone_name] = test_full.loc[:, zone_columns].sum(axis=1)


# Aspect Engineering

In [8]:

# Define the bin edges for each sector (in degrees)
bin_edges = [0,  22.5,  67.5, 112.5, 157.5, 202.5, 247.5, 292.5, 337.5, 360]

# Define labels for each sector; north wraps around 0/360 degrees, so it owns
# both the first bin ('N1') and the last bin ('N2').
bin_labels = ['N1', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW', 'N2']

# Collapse the two north bins into a single 'N' label up front. This replaces the
# earlier `test_full['Aspect_Sector'].replace(..., inplace=True)` step, which
# operated on a column selection and is not guaranteed to update the frame
# (it is a no-op under pandas Copy-on-Write, leaving 'N1'/'N2' in place).
sector_labels = ['N' if label in ('N1', 'N2') else label for label in bin_labels]
sector_order = list(dict.fromkeys(sector_labels))  # unique labels, compass order

# Create a new column 'Aspect_Sector' to store the discretized aspect.
# ordered=False is what allows pd.cut to accept the repeated 'N' label; the
# categories are then pinned to compass order so that columns derived from them
# (e.g. one-hot dummies) come out in a stable N, NE, E, ... sequence.
test_full['Aspect_Sector'] = pd.cut(
    test_full['Aspect'],
    bins=bin_edges,
    labels=sector_labels,
    include_lowest=True,
    ordered=False,
).cat.set_categories(sector_order, ordered=True)

# Display the first few rows of the DataFrame to verify the changes
print(test_full[['Aspect', 'Aspect_Sector']].head())


   Aspect Aspect_Sector
0      51            NE
1      56            NE
2     139            SE
3     155            SE
4      45            NE


In [9]:
# One-hot encode the aspect sector (integer 0/1 columns prefixed 'Aspect_Sector')
# and attach the indicator columns to the test set by matching row index.
dummy_aspect = pd.get_dummies(
    test_full['Aspect_Sector'],
    prefix='Aspect_Sector',
    dtype=int,
)
test_full = pd.merge(test_full, dummy_aspect, left_index=True, right_index=True)

# Aspect / Hillshade Engineering

In [10]:

def categorize_aspect(aspect):
    """Map an aspect in degrees to a sign: 1 for [0, 180], -1 for (180, 360].

    Any value outside [0, 360] (including NaN, for which every comparison is
    false) yields None.
    """
    if not 0 <= aspect <= 360:
        return None
    return 1 if aspect <= 180 else -1

    
#Bin Aspect 0-180, 180-360
def bin_aspect(df):
    """Add a 'Binned_Aspect' column by applying categorize_aspect to every 'Aspect' value.

    The frame is modified in place and also returned so calls can be chained.
    """
    aspect_sign = df['Aspect'].apply(categorize_aspect)
    df['Binned_Aspect'] = aspect_sign
    return df

#Calculate Slope times aspect binned1 or -1 for each direction in respect to sunset and sunrising
def calculate_slope_aspect(df):
    """Add a 'Slope_Aspect' column: 'Slope' multiplied by the 'Binned_Aspect' sign.

    Expects 'Binned_Aspect' to exist already. The frame is modified in place and
    also returned so calls can be chained.
    """
    signed_slope = df['Slope'] * df['Binned_Aspect']
    df['Slope_Aspect'] = signed_slope
    return df

def calculate_hillshade_slope_Aspect(df):
    """Add hillshade features weighted by the signed slope ('Slope_Aspect').

    'Hillshade_3_9_Slope_Aspect' is the 3pm and 9am readings, each multiplied by
    'Slope_Aspect', then summed; 'Hillshade_Noon_Slope_Aspect' is the noon reading
    multiplied by 'Slope_Aspect'. The frame is modified in place and also returned.
    """
    signed_slope = df['Slope_Aspect']
    afternoon_term = df['Hillshade_3pm'] * signed_slope
    morning_term = df['Hillshade_9am'] * signed_slope

    df['Hillshade_3_9_Slope_Aspect'] = afternoon_term + morning_term
    df['Hillshade_Noon_Slope_Aspect'] = df['Hillshade_Noon'] * signed_slope
    return df
    
def calculate_hillshade_slope(df):
    """Add hillshade features weighted by the unsigned 'Slope'.

    'Hillshade_3_9_Slope' is the 3pm and 9am readings, each multiplied by 'Slope',
    then summed; 'Hillshade_Noon_Slope' is the noon reading multiplied by 'Slope'.
    The frame is modified in place and also returned.
    """
    slope = df['Slope']
    afternoon_term = df['Hillshade_3pm'] * slope
    morning_term = df['Hillshade_9am'] * slope

    df['Hillshade_3_9_Slope'] = afternoon_term + morning_term
    df['Hillshade_Noon_Slope'] = df['Hillshade_Noon'] * slope
    return df


# Build the aspect/slope/hillshade interaction features. Order matters: the signed
# slope needs 'Binned_Aspect', and the signed hillshade products need 'Slope_Aspect'.
test_full = (
    test_full
    .pipe(bin_aspect)
    .pipe(calculate_slope_aspect)
    .pipe(calculate_hillshade_slope_Aspect)
    .pipe(calculate_hillshade_slope)
)


In [12]:
import pandas as pd
import math

# Assuming df is your DataFrame with the 'elevation' feature

# Function to generate features
def generate_features(df, n_bins=5, extreme_percentile=85):
    """Add elevation-derived feature columns to ``df``.

    The frame is modified in place and also returned. Columns are created in this
    order: elevation_squared, elevation_cubed, log_elevation, sqrt_elevation,
    reciprocal_elevation, positive_elevation, negative_elevation, elevation_bin,
    extreme_elevation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'Elevation' column.
    n_bins : int, default 5
        Number of equal-width bins used for 'elevation_bin'.
    extreme_percentile : float, default 85
        Percentile of 'Elevation' above which 'extreme_elevation' is 1.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    elevation = df['Elevation']
    is_positive = elevation > 0
    is_nonzero = elevation != 0

    # Polynomial terms
    df['elevation_squared'] = elevation ** 2
    df['elevation_cubed'] = elevation ** 3

    # Natural log, defined as 0 where elevation is not > 0. Those entries are
    # swapped for 1 before the log (log(1) == 0), which also avoids invalid-value
    # warnings. This used to be computed twice; the first result was overwritten.
    df['log_elevation'] = np.log(elevation.where(is_positive, 1))

    # Square root, defined as 0 where elevation is not > 0 (sqrt(0) == 0)
    df['sqrt_elevation'] = np.sqrt(elevation.where(is_positive, 0))

    # Reciprocal, defined as 0 where elevation == 0; the divisor is swapped for 1
    # on those rows so no division by zero is ever evaluated.
    df['reciprocal_elevation'] = np.where(is_nonzero, 1.0 / elevation.where(is_nonzero, 1), 0.0)

    # Sign indicators
    df['positive_elevation'] = is_positive.astype('int64')
    df['negative_elevation'] = (elevation < 0).astype('int64')

    # Equal-width binning; labels=False stores the integer bin index
    df['elevation_bin'] = pd.cut(elevation, bins=n_bins, labels=False)

    # Indicator for elevations strictly above the chosen percentile (85th by default)
    threshold = np.percentile(elevation, extreme_percentile)
    df['extreme_elevation'] = (elevation > threshold).astype(int)

    return df

# Apply feature generation function: adds the elevation-derived columns to test_full.
# generate_features writes the columns onto the frame it receives and returns that
# same frame, so the reassignment keeps the name bound to the updated data.
test_full = generate_features(test_full)


  result = getattr(ufunc, method)(*inputs, **kwargs)


# Exporting Final Data

In [13]:
# Write the engineered test set to disk; index=False leaves the row index out of the CSV.
test_full.to_csv("Data/test_engineered.csv", index=False)