# LADCA-INCAT-ENVECON: Regression Analysis

## Table of Contents <a class="anchor" id="top"></a>
[1. Import Used Packages and Functions](#section_1)<br>
[2. Data Loading and Cleaning](#section_2)<br>
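&nbsp;&nbsp;&nbsp;&nbsp;[2.1 Data Loading](#section_2_1)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[2.2 Data Filtering](#section_2_2)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[2.3 Data Merging](#section_2_3)<br>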
[3. Data Exploration](#section_3)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[3.1 Feature Information](#section_3_1)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[3.2 Feature Renaming and Dropping](#section_3_2)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[3.3 *activity* and *animal_ID* Feature Visualisation](#section_3_3)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[3.4 Missing Data Visualisation](#section_3_4)<br>
[4. Data Pre-processing](#section_4)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[4.1 Train-Test Split Preparation](#section_4_1)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[4.2 Train-Test Split](#section_4_2)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[4.3 Missing Data Imputation](#section_4_3)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[4.4 Outlier Detection and Removal](#section_4_4)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[4.5 Novelty Detection and Removal](#section_4_5)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[4.6 Majority Undersampling](#section_4_6)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[4.7 Minority Oversampling](#section_4_7)<br>
[5. Data Fitting](#section_5)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[5.1 Support Vector Machine Classification](#section_5_1)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[5.2 Random Forest Classification](#section_5_2)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[5.3 Quadratic Discriminant Analysis Classification](#section_5_3)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[5.4 k-Nearest Neighbours Classification](#section_5_4)<br>
[6. Dimensionality Reduction](#section_6)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[6.1 Principal Component Analysis (Linear Feature Extraction, Unsupervised)](#section_6_1)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[6.2 Linear Discriminant Analysis (Linear Feature Extraction, Supervised)](#section_6_2)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[6.3 Locally Linear Embedding (Non-Linear Feature Extraction, Unsupervised)](#section_6_3)<br>

<h2 style="display: inline"> 1. Import Used Packages and Functions </h2> <span style="float: right"><a href="#top">[back to top]</a></span> <a class="anchor" id="section_1"></a>

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

<h2 style="display: inline"> 2. Data Loading and Cleaning </h2> <span style="float: right"><a href="#top">[back to top]</a></span> <a class="anchor" id="section_2"></a>

<h2 style="display: inline"> 2.1 Data Loading </h2> <span style="float: right"><a href="#top">[back to top]</a></span> <a class="anchor" id="section_2_1"></a> 

In [6]:
# Folder that holds the combined UGent DRM parquet files
folder_path = r'N:\FWET\FDCH\AdsCatal\General\personal_work_folders\plasmacatdesign\drm\ugent'

# Full paths of the reaction and electrical data files inside that folder
file_path_reaction = os.path.join(
    folder_path, 'ugent_drm_reaction_data_combined.parquet'
)
file_path_electrical = os.path.join(
    folder_path, 'ugent_drm_electrical_data_combined.parquet'
)

# Load each parquet file into its own dataframe
df_reaction = pd.read_parquet(file_path_reaction)
df_electrical = pd.read_parquet(file_path_electrical)

<h2 style="display: inline"> 2.2 Data Filtering </h2> <span style="float: right"><a href="#top">[back to top]</a></span> <a class="anchor" id="section_2_2"></a> 

In [7]:
# --- Reaction data ---------------------------------------------------------

# Keep only the rows whose res_time_sec lies in the closed interval [10, 70]
reaction_in_window = df_reaction['res_time_sec'].between(10, 70)
df_reaction = df_reaction.loc[reaction_in_window].reset_index(drop=True)

# Substitute the raw 'material' labels with display names; a label that is
# not a key of this dictionary becomes NaN
reaction_material_labels = {
    'SASOL 1.8': 'γ-Al₂O₃',
    '2% Cu(II)(NO3)2 @ SASOL 1.8': '2% CuO@γ-Al₂O₃',
    '10% Cu(II)(NO3)2 @ SASOL 1.8': '10% CuO@γ-Al₂O₃',
    '2% Fe(III)Citrate @ SASOL 1.8': '2% Fe₂O₃@γ-Al₂O₃',
    '10% Fe(III)Citrate @ SASOL 1.8': '10% Fe₂O₃@γ-Al₂O₃',
}
df_reaction['material'] = df_reaction['material'].map(reaction_material_labels)

# Give the reaction columns the same names as their df_electrical
# counterparts ('material_name' and 'residence_time_s')
df_reaction = df_reaction.rename(
    columns={
        'material': 'material_name',
        'res_time_sec': 'residence_time_s',
    }
)

# --- Electrical data -------------------------------------------------------

# Keep only the 'plasma' rows whose residence_time_s lies in [10, 70]
electrical_keep = (
    df_electrical['residence_time_s'].between(10, 70)
    & (df_electrical['plasma_state'] == 'plasma')
)
df_electrical = df_electrical.loc[electrical_keep].reset_index(drop=True)

# Substitute the raw 'material_name' labels with the same display names
# used for the reaction data; unlisted labels become NaN
electrical_material_labels = {
    'sasol-1.8-c450': 'γ-Al₂O₃',
    'sasol-1.8-cuo-02%': '2% CuO@γ-Al₂O₃',
    'sasol-1.8-cuo-10%': '10% CuO@γ-Al₂O₃',
    'sasol-1.8-fe2o3-02%': '2% Fe₂O₃@γ-Al₂O₃',
    'sasol-1.8-fe2o3-10%': '10% Fe₂O₃@γ-Al₂O₃',
}
df_electrical['material_name'] = df_electrical['material_name'].map(
    electrical_material_labels
)

# For every (material_name, residence_time_s) group keep only the rows that
# carry the group's highest measurement_number, then drop that helper column
max_measurement_number = df_electrical.groupby(
    ['material_name', 'residence_time_s']
)['measurement_number'].transform('max')
df_electrical = (
    df_electrical
    .loc[df_electrical['measurement_number'] == max_measurement_number]
    .drop(columns=['measurement_number'])
    .reset_index(drop=True)
)

<h2 style="display: inline"> 2.3 Data Merging </h2> <span style="float: right"><a href="#top">[back to top]</a></span> <a class="anchor" id="section_2_3"></a> 

In [None]:
# Join the reaction and electrical rows that share the same material name
# and residence time (pd.merge performs an inner join by default)
df_combined = pd.merge(
    left=df_reaction,
    right=df_electrical,
    on=['material_name', 'residence_time_s'],
)

# sei_ideal_kjmol: derived from the residence time alone, rounded to the
# nearest ten and stored as an integer
# NOTE(review): the constants 30, 0.4687, 17.31 and 24.055 are not explained
# anywhere in this notebook section — confirm their meaning and units
sei_denominator = (1 - 0.4687) * (17.31 / df_combined['residence_time_s'])
df_combined['sei_ideal_kjmol'] = (
    (30 / sei_denominator * 24.055).round(-1).astype(int)
)

# Replace the plain-text compound formulas with their subscripted forms;
# any compound that is not listed here becomes NaN
compound_labels = {
    'CO2': 'CO₂',
    'H2': 'H₂',
    'CO': 'CO',
    'O2': 'O₂',
    'CH4': 'CH₄',
    'C2H6': 'C₂H₆',
    'C2H4': 'C₂H₄',
    'C2H2': 'C₂H₂',
    'C3H8': 'C₃H₈',
    'CH3OH': 'CH₃OH',
    'C2H5OH': 'C₂H₅OH',
    'CH3OCH3': 'CH₃OCH₃',
    'Total': 'Total',
}
df_combined['compound'] = df_combined['compound'].map(compound_labels)

# Renumber the rows from zero
df_combined = df_combined.reset_index(drop=True)

# Show which columns the combined dataframe ended up with
print(df_combined.columns)