In [18]:
import pandas as pd
# Set max columns to display
pd.set_option('display.max_columns', None)

import numpy as np
from sklearn import preprocessing as pre
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score



In [19]:

# # Importing CSV files
# df_CDunit = pd.read_csv('https://raw.githubusercontent.com/saust1/Project-OptiC4/main/1%20Preprocess/Continuous%20Data/cont_unitData_clean.csv')
# df_AlCon = pd.read_csv('https://raw.githubusercontent.com/saust1/Project-OptiC4/main/1%20Preprocess/Continuous%20Data/cont_425Data_clean.csv')
# df_FB554 = pd.read_csv('https://raw.githubusercontent.com/saust1/Project-OptiC4/main/1%20Preprocess/Continuous%20Data/cont_554Data_clean.csv')


In [20]:
# Load the three cleaned continuous-data CSV files.
# NOTE(review): these are machine-specific absolute paths (the "bordeCode" directory);
# they must be edited before the notebook can run on another machine.
unit_csv_path = r'C:\Users\austinsh\Project-OptiC4\IV Optimize\Continuous Data\cont_unitData_clean.csv'
alcon_csv_path = r'C:\Users\austinsh\Project-OptiC4\IV Optimize\Continuous Data\cont_425Data_clean.csv'
fb554_csv_path = r'C:\Users\austinsh\Project-OptiC4\IV Optimize\Continuous Data\cont_554Data_clean.csv'

df_CDunit = pd.read_csv(unit_csv_path)
df_AlCon = pd.read_csv(alcon_csv_path)
df_FB554 = pd.read_csv(fb554_csv_path)


In [21]:
# Print summary statistics for each loaded frame, in the order unit -> AlCon -> FB554.
for summary_frame in (df_CDunit, df_AlCon, df_FB554):
    print(summary_frame.describe())


            DI55102       DI55152       FC55003       FC55009       FC55552  \
count  48806.000000  48806.000000  48806.000000  48806.000000  48806.000000   
mean       0.944831      0.933324   5995.076190    847.340213  36160.195195   
std        0.052697      0.029800    851.268879    614.602209   4676.181585   
min        0.800007      0.837341   2861.120000      0.000000  18205.900000   
25%        0.910894      0.913208   5483.547500    294.673500  35010.300000   
50%        0.950356      0.933329   6015.470000    837.947500  37970.900000   
75%        0.984626      0.952618   6556.990000   1327.720000  39030.775000   
max        1.061510      1.025180   9134.330000   2674.790000  52000.000000   

            FC55569       FC55576      FFC55555       LC55557       LC90366  \
count  48806.000000  48806.000000  48806.000000  48806.000000  48806.000000   
mean    6608.439755    368.052255      0.772328     69.428048     46.226414   
std      381.543221    256.912595      0.022550    

In [22]:
# Report the dtype of each frame's 'Date' column as loaded from CSV.
date_dtype_checks = (
    ("Data type for 'Date' column in df_CDunit:", df_CDunit),
    ("Data type for 'Date' column in df_FB554:", df_FB554),
    ("Data type for 'Date' column in df_AlCon:", df_AlCon),
)
for message, frame in date_dtype_checks:
    print(message, frame['Date'].dtypes)


Data type for 'Date' column in df_CDunit: object
Data type for 'Date' column in df_FB554: object
Data type for 'Date' column in df_AlCon: object


In [23]:
# Parse each frame's 'Date' column into datetimes in place.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
for frame in (df_CDunit, df_FB554, df_AlCon):
    frame['Date'] = pd.to_datetime(frame['Date'], errors='coerce')


In [24]:
# Re-check the 'Date' dtypes after the datetime conversion.
date_dtype_checks = (
    ("Data type for 'Date' column in df_CDunit:", df_CDunit),
    ("Data type for 'Date' column in df_FB554:", df_FB554),
    ("Data type for 'Date' column in df_AlCon:", df_AlCon),
)
for message, frame in date_dtype_checks:
    print(message, frame['Date'].dtypes)


Data type for 'Date' column in df_CDunit: datetime64[ns]
Data type for 'Date' column in df_FB554: datetime64[ns]
Data type for 'Date' column in df_AlCon: datetime64[ns]


In [25]:
# List the column labels of each frame, in the order unit -> FB554 -> AlCon.
for frame in (df_CDunit, df_FB554, df_AlCon):
    print(frame.columns)


Index(['Date', 'DI55102', 'DI55152', 'FC55003', 'FC55009', 'FC55552',
       'FC55569', 'FC55576', 'FFC55555', 'LC55557', 'LC90366', 'LC90368',
       'PI55004', 'PI55020', 'TC55552', 'TI55021'],
      dtype='object')
Index(['Date', 'Butanol'], dtype='object')
Index(['Date', '425_pct_Al', 'C4_pct_Eth', 'C4_pct_H2O', 'HydWtr_pct_Ammonia',
       'HydWtr_Na2O'],
      dtype='object')


In [26]:
def apply_rolling_average_to_df(df, rolling_size):
    """
    Smooth every column of ``df`` with a trailing rolling mean.

    :param df: DataFrame carrying 'Date' either as its index (index name 'Date') or as a column.
    :param rolling_size: Window passed to ``DataFrame.rolling``.
    :return: New DataFrame of rolling means with 'Date' restored as a regular column.
    """
    # Use 'Date' as the index so it is excluded from the averaging itself.
    indexed = df if df.index.name == 'Date' else df.set_index('Date')

    # min_periods=1 lets the first rows average over however many rows exist so far,
    # so no leading NaN rows are produced.
    return (
        indexed
        .rolling(window=rolling_size, min_periods=1)
        .mean()
        .reset_index()
    )



In [27]:
def apply_time_shift_by_hours(df, shift_hours):
    """
    Return a copy of the DataFrame whose 'Date' index is shifted by the given number of hours.

    The input frame is never modified, so calling this repeatedly on the same
    frame always yields the same result.

    :param df: DataFrame with 'Date' as its index (index name 'Date') or as a column.
    :param shift_hours: Number of hours to shift. Can be positive (forward) or negative (backward).
    :return: New DataFrame indexed by the shifted 'Date' values.
    """
    if df.index.name == 'Date':
        # Copy first: assigning to .index below would otherwise shift the caller's
        # frame in place, and a second call would shift it again.
        shifted = df.copy()
    else:
        # assign() builds a new frame, leaving the caller's 'Date' column untouched.
        # Unparseable dates become NaT (errors='coerce').
        shifted = df.assign(
            Date=pd.to_datetime(df['Date'], errors='coerce')
        ).set_index('Date')

    # Ensure the index is a DatetimeIndex, then move every timestamp by the offset.
    shifted.index = pd.to_datetime(shifted.index) + pd.Timedelta(hours=shift_hours)

    return shifted

In [28]:
# # Usage Examples
# shift_hours_AlCon = 1  # Shift for df_AlCon in hours (use a negative value, e.g., -5, to shift backward)
# shift_hours_FB554 = 5   # Positive shift for df_FB554 (e.g., 5 hours forward)

# shifted_df_AlCon = apply_time_shift_by_hours(df_AlCon, shift_hours_AlCon)
# print("Shifted df_AlCon:")
# print(shifted_df_AlCon.head())

# shifted_df_FB554 = apply_time_shift_by_hours(df_FB554, shift_hours_FB554)
# print("\nShifted df_FB554:")
# print(shifted_df_FB554.head())

In [29]:
def join_df_FB554_to_df_CDunit(df_CDunit, df_FB554):
    """
    Attach to each df_FB554 row the df_CDunit row with the nearest 'Date'.

    Neither input frame is modified.

    :param df_CDunit: DataFrame with 'Date' as its index (index name 'Date') or as a column.
    :param df_FB554: DataFrame with 'Date' as its index (index name 'Date') or as a column.
    :return: DataFrame with one row per df_FB554 row that has a valid 'Date', holding the
             df_FB554 columns followed by the columns of the nearest-in-time df_CDunit row.
    """
    def _dated_and_sorted(frame):
        # merge_asof needs 'Date' as a sorted datetime column with no missing keys.
        if frame.index.name == 'Date':
            frame = frame.reset_index()
        # assign() returns a new frame, so the caller's 'Date' column is not overwritten.
        frame = frame.assign(Date=pd.to_datetime(frame['Date'], errors='coerce'))
        return frame.dropna(subset=['Date']).sort_values('Date')

    cdunit_sorted = _dated_and_sorted(df_CDunit)
    fb554_sorted = _dated_and_sorted(df_FB554)

    # df_FB554 is the left side, so the result keeps its rows.
    # direction='nearest' matches the closest timestamp, earlier or later.
    combined_df = pd.merge_asof(fb554_sorted, cdunit_sorted, on='Date', direction='nearest')

    return combined_df

def join_df_AlCon_to_combined_df(combined_df, df_AlCon):
    """
    Attach to each df_AlCon row the combined_df row with the nearest 'Date'.

    Neither input frame is modified.

    :param combined_df: DataFrame with 'Date' as its index (index name 'Date') or as a column.
    :param df_AlCon: DataFrame with 'Date' as its index (index name 'Date') or as a column.
    :return: DataFrame with one row per df_AlCon row that has a valid 'Date', holding the
             df_AlCon columns followed by the columns of the nearest-in-time combined_df row.
    """
    def _dated_and_sorted(frame):
        # merge_asof needs 'Date' as a sorted datetime column with no missing keys.
        if frame.index.name == 'Date':
            frame = frame.reset_index()
        # assign() returns a new frame, so the caller's 'Date' column is not overwritten.
        frame = frame.assign(Date=pd.to_datetime(frame['Date'], errors='coerce'))
        return frame.dropna(subset=['Date']).sort_values('Date')

    combined_sorted = _dated_and_sorted(combined_df)
    alcon_sorted = _dated_and_sorted(df_AlCon)

    # df_AlCon is the left side, so the result keeps its rows.
    # direction='nearest' matches the closest timestamp, earlier or later.
    combined_df_all = pd.merge_asof(alcon_sorted, combined_sorted, on='Date', direction='nearest')

    return combined_df_all


In [30]:
# Confirm the 'Date' dtypes once more before the frames enter the processing pipeline.
date_dtype_checks = (
    ("Data type for 'Date' column in df_CDunit:", df_CDunit),
    ("Data type for 'Date' column in df_FB554:", df_FB554),
    ("Data type for 'Date' column in df_AlCon:", df_AlCon),
)
for message, frame in date_dtype_checks:
    print(message, frame['Date'].dtypes)



Data type for 'Date' column in df_CDunit: datetime64[ns]
Data type for 'Date' column in df_FB554: datetime64[ns]
Data type for 'Date' column in df_AlCon: datetime64[ns]


In [31]:
def process_data_limited(cdunit_window=8, fb554_window=4, alcon_window=2,
                         alcon_shift_hours=-1, fb554_shift_hours=1):
    """
    Smooth, time-shift and merge the three module-level source frames.

    Reads the notebook-level ``df_CDunit``, ``df_FB554`` and ``df_AlCon`` frames.
    With the default arguments the result is the same as the original
    hard-coded pipeline (windows 8 / 4 / 2, shifts -1 h / +1 h).

    :param cdunit_window: Rolling-mean window applied to df_CDunit.
    :param fb554_window: Rolling-mean window applied to df_FB554.
    :param alcon_window: Rolling-mean window applied to df_AlCon.
    :param alcon_shift_hours: Hours to shift the smoothed df_AlCon timestamps (negative = backward).
    :param fb554_shift_hours: Hours to shift the smoothed df_FB554 timestamps (negative = backward).
    :return: DataFrame with one row per df_AlCon timestamp, joined to the nearest
             df_FB554 / df_CDunit readings.
    """
    # Smooth each source with its own rolling-mean window.
    rolled_df_CDunit = apply_rolling_average_to_df(df_CDunit, cdunit_window)
    rolled_df_FB554 = apply_rolling_average_to_df(df_FB554, fb554_window)
    rolled_df_AlCon = apply_rolling_average_to_df(df_AlCon, alcon_window)

    # Shift the AlCon and FB554 timestamps; the unit data is left unshifted.
    rolled_df_AlCon_shifted = apply_time_shift_by_hours(rolled_df_AlCon, alcon_shift_hours)
    rolled_df_FB554_shifted = apply_time_shift_by_hours(rolled_df_FB554, fb554_shift_hours)

    # First join: nearest unit-data row for every FB554 row.
    combined_df = join_df_FB554_to_df_CDunit(rolled_df_CDunit, rolled_df_FB554_shifted)

    # Second join: nearest combined row for every AlCon row.
    combined_df_all = join_df_AlCon_to_combined_df(combined_df, rolled_df_AlCon_shifted)

    return combined_df_all


In [32]:
# Prerequisite: the helper functions and the three source frames
# (df_CDunit, df_FB554, df_AlCon) must already be defined by the cells above.

# Run the roll / shift / merge pipeline with its default settings.
final_dataset = process_data_limited()

# Preview the first five merged rows.
print(final_dataset.head(n=5))


                 Date  425_pct_Al  C4_pct_Eth  C4_pct_H2O  HydWtr_pct_Ammonia  \
0 2012-05-16 15:00:00    6.306930    2.505710    21.95810            0.909887   
1 2012-05-16 16:00:00    6.307280    2.507065    21.96075            0.909326   
2 2012-05-16 18:00:00    6.308335    2.511135    21.96865            0.907643   
3 2012-05-16 19:00:00    6.309390    2.515210    21.97655            0.905959   
4 2012-05-16 20:00:00    6.310090    2.517925    21.98185            0.904837   

   HydWtr_Na2O    Butanol   DI55102   DI55152    FC55003   FC55009   FC55552  \
0     2.618940  58.526500  0.970151  0.924718  4968.4400  1523.030  41476.45   
1     2.665560  58.526500  0.970151  0.924718  4968.4400  1523.030  41476.45   
2     2.805415  56.692400  0.970151  0.924718  4968.4400  1523.030  41476.45   
3     2.945270  56.692400  0.970151  0.924718  4968.4400  1523.030  41476.45   
4     3.038505  53.635533  0.970441  0.923875  5205.8625  1517.235  41490.25   

    FC55569    FC55576  FFC55555

In [33]:
# Save DataFrame to CSV file in the same directory as the Jupyter Notebook
# model_results.to_csv('merged_data.csv', index=False)
                     
# Save DataFrame to CSV file in the same directory as the Jupyter Notebook
# df_CD.to_csv(r'C:\Users\steve\OneDrive\1. BAIUTEK\Project-OptiC4\1 Preprocess\Continuous Data\contData_all.csv', index=False)            

# Write the merged dataset, without the row index, to a machine-specific absolute path.
merged_csv_path = r'C:\Users\austinsh\Project-OptiC4\IV Optimize\Merge Data\merged_data.csv'
final_dataset.to_csv(merged_csv_path, index=False)


In [34]:
from datetime import datetime

# Capture the local wall-clock time at this point in the run and display it.
current_date_time = datetime.now()
print(str(current_date_time))


2024-03-25 13:03:26.929972
