In [None]:
import pandas as pd
# Set max columns to display
pd.set_option('display.max_columns', None)

import numpy as np
from sklearn import preprocessing as pre
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score



In [None]:

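# # Importing CSV files from GitHub (disabled; the local-path cell below is the active loader)
# NOTE(review): df_CDunit and df_FB554 point at swapped files here compared with the
# active local-path cell (cont_554Data vs cont_unitData) - confirm before re-enabling.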
# df_CDunit = pd.read_csv('https://raw.githubusercontent.com/saust1/Project-OptiC4/main/1%20Preprocess/Continuous%20Data/cont_554Data_clean.csv')
# df_AlCon = pd.read_csv('https://raw.githubusercontent.com/saust1/Project-OptiC4/main/1%20Preprocess/Continuous%20Data/cont_425Data_clean.csv')
# df_FB554 = pd.read_csv('https://raw.githubusercontent.com/saust1/Project-OptiC4/main/1%20Preprocess/Continuous%20Data/cont_unitData_clean.csv')


In [None]:
# Load the three cleaned continuous-data extracts from the local project checkout.
# CONT_DATA_DIR is the only machine-specific value in this cell: point it at your
# own copy of the "Continuous Data" folder when running on another machine.
CONT_DATA_DIR = r'C:\Users\austinsh\Project-OptiC4\II Data\2 Preprocessing\Continuous Data'

df_CDunit = pd.read_csv(rf'{CONT_DATA_DIR}\cont_unitData_clean.csv')
df_AlCon = pd.read_csv(rf'{CONT_DATA_DIR}\cont_425Data_clean.csv')
df_FB554 = pd.read_csv(rf'{CONT_DATA_DIR}\cont_554Data_clean.csv')


In [None]:
# Summary statistics for each loaded frame, one after another.
print(
    *(frame.describe() for frame in (df_CDunit, df_AlCon, df_FB554)),
    sep="\n",
)


In [None]:
# Report the 'Date' dtype of each frame as loaded (before the datetime conversion cell).
for _message, _frame in (
    ("Data type for 'Date' column in df_CDunit:", df_CDunit),
    ("Data type for 'Date' column in df_FB554:", df_FB554),
    ("Data type for 'Date' column in df_AlCon:", df_AlCon),
):
    print(_message, _frame['Date'].dtype)


In [None]:
# Parse 'Date' in place on every frame; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
for _frame in (df_CDunit, df_FB554, df_AlCon):
    _frame['Date'] = pd.to_datetime(_frame['Date'], errors='coerce')


In [None]:
# Re-check the 'Date' dtypes now that the columns have been passed through pd.to_datetime.
_date_dtype_labels = (
    ("Data type for 'Date' column in df_CDunit:", df_CDunit),
    ("Data type for 'Date' column in df_FB554:", df_FB554),
    ("Data type for 'Date' column in df_AlCon:", df_AlCon),
)
for _label, _frame in _date_dtype_labels:
    print(_label, _frame['Date'].dtypes)


In [None]:
# List the column labels of each frame.
print(
    *(frame.columns for frame in (df_CDunit, df_FB554, df_AlCon)),
    sep="\n",
)


In [None]:
def apply_rolling_average_to_df(df, rolling_size):
    """
    Smooth every column of ``df`` with a rolling mean and return 'Date' as a column.

    :param df: DataFrame with 'Date' either as a column or as the (named) index.
    :param rolling_size: Passed straight to ``DataFrame.rolling`` as ``window``;
        the callers in this notebook pass integer row counts.
    :return: New DataFrame of rolling means with 'Date' restored as a regular column.
        ``min_periods=1`` means leading rows are averaged over however many rows
        are available rather than being NaN.
    """
    # Keep 'Date' out of the averaged columns by making it the index first.
    by_date = df if df.index.name == 'Date' else df.set_index('Date')

    smoothed = by_date.rolling(window=rolling_size, min_periods=1).mean()

    # Hand 'Date' back as an ordinary column.
    return smoothed.reset_index()



In [None]:
def apply_time_shift_by_hours(df, shift_hours):
    """
    Return a copy of ``df`` whose 'Date' index is shifted by ``shift_hours``.

    The input frame is never modified, so re-running a cell that calls this
    function does not shift the caller's data a second time.

    :param df: DataFrame with 'Date' as its datetime index or column.
    :param shift_hours: Number of hours to shift. Can be positive (forward) or negative (backward).
    :return: New DataFrame indexed by the shifted 'Date' values. 'Date' is the
        index of the result, not a column; call ``reset_index()`` if a column is needed.
    """
    if df.index.name != 'Date':
        # Parse on a copy (assign) so the caller's 'Date' column keeps its dtype;
        # unparseable values become NaT.
        shifted = df.assign(Date=pd.to_datetime(df['Date'], errors='coerce')).set_index('Date')
    else:
        # Copy so that replacing the index below does not touch the caller's frame.
        shifted = df.copy()

    # Make sure the index is datetime-like, then move every timestamp by the offset.
    shifted.index = pd.to_datetime(shifted.index) + pd.Timedelta(hours=shift_hours)

    return shifted

In [None]:
# # Usage Examples
# shift_hours_AlCon = 1  # Hours to shift df_AlCon; use a negative value (e.g., -5) to shift backward
# shift_hours_FB554 = 5   # Positive shift for df_FB554 (e.g., 5 hours forward)

# shifted_df_AlCon = apply_time_shift_by_hours(df_AlCon, shift_hours_AlCon)
# print("Shifted df_AlCon:")
# print(shifted_df_AlCon.head())

# shifted_df_FB554 = apply_time_shift_by_hours(df_FB554, shift_hours_FB554)
# print("\nShifted df_FB554:")
# print(shifted_df_FB554.head())

In [None]:
def _prepare_for_asof_join(df):
    """Return a sorted copy of ``df`` with a datetime 'Date' column and no NaT dates."""
    # Frames coming out of apply_time_shift_by_hours carry 'Date' as the index,
    # not as a column; surface it so df['Date'] and merge_asof(on='Date') work.
    if 'Date' not in df.columns and df.index.name == 'Date':
        df = df.reset_index()

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    prepared = df.assign(Date=pd.to_datetime(df['Date'], errors='coerce'))

    # merge_asof requires non-null, sorted keys.
    return prepared.dropna(subset=['Date']).sort_values('Date')


def join_df_FB554_to_df_CDunit(df_CDunit, df_FB554):
    """
    Attach to every FB554 row the CDunit row with the nearest timestamp.

    :param df_CDunit: DataFrame with 'Date' as a column or as the named index.
    :param df_FB554: DataFrame with 'Date' as a column or as the named index.
    :return: DataFrame with one row per FB554 row that has a valid 'Date',
        holding the FB554 columns followed by the matched CDunit columns.
        Neither input is modified.
    """
    units = _prepare_for_asof_join(df_CDunit)
    fb554 = _prepare_for_asof_join(df_FB554)

    # FB554 is the left frame, so its timestamps drive the output rows.
    combined_df = pd.merge_asof(fb554, units, on='Date', direction='nearest')

    return combined_df


def join_df_AlCon_to_combined_df(combined_df, df_AlCon):
    """
    Attach to every AlCon row the row of ``combined_df`` with the nearest timestamp.

    :param combined_df: DataFrame with 'Date' as a column or as the named index.
    :param df_AlCon: DataFrame with 'Date' as a column or as the named index.
    :return: DataFrame with one row per AlCon row that has a valid 'Date',
        holding the AlCon columns followed by the matched ``combined_df`` columns.
        Neither input is modified.
    """
    combined = _prepare_for_asof_join(combined_df)
    alcon = _prepare_for_asof_join(df_AlCon)

    # AlCon is the left frame, so its timestamps drive the output rows.
    combined_df_all = pd.merge_asof(alcon, combined, on='Date', direction='nearest')

    return combined_df_all


In [None]:
# Confirm the 'Date' dtypes once more before running the pipeline.
_date_dtype_report = {
    "Data type for 'Date' column in df_CDunit:": df_CDunit,
    "Data type for 'Date' column in df_FB554:": df_FB554,
    "Data type for 'Date' column in df_AlCon:": df_AlCon,
}
for _caption, _frame in _date_dtype_report.items():
    print(_caption, _frame['Date'].dtype)



In [None]:
def process_data_limited():
    """
    Build the merged dataset using one fixed set of rolling windows and time shifts.

    Reads the module-level frames df_CDunit, df_FB554 and df_AlCon, smooths them
    with rolling means of 8, 4 and 2 rows respectively, shifts the AlCon
    timestamps back one hour and the FB554 timestamps forward one hour, then
    nearest-timestamp joins CDunit onto FB554 and that result onto AlCon.

    :return: The fully merged DataFrame (AlCon rows on the left of the final join).
    """
    def date_as_column(frame):
        # apply_time_shift_by_hours returns 'Date' as the index, while the join
        # helpers read frame['Date']; without this step the joins raise KeyError.
        if frame.index.name == 'Date' and 'Date' not in frame.columns:
            return frame.reset_index()
        return frame

    # Apply the specific rolling average directly
    rolled_df_CDunit = apply_rolling_average_to_df(df_CDunit, 8)
    rolled_df_FB554 = apply_rolling_average_to_df(df_FB554, 4)
    rolled_df_AlCon = apply_rolling_average_to_df(df_AlCon, 2)

    # Apply the specific time shifts directly (negative = backward in time)
    rolled_df_AlCon_shifted = date_as_column(apply_time_shift_by_hours(rolled_df_AlCon, -1))
    rolled_df_FB554_shifted = date_as_column(apply_time_shift_by_hours(rolled_df_FB554, 1))

    # Combine df_CDunit and df_FB554 to create combined_df
    combined_df = join_df_FB554_to_df_CDunit(rolled_df_CDunit, rolled_df_FB554_shifted)

    # Combine combined_df with rolled_df_AlCon to create combined_df_all
    combined_df_all = join_df_AlCon_to_combined_df(combined_df, rolled_df_AlCon_shifted)

    # combined_df_all holds the data processed with the shifts and averages above
    return combined_df_all


In [None]:
# Run the fixed-parameter pipeline. This relies on the helper functions and the
# three source DataFrames (df_CDunit, df_FB554, df_AlCon) defined in earlier cells.
final_dataset = process_data_limited()

# Sanity-check the merge by printing the first five rows.
print(final_dataset.head(n=5))


In [None]:
# def apply_negative_shift_hours(shift_hours):
#     return [-hour for hour in shift_hours]

# # Make sure to have your DataFrames: df_CDunit, df_FB554, df_AlCon ready

# # Apply the specific rolling average directly
# rolled_df_CDunit = apply_rolling_average_to_df(df_CDunit, 8)
# rolled_df_FB554 = apply_rolling_average_to_df(df_FB554, 4)
# rolled_df_AlCon = apply_rolling_average_to_df(df_AlCon, 2)

# # Apply the specific time shifts directly
# # Note: Since you're applying a negative shift to AlCon and a positive shift to FB554, ensure this logic is correctly handled in your apply_time_shift_by_hours function
# rolled_df_AlCon_shifted = apply_time_shift_by_hours(rolled_df_AlCon, -1) # apply_negative_shift_hours already applied in specifying -1
# rolled_df_FB554_shifted = apply_time_shift_by_hours(rolled_df_FB554, 1)

# # Merge the rolled and shifted DataFrames accordingly
# combined_df = join_df_FB554_to_df_CDunit(rolled_df_CDunit, rolled_df_FB554_shifted)
# combined_df_all = join_df_AlCon_to_combined_df(combined_df, rolled_df_AlCon_shifted)

# # Now, combined_df_all is the DataFrame you're interested in






In [None]:
# def apply_negative_shift_hours(shift_hours):
#     return [-hour for hour in shift_hours]

# # Rolling sizes ranges
# rolling_size_CDunit = [8]  # Even rolling sizes from 4 to 10 range(4, 11, 2)
# rolling_size_FB554 = [4]  # Even rolling sizes from 4 to 10 range(4, 11, 2)
# rolling_size_AlCon = [2]  # Even rolling sizes from 2 to 30 range(2, 31, 2) 

# # Shift hours ranges
# shift_hours_AlCon = apply_negative_shift_hours([1])  # Negative shifts from -2 to -8 (range(2, 9, 2)) 
# shift_hours_FB554 = ([1])    # Positive shifts from 2 to 8 range(2, 9, 2)

# # Precompute rolling averages for each DataFrame and each rolling size
# precomputed_rolls = {
#     "CDunit": {size: apply_rolling_average_to_df(df_CDunit, size) for size in rolling_size_CDunit},
#     "FB554": {size: apply_rolling_average_to_df(df_FB554, size) for size in rolling_size_FB554},
#     "AlCon": {size: apply_rolling_average_to_df(df_AlCon, size) for size in rolling_size_AlCon}
# }

# results = pd.DataFrame()

# ## Modified process_data function
# def process_data():
#     iteration_count = 0
#     results = pd.DataFrame(columns=['Iteration', 'Rolling Sizes CDunit', 'Rolling Sizes FB554', 'Rolling Sizes AlCon',
#                                     'Shift Hours AlCon', 'Shift Hours FB554', 'R-squared', 'Adj R-Squared', 
#                                     'F-statistic', 'Prob (F-statistic)'])

#     for size_CDunit in rolling_size_CDunit:
#         for size_FB554 in rolling_size_FB554:
#             for size_AlCon in rolling_size_AlCon:
#                 for shift_hour_AlCon in shift_hours_AlCon:
#                     for shift_hour_FB554 in shift_hours_FB554:
#                         iteration_count += 1

#                         # Retrieve rolled dataframes
#                         rolled_df_CDunit = precomputed_rolls["CDunit"][size_CDunit]
#                         rolled_df_FB554 = precomputed_rolls["FB554"][size_FB554]
#                         rolled_df_AlCon = precomputed_rolls["AlCon"][size_AlCon]

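#                         # Combine df_CDunit and df_FB554 to create combined_df
#                         # NOTE(review): shift_hour_AlCon / shift_hour_FB554 are never applied to the
#                         # rolled frames, so every shift iteration merges the same unshifted data.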
#                         combined_df = join_df_FB554_to_df_CDunit(rolled_df_CDunit, rolled_df_FB554)

#                         # Combine combined_df with rolled_df_AlCon to create combined_df_all
#                         combined_df_all = join_df_AlCon_to_combined_df(combined_df, rolled_df_AlCon)

#                         # Drop 'Date' column before modeling
#                         combined_df_all = combined_df_all.drop(columns=['Date'], errors='ignore')

#                         # Splitting into train and test
#                         X = combined_df_all.drop('Butanol', axis=1)
#                         y = combined_df_all['Butanol']
#                         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#                         # Train model
#                         model = sm.OLS(y_train, X_train).fit()

#                         # Store the results instead of printing
#                         iteration_results = {
#                             'Iteration': iteration_count,
#                             'Rolling Sizes CDunit': size_CDunit,
#                             'Rolling Sizes FB554': size_FB554,
#                             'Rolling Sizes AlCon': size_AlCon,
#                             'Shift Hours AlCon': shift_hour_AlCon,
#                             'Shift Hours FB554': shift_hour_FB554,
#                             'R-squared': model.rsquared,
#                             'Adj R-Squared': model.rsquared_adj,
#                             'F-statistic': model.fvalue,
#                             'Prob (F-statistic)': model.f_pvalue
#                         }
#                         # 1 results = results.append(iteration_results, ignore_index=True)?
#                         # 2 results = pd.concat([results, iteration_results], ignore_index=True)
#                         # iteration_results_df = pd.DataFrame([iteration_results])
#                         # results = pd.concat([results], ignore_index=True) #, iteration_results_df
#                         iteration_results_df = pd.DataFrame([iteration_results])
#                         results = pd.concat([results, iteration_results_df], ignore_index=True)



#                         # Print only the iteration count
#                         print(f"Iteration: {iteration_count}")

#     return results

# # Call the function to process and evaluate the data
# model_results = process_data()

In [None]:
# Preview the merged dataset. `model_results` is only assigned inside the
# commented-out grid-search cell, so referencing it here raised NameError
# whenever the notebook was run top to bottom on a fresh kernel.
final_dataset.head()

In [None]:
# Save the merged dataset produced by process_data_limited() to merged_data.csv.
# `model_results` only exists when the commented-out grid-search cell is run,
# so writing it here failed with NameError on a fresh kernel.
final_dataset.to_csv(r'C:\Users\austinsh\Project-OptiC4\II Data\2 Preprocessing\Merge Data\merged_data.csv', index=False)


In [None]:
from datetime import datetime

# Capture the wall-clock time at which this cell ran, then echo it.
current_date_time = datetime.now()
print(f"{current_date_time}")
