In [3]:
import pandas as pd
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from prettytable import PrettyTable
import statsmodels.api as sm

file = '/home/ramma/assessment-solution-april/data/palm_ffb.csv'

def load_data(file):
    df = pd.read_csv(file, delimiter=',', parse_dates=['Date'], dayfirst=True)

    df.head()
    return df 

def preprocess_dataframe(processed_data, numeric_columns):
    """
    Preprocess the input DataFrame by performing the following steps:
    1. Convert 'Date' column to datetime format.
    2. Convert 'Working_days' column to float.
    3. Drop duplicate rows.
    4. Replace infinite values with NaN.
    5. Drop rows with any missing values.
    6. Identify and remove outliers in specified numeric columns.
    
    Parameters:
    data (pd.DataFrame): The input DataFrame to preprocess.
    numeric_columns (list of str): The list of column names to identify and remove outliers.
    
    Returns:
    pd.DataFrame: The preprocessed DataFrame.
    """
   
    
    # Convert 'Date' column to datetime format
    if 'Date' in processed_data.columns:
        processed_data['Date'] = pd.to_datetime(processed_data['Date'], format='%d.%m.%Y')
    
    # Convert 'Working_days' column to float
    if 'Working_days' in processed_data.columns:
        processed_data.loc[:, 'Working_days'] = processed_data['Working_days'].astype(float)
    
    # Drop duplicate rows
    processed_data = processed_data.drop_duplicates()
    
    # Replace infinite values with NaN
    processed_data.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Drop rows with any missing values
    processed_data.dropna(inplace=True)

    # Identify and remove outliers in specified numeric columns
    for column in numeric_columns:
        if column in data.columns:
            Q1 = processed_data[column].quantile(0.25)
            Q3 = processed_data[column].quantile(0.75)
            IQR = Q3 - Q1
            # Remove outliers
            processed_data = processed_data[~((processed_data[column] < (Q1 - 1.5 * IQR)) | (processed_data[column] > (Q3 + 1.5 * IQR)))]

    return processed_data


def multiple_linear_regression(df):
    # Define features and target variable
    X = df[['SoilMoisture', 'Average_Temp', 'Min_Temp', 'Precipitation', 'Working_days']]
    y = df['FFB_Yield']

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit model using original features
    X_scaled = sm.add_constant(X_scaled)
    model = sm.OLS(y, X_scaled).fit()

    # Create a PrettyTable object
    table = PrettyTable()
    table.field_names = ["Variable", "Coefficient", "Standard Error", "t-Value", "p-Value"]

    # Populate the table with model parameters
    for i, variable in enumerate(model.params.index):
        table.add_row([
            variable,
            f"{model.params[variable]:.4f}",
            f"{model.bse[variable]:.4f}",
            f"{model.tvalues[variable]:.4f}",
            f"{model.pvalues[variable]:.4f}"
        ])

    # Add R-squared and Adjusted R-squared
    table.add_row(["R-squared", f"{model.rsquared:.4f}", "", "", ""])
    table.add_row(["Adjusted R-squared", f"{model.rsquared_adj:.4f}", "", "", ""])

    return table

# Call the function and display the results
data = load_data(file)
numeric_columns = [col for col in data.columns if pd.api.types.is_numeric_dtype(data[col])]
preprocessed_data = preprocess_dataframe(data, numeric_columns)
summary_table = multiple_linear_regression(preprocessed_data)
print("Model Summary:")
print(summary_table)

Model Summary:
+--------------------+-------------+----------------+---------+---------+
|      Variable      | Coefficient | Standard Error | t-Value | p-Value |
+--------------------+-------------+----------------+---------+---------+
|       const        |    1.6150   |     0.0230     | 70.2245 |  0.0000 |
|         x1         |   -0.0898   |     0.0321     | -2.7938 |  0.0061 |
|         x2         |   -0.0070   |     0.0297     | -0.2342 |  0.8153 |
|         x3         |   -0.0565   |     0.0267     | -2.1124 |  0.0368 |
|         x4         |    0.1649   |     0.0305     |  5.4168 |  0.0000 |
|         x5         |    0.0111   |     0.0234     |  0.4742 |  0.6363 |
|     R-squared      |    0.2169   |                |         |         |
| Adjusted R-squared |    0.1831   |                |         |         |
+--------------------+-------------+----------------+---------+---------+
