In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the baseflow table from the CSV file in the working directory.
csv_path = 'RRCA_baseflow.csv'
dataframe = pd.read_csv(csv_path)

In [3]:
# Shift the raw integer day counts by a fixed offset; a later cell reads the
# shifted values as days counted from 1899-12-30.
# NOTE(review): the offset is taken as given - confirm it against the data source.
# NOTE(review): this cell is not re-runnable; a second run shifts the column again.
date_offset_days = 693963
dataframe['Date'] = dataframe['Date'].astype(int).sub(date_offset_days)

In [None]:
# Show the frame for a visual check of its current contents.
display(dataframe)

In [5]:
# Interpret the values in 'Date' as a number of days counted from 1899-12-30
# and replace the column with the resulting timestamps.
day_counts = dataframe['Date']
dataframe['Date'] = pd.to_datetime(day_counts, unit='D', origin='1899-12-30')

In [6]:
# Month of the year (1-12) taken from each row's 'Date'; the later cells group
# and encode the data by this column.
month_of_year = dataframe['Date'].dt.month
dataframe['Month'] = month_of_year

In [None]:
# Show the frame for a visual check of its current contents.
display(dataframe)

In [None]:
# Create one figure per month, with one scatter panel per explanatory variable
# plotted against the observed baseflow.
variables = ['x', 'y', 'Evapotranspiration', 'Precipitation', 'Irrigation_pumping']
for month in range(1, 13):
    monthly_data = dataframe[dataframe['Month'] == month]

    # squeeze=False always returns a 2-D array of axes, so the indexing below
    # also works if `variables` is reduced to a single entry.
    fig, axs = plt.subplots(len(variables), 1, figsize=(10, 20), squeeze=False)

    for ax, var in zip(axs[:, 0], variables):
        ax.scatter(monthly_data[var], monthly_data['Observed'])
        ax.set_title(f'Observed vs {var} for Month {month}')
        ax.set_xlabel(var)
        ax.set_ylabel('Observed')

    fig.tight_layout()
    plt.show()
    # Release the figure once it has been rendered; twelve tall figures are
    # created by this loop.
    plt.close(fig)

# Plot baseflow over time

In [None]:
# Bar chart of the observed baseflow values, positioned by month number.
fig, ax = plt.subplots(figsize=(10, 6))
for month in range(1, 13):
    monthly_data = dataframe[dataframe['Month'] == month]
    # One call per month, so each month's bars take the next colour of the
    # default colour cycle.
    ax.bar(monthly_data['Month'], monthly_data['Observed'])
ax.set_title('Observed Baseflow Over Time')
# The horizontal axis carries the month number (1-12), not a calendar date.
ax.set_xlabel('Month')
ax.set_xticks(range(1, 13))
ax.set_ylabel('Observed Baseflow')
plt.show()

In [None]:
from scipy import stats

# Features whose Pearson correlation with 'Observed' is displayed for every month.
displayed_features = ['x', 'y', 'Evapotranspiration', 'Precipitation', 'Irrigation_pumping']

# Per-month p-values; they are collected only for the three features listed
# here ('x' and 'y' are displayed but not stored).
month_p_value = {
    'Evapotranspiration': [],
    'Precipitation': [],
    'Irrigation_pumping': []
}
for month in range(1, 13):
    monthly_data = dataframe[dataframe['Month'] == month]

    # Run each test once and reuse the result for both the stored p-value and
    # the displayed output.
    monthly_results = {
        feature: stats.pearsonr(monthly_data['Observed'], monthly_data[feature])
        for feature in displayed_features
    }

    # Element [1] of a pearsonr result is the p-value.
    for feature, p_values in month_p_value.items():
        p_values.append(monthly_results[feature][1])

    print("-----------------------Month {}------------------------------".format(month))
    for feature in displayed_features:
        display(monthly_results[feature])

In [11]:
# Turn the collected p-values into a table with one row per month and leave out
# the 'Irrigation_pumping' column.
# Built as a single chain without in-place edits so the cell can be re-run:
# errors='ignore' keeps the drop from failing when the column is already gone.
month_p_value = (
    pd.DataFrame(month_p_value)
    .assign(month=list(range(1, 13)))
    .drop(columns='Irrigation_pumping', errors='ignore')
)

In [None]:
# Grouped bar chart of the p-value table: 'month' on the x-axis, one bar per
# remaining column.
ax = month_p_value.plot(kind='bar', x='month')

# Titles, axis labels and legend heading
ax.set_title('P-values for Different Features Across 12 Months')
ax.set_xlabel('Month')
ax.set_ylabel('P-value')
ax.legend(title='Features')

# Render the figure
plt.show()

In [None]:
variables_to_correlate = ['x', 'y', 'Evapotranspiration', 'Precipitation', 'Irrigation_pumping', 'Observed']

# For each month: compute the pairwise correlations of the selected columns,
# draw them as an annotated heatmap, then show the matrix itself as a table.
for month in range(1, 13):
    in_month = dataframe['Month'] == month
    monthly_data = dataframe[in_month]
    correlation_matrix = monthly_data[variables_to_correlate].corr()

    fig, ax = plt.subplots(figsize=(10, 8))
    # Fix the colour scale to [-1, 1] so the heatmaps of different months are comparable.
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
    ax.set_title(f'Correlation matrix for Month {month}')
    plt.show()

    print(f"Correlation matrix for Month {month}:")
    display(correlation_matrix)

In [None]:
from sklearn.feature_selection import f_regression

variables_to_correlate = ['x', 'y', 'Evapotranspiration', 'Precipitation', 'Irrigation_pumping', 'Observed']

# Pairwise correlations of the selected columns over all rows (no split by month).
correlation_matrix = dataframe.loc[:, variables_to_correlate].corr()

# Annotated heatmap on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation matrix for the entire dataset')
plt.show()

display(correlation_matrix)

In [15]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'Month' into indicator columns named 'Month_<month number>'.
encoder = OneHotEncoder()
month_encoded = encoder.fit_transform(dataframe[['Month']]).toarray()
month_encoded_df = pd.DataFrame(
    month_encoded,
    # Name each column after the category the encoder assigned to it, instead
    # of assuming that all twelve months are present and in order.
    columns=[f'Month_{int(month)}' for month in encoder.categories_[0]],
    # Share the index of `dataframe` so the concat below pairs rows one-to-one.
    index=dataframe.index,
)
# Drop indicator columns left over from a previous run of this cell so that
# re-running it does not create duplicate columns.
dataframe = pd.concat(
    [dataframe.drop(columns=month_encoded_df.columns, errors='ignore'), month_encoded_df],
    axis=1,
)

In [16]:
# One-hot encode 'Segment_id' into indicator columns named 'Segment_<id>'.
segment_id_encoded = encoder.fit_transform(dataframe[['Segment_id']]).toarray()

# The encoder emits its columns in the order of `encoder.categories_[0]`, which
# need not match the order in which the ids first appear in the data. Labelling
# the columns from the encoder's own category list keeps every indicator
# attached to the right segment.
segments = encoder.categories_[0]
segment_id_encoded_df = pd.DataFrame(
    segment_id_encoded,
    columns=[f'Segment_{int(segment)}' for segment in segments],
    # Share the index of `dataframe` so the concat below pairs rows one-to-one.
    index=dataframe.index,
)
# Drop indicator columns left over from a previous run of this cell so that
# re-running it does not create duplicate columns.
dataframe = pd.concat(
    [dataframe.drop(columns=segment_id_encoded_df.columns, errors='ignore'), segment_id_encoded_df],
    axis=1,
)

In [None]:
# Correlate the numeric columns with each other. Non-numeric columns such as
# the datetime 'Date' are left out of the computation.
correlation_matrix = dataframe.select_dtypes(include='number').corr()

observed_correlations = correlation_matrix['Observed']

# Match only the one-hot indicator columns ('Month_<n>' / 'Segment_<id>').
# Anchored patterns keep 'Segment_id' and the derived 'Month_Correlation' /
# 'Segment_Correlation' columns out, even if this cell is re-run after those
# columns have been added.
month_correlations = observed_correlations.filter(regex=r'^Month_\d+$')
segment_correlations = observed_correlations.filter(regex=r'^Segment_-?\d+$')

# Keep the indicators whose absolute correlation exceeds a fixed cut-off.
# NOTE(review): these are effect-size cut-offs, not statistical significance tests.
significant_month_correlations = month_correlations[month_correlations.abs() > 0.03]
significant_segment_correlations = segment_correlations[segment_correlations.abs() > 0.05]

print("Significant Month Correlations with Observed:")
print(significant_month_correlations)
plt.figure(figsize=(10, 5))
month_correlations.sort_values().plot(kind='bar')
plt.title('Month Correlations with Observed')
plt.xlabel('Month')
plt.ylabel('Correlation')
plt.grid(True)
plt.show()

plt.figure(figsize=(10, 5))
segment_correlations.sort_values().plot(kind='bar')
plt.title('Segment ID Correlations with Observed')
plt.xlabel('Segment ID')
plt.ylabel('Correlation')
plt.grid(True)
plt.show()

print("\nSignificant Segment ID Correlations with Observed:")
print(significant_segment_correlations)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Predictors: 'y', 'Precipitation', all twelve month indicators and one
# indicator per distinct segment id.
month_indicator_columns = [f'Month_{i}' for i in range(1, 13)]
segment_indicator_columns = [f'Segment_{i}' for i in dataframe["Segment_id"].unique()]
features = pd.concat(
    [dataframe[['y', 'Precipitation'] + month_indicator_columns], dataframe[segment_indicator_columns]],
    axis=1,
)
target = dataframe['Observed']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42, test_size=0.2)

# Fit ordinary least squares on the training rows and score it on the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for report_line in (f'Mean Squared Error: {mse}', f'R^2 Score: {r2}'):
    print(report_line)

In [None]:
month_correlation_threshold = 0.02
segment_correlation_threshold = 0.03

# Consider only the one-hot indicator columns ('Month_<n>' / 'Segment_<id>').
# The anchored patterns keep the raw 'Segment_id' column (an identifier, not an
# indicator) and any derived 'Month_*' / 'Segment_*' columns out of the
# candidate list.
month_indicator_correlations = observed_correlations.filter(regex=r'^Month_\d+$')
segment_indicator_correlations = observed_correlations.filter(regex=r'^Segment_-?\d+$')

# Keep the indicators whose absolute correlation with 'Observed' exceeds the threshold.
# NOTE(review): `observed_correlations` is computed elsewhere in the notebook
# from the full frame, before the split below - confirm that selecting features
# with knowledge of the test rows is acceptable here.
significant_month_columns = month_indicator_correlations.index[
    month_indicator_correlations.abs() > month_correlation_threshold
].tolist()
significant_segment_columns = segment_indicator_correlations.index[
    segment_indicator_correlations.abs() > segment_correlation_threshold
].tolist()

features = dataframe[['y', 'Precipitation'] + significant_month_columns + significant_segment_columns]

target = dataframe['Observed']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

In [None]:
# Replace the twelve month indicators by a single number per row: the
# correlation between that row's month indicator and 'Observed'.
month_indicator_columns = [f'Month_{i}' for i in range(1, 13)]
# Look the weights up by column name so each indicator is multiplied by its own
# correlation, whatever the order or length of `month_correlations`.
month_weights = month_correlations.loc[month_indicator_columns].to_numpy()
dataframe['Month_Correlation'] = dataframe[month_indicator_columns].dot(month_weights)

display(dataframe)

In [None]:
# Replace the segment indicators by a single number per row: the correlation
# between that row's segment indicator and 'Observed'.
segment_indicator_columns = [f'Segment_{i}' for i in segments]
# Look the weights up by column name so each indicator is multiplied by its own
# correlation, whatever the order of `segments` or `segment_correlations`.
segment_weights = segment_correlations.loc[segment_indicator_columns].to_numpy()
dataframe['Segment_Correlation'] = dataframe[segment_indicator_columns].dot(segment_weights)

display(dataframe)

In [None]:
# Predictors: 'y', 'Precipitation' and the two correlation-weighted columns.
feature_columns = ['y', 'Precipitation', 'Month_Correlation', 'Segment_Correlation']
features = dataframe[feature_columns]
target = dataframe['Observed']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42, test_size=0.2)

# Fit ordinary least squares on the training rows and predict the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the error and the explained variance on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
for report_line in (f'Mean Squared Error: {mse}', f'R^2 Score: {r2}'):
    print(report_line)

In [None]:
# Predictors: 'y', 'Precipitation', the two correlation-weighted columns, all
# twelve month indicators and one indicator per distinct segment id.
base_columns = ['y', 'Precipitation', 'Month_Correlation', 'Segment_Correlation']
month_indicator_columns = [f'Month_{i}' for i in range(1, 13)]
segment_indicator_columns = [f'Segment_{i}' for i in dataframe["Segment_id"].unique()]
features = pd.concat(
    [dataframe[base_columns + month_indicator_columns], dataframe[segment_indicator_columns]],
    axis=1,
)
target = dataframe['Observed']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42, test_size=0.2)

# Fit ordinary least squares on the training rows and predict the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the error and the explained variance on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
for report_line in (f'Mean Squared Error: {mse}', f'R^2 Score: {r2}'):
    print(report_line)