In [5]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [7]:
# Location of the cleaned lake water-quality CSV.
# The original hard-coded path only exists on one machine (and raised
# FileNotFoundError when this notebook was last run), so allow it to be
# overridden with the WQ_LAKES_CSV environment variable. The original path is
# kept as the fallback so existing setups behave as before.
file_path = os.environ.get(
    'WQ_LAKES_CSV',
    r'C:\Users\Lalitha\OneDrive\Desktop\project_spatial_datamining-main\data\cleaned\wq_lakes.csv',
)
df = pd.read_csv(file_path)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Lalitha\\OneDrive\\Desktop\\project_spatial_datamining-main\\data\\cleaned\\wq_lakes.csv'

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
print(df.head())

In [None]:
# Columns fed into the PCA: four field measurements followed by the
# remaining site-level columns, in the order used for all loading tables.
columns_for_pca = [
    '100923 PH (FIELD) pH units',
    '80558 OXYGEN DISSOLVED (FIELD METER) mg/L',
    '100924 SPECIFIC CONDUCTANCE (FIELD) uS/cm',
    '100925 TEMPERATURE WATER deg C',
    'Elevation', 'Slope',
    'LC_1km', 'LC_5km',
    'NDVI',
    'Temperature', 'Precipitation',
    'RP_count',
]

# Restrict the frame to just those columns.
df_pca = df.loc[:, columns_for_pca]

In [None]:
# Standardise every PCA input column (zero mean, unit variance) so that no
# variable dominates the components purely because of its units.
scaler = StandardScaler().fit(df_pca)
df_scaled = scaler.transform(df_pca)

In [None]:
# n_components=None keeps every component, so the full variance spectrum
# can be inspected before deciding how many components to retain.
pca = PCA(n_components=None)
pca_result = pca.fit_transform(X=df_scaled)

In [None]:
# Wrap the component scores in a DataFrame with PC1..PCn column labels.
n_score_columns = pca_result.shape[1]
pc_labels = [f'PC{k}' for k in range(1, n_score_columns + 1)]
pca_df = pd.DataFrame(pca_result, columns=pc_labels)

# Show the first rows of the PCA scores
print(pca_df.head())

In [None]:
# Per-component explained variance ratio reported by the fitted PCA;
# used by the bar chart in the next cell.
explained_variance = pca.explained_variance_ratio_

In [None]:
# Bar chart of the explained variance ratio, one bar per principal component.
component_numbers = range(1, len(explained_variance) + 1)

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(component_numbers, explained_variance, alpha=0.7, color='blue')
ax.set_xlabel('Principal Components')
ax.set_ylabel('Explained Variance Ratio')
ax.set_title('Explained Variance by Principal Components')
ax.set_xticks(component_numbers)  # one tick per component
plt.show()

The higher the bar, the more variance that principal component explains (and the more significant it is), so the components from PC5 onward can be neglected.

In [None]:
# Scatter of the samples in the plane of the first two principal components.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', ax=ax)
ax.set_title('PCA: First vs Second Principal Component')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
plt.show()

In [None]:
import numpy as np

# Rows 0 and 1 of components_ are the PC1 and PC2 loading vectors;
# transpose them so each input feature becomes a row of the table.
first_two_components = pca.components_[:2]
loadings = first_two_components
loading_df = pd.DataFrame(
    first_two_components.T,
    columns=['PC1', 'PC2'],
    index=columns_for_pca,
)

# Show how strongly each feature loads on PC1 and PC2
print(loading_df)

In [None]:
# Scatter of the samples in the plane of the third and fourth components.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC3', y='PC4', ax=ax)
ax.set_title('PCA: Third vs Fourth Principal Component')
ax.set_xlabel('PC3')
ax.set_ylabel('PC4')
plt.show()

In [None]:
# Loading table for every fitted component: one row per input feature,
# one column per principal component.
all_pc_labels = [f'PC{k}' for k in range(1, pca.n_components_ + 1)]
loadings = pd.DataFrame(
    pca.components_.T,
    index=columns_for_pca,
    columns=all_pc_labels,
)

# Show only the PC3 / PC4 columns
print("Feature Contributions to PC3 and PC4:")
print(loadings.loc[:, ['PC3', 'PC4']])


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from adjustText import adjust_text  # For preventing overlap

# Biplot of PC1 vs PC2: sample scores as a faint scatter, with one red arrow
# per input feature showing that feature's loading on the two components.
#
# The first two components of the PCA that is already fitted on df_scaled are
# reused here. Refitting with n_components=2 under the same names `pca` /
# `pca_result` (as this cell used to do) silently replaced the full
# decomposition, so re-running the earlier PC3/PC4 cells afterwards failed.
biplot_scores = pca_result[:, :2]
biplot_loadings = pca.components_[:2].T  # shape: (n_features, 2)

# Create figure
fig, ax = plt.subplots(figsize=(8, 6))

# Scatter plot of data points (optional context for the arrows)
ax.scatter(biplot_scores[:, 0], biplot_scores[:, 1], alpha=0.2)

# Add feature vectors
feature_names = columns_for_pca  # List of column names used in PCA
texts = []  # Text labels collected for overlap adjustment

for feature, (load_pc1, load_pc2) in zip(feature_names, biplot_loadings):
    ax.arrow(0, 0, load_pc1, load_pc2,
             head_width=0.05, head_length=0.05, color='red')
    # Place the label slightly beyond the arrow tip.
    texts.append(
        ax.text(load_pc1 * 1.1, load_pc2 * 1.1, feature,
                color='black', fontsize=10)
    )

# Adjust text labels to prevent overlap
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

# Labels and Title
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA Biplot (Feature Loadings)")
ax.axhline(y=0, color='black', linestyle='--', linewidth=0.5)
ax.axvline(x=0, color='black', linestyle='--', linewidth=0.5)
ax.grid()

# Show plot
plt.show()


Inference: 
1. The most significant variables in the dataset are dissolved oxygen, water temperature, specific conductance, and elevation.
2. Most points cluster near the centre of the plot, suggesting that there are few outliers.
3. Water temperature and specific conductance appear to be positively correlated.
4. The dissolved-oxygen vector points in a different direction from water temperature, so the two appear to be negatively correlated.

TRYING RANDOM FOREST REGRESSOR ON PC1 AND PC2 FEATURES OBTAINED FROM PCA

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Regression target: field pH.
TARGET_COLUMN = '100923 PH (FIELD) pH units'
target = df[TARGET_COLUMN]

# The target column is itself one of the PCA inputs in columns_for_pca, so
# components computed from that PCA already encode pH (target leakage).
# Build the regression features from the remaining columns only.
predictor_columns = [col for col in columns_for_pca if col != TARGET_COLUMN]

# Split the raw predictors first (same test_size / random_state as before, so
# the same rows land in each partition) ...
X_raw_train, X_raw_test, y_train, y_test = train_test_split(
    df[predictor_columns], target, test_size=0.2, random_state=42
)

# ... then fit the scaler and a 2-component PCA on the training rows only, so
# no statistics from the test rows leak into the features.
# svd_solver='full' keeps the decomposition deterministic.
reg_scaler = StandardScaler().fit(X_raw_train)
reg_pca = PCA(n_components=2, svd_solver='full').fit(reg_scaler.transform(X_raw_train))

X_train = reg_pca.transform(reg_scaler.transform(X_raw_train))  # PC1 and PC2
X_test = reg_pca.transform(reg_scaler.transform(X_raw_test))

# PC1/PC2 scores for every row, projected with the train-fitted transforms
# (kept under the original name for any exploratory use).
X_pca = reg_pca.transform(reg_scaler.transform(df[predictor_columns]))


In [None]:
# Random forest with 100 trees and a fixed seed for repeatable results.
model = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit on the PCA-derived training features
model.fit(X=X_train, y=y_train)


In [None]:
# Score the PCA-feature model on the held-out rows.
y_pred = model.predict(X_test)

# Mean squared error and coefficient of determination on the test set
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

for report_line in (f"Mean Squared Error: {mse}", f"R² Score: {r2}"):
    print(report_line)


TRYING RANDOM FOREST REGRESSOR WITH ORIGINAL FEATURES

In [None]:
# Regression target for the original-feature model: the field pH column.
target = df['100923 PH (FIELD) pH units'] 

In [None]:
# Every remaining column of df becomes a candidate feature (only the target
# is removed here; non-numeric columns are handled in later cells).
features = df.drop(columns=['100923 PH (FIELD) pH units']) 

In [None]:
# 80/20 train/test split of the original features, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Random forest for the original-feature experiment (100 trees, fixed seed);
# it is fitted in a later cell once the features have been encoded.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# List the object-dtype columns of df; these need encoding (or removal)
# before they can be passed to the model.
object_frame = df.select_dtypes(include='object')
non_numeric_cols = object_frame.columns
print("Non-numeric columns:", non_numeric_cols)

In [None]:
# Copy of df without the object-dtype columns.
# NOTE(review): df_cleaned is not referenced by any later cell visible in this
# notebook - confirm whether it is still needed.
df_cleaned = df.drop(columns=non_numeric_cols)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the two identifier columns in the training features with integer
# codes. One encoder object is refitted per column, so after this cell it
# holds the mapping of the last column only (RiverSubBasinCode).
label_encoder = LabelEncoder()

for id_column in ('StationNumber', 'RiverSubBasinCode'):
    X_train[id_column] = label_encoder.fit_transform(X_train[id_column])

In [None]:
# One-hot encode the two (already integer-coded) identifier columns.
id_columns = ['StationNumber', 'RiverSubBasinCode']
X_train = pd.get_dummies(data=X_train, columns=id_columns)


In [None]:
# Convert to datetime format
X_train['SampleDatetime'] = pd.to_datetime(X_train['SampleDatetime'], errors='coerce')

# Extract year, month, and day (or other time-based features as needed)
X_train['Year'] = X_train['SampleDatetime'].dt.year
X_train['Month'] = X_train['SampleDatetime'].dt.month
X_train['Day'] = X_train['SampleDatetime'].dt.day
X_train['Hour'] = X_train['SampleDatetime'].dt.hour


In [None]:
# Check for missing values
print(X_train.isna().sum())

In [None]:
# Convert 'SampleDatetime' to datetime type
X_train['SampleDatetime'] = pd.to_datetime(X_train['SampleDatetime'])

# Extract year, month, day, hour, etc.
X_train['Year'] = X_train['SampleDatetime'].dt.year
X_train['Month'] = X_train['SampleDatetime'].dt.month
X_train['Day'] = X_train['SampleDatetime'].dt.day
X_train['Hour'] = X_train['SampleDatetime'].dt.hour

# Drop the original 'SampleDatetime' column
X_train = X_train.drop(columns=['SampleDatetime'])

# Check the data types of the features
print(X_train.dtypes)

# Now you can train the model
rf_model.fit(X_train, y_train)


In [None]:
# Compare the column sets of the training and test features.
for feature_frame in (X_train, X_test):
    print(feature_frame.columns)


In [None]:
# Give the test rows the SAME preprocessing as the training rows, then align.
#
# Previously X_test was only reindexed to X_train's columns with fill_value=0.
# Because X_test never received the encoding or date features, every dummy
# column and Year/Month/Day/Hour were zero for all test rows, so the reported
# metrics did not reflect the real test data.
#
# The raw rows are re-read from `features` via the split indices, which keeps
# this cell idempotent (it can be re-run after X_test has been replaced).
raw_train = features.loc[X_train.index]
raw_test = features.loc[X_test.index]
X_test_prepared = raw_test.copy()

# Reproduce the training encoding: integer codes follow the sorted unique
# training values, which is how the dummy columns in X_train were named.
# Values unseen in training get code -1; the resulting dummy column does not
# exist in X_train and is discarded by the reindex below.
categorical_columns = ['StationNumber', 'RiverSubBasinCode']
for col in categorical_columns:
    train_levels = np.unique(raw_train[col])
    X_test_prepared[col] = pd.Categorical(raw_test[col], categories=train_levels).codes
X_test_prepared = pd.get_dummies(X_test_prepared, columns=categorical_columns)

# Same calendar features as the training set (unparseable timestamps -> NaT).
test_sample_times = pd.to_datetime(raw_test['SampleDatetime'], errors='coerce')
X_test_prepared = X_test_prepared.assign(
    Year=test_sample_times.dt.year,
    Month=test_sample_times.dt.month,
    Day=test_sample_times.dt.day,
    Hour=test_sample_times.dt.hour,
)

# Align X_test columns with X_train: same columns, same order. Categories that
# never occur in the test rows are filled with 0; extra columns (the raw
# timestamp, unseen-category dummies) are dropped.
X_test = X_test_prepared.reindex(columns=X_train.columns, fill_value=0)
assert list(X_test.columns) == list(X_train.columns)

# Re-train and predict
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)


In [None]:
# Score the original-feature model on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics
for report_line in (f'Mean Squared Error: {mse}', f'R-squared: {r2}'):
    print(report_line)
