In [1]:
pip install "numpy<2" matplotlib 

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
# Enable inline plotting for Jupyter Notebook
%matplotlib inline

In [3]:
# Define the dataset folder path (as you did)
folder_path = "Freiwald_Tsao_faceviews_AM_data_csv"

# Use glob to find all CSV files in the folder
# The pattern ensures you only get files ending with .csv
all_csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# Check if any files were found
if not all_csv_files:
    print(f"Error: No CSV files found in the folder: {folder_path}")
else:
    print(f"Found {len(all_csv_files)} CSV files to load.")

    # Create an empty list to hold the individual DataFrames
    list_of_dfs = []

    # Loop through the list of found CSV files
    for file_path in all_csv_files:
        try:
            # Read each CSV file into a DataFrame
            df_single = pd.read_csv(file_path)
            # Optional: Add a column to know which file the data came from
            df_single['source_file'] = os.path.basename(file_path)
            # Append the DataFrame to the list
            list_of_dfs.append(df_single)
        except Exception as e:
            print(f"Error loading file {file_path}: {e}")

    # Check if any DataFrames were successfully loaded
    if not list_of_dfs:
        print("No dataframes were loaded successfully.")
    else:
        # Concatenate all DataFrames in the list into a single DataFrame
        combined_df = pd.concat(list_of_dfs, axis=0, ignore_index=True)

        # Display info about the combined DataFrame
        print("\n--- Combined DataFrame ---")
        print(f"Total rows: {len(combined_df)}")
        print(combined_df.info())
        print("\nFirst 5 rows of combined data:")
        print(combined_df.head())
        # Optional: Check unique values in key columns again
        # print("\nUnique persons in combined data:", combined_df['labels.person'].unique())
        # print("Unique orientations in combined data:", combined_df['labels.orientation'].unique())
        # (Run this right after creating combined_df)
        print("Checking unique orientations in the originally loaded combined_df:")
        print(combined_df['labels.orientation'].unique())
        print("\nValue counts for orientation:")
        print(combined_df['labels.orientation'].value_counts())

Found 193 CSV files to load.

--- Combined DataFrame ---
Total rows: 206216
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206216 entries, 0 to 206215
Columns: 807 entries, site_info.monkey to source_file
dtypes: int64(802), object(5)
memory usage: 1.2+ GB
None

First 5 rows of combined data:
  site_info.monkey site_info.region  labels.stimID  labels.person  \
0             bert               am              1              1   
1             bert               am              1              1   
2             bert               am              1              1   
3             bert               am              1              1   
4             bert               am              1              1   

  labels.orientation labels.orient_person_combo  time.1_2  time.2_3  time.3_4  \
0              front                    front 1         0         0         0   
1              front                    front 1         0         0         0   
2              front                    front

In [4]:
# (Run this in your Jupyter Notebook - Cell after loading combined_df)
import pandas as pd
import numpy as np

# --- Define Orientation to Angle Mapping ---
# Using the confirmed unique orientations
unique_orientations = combined_df['labels.orientation'].unique()
print(f"Unique orientations confirmed: {unique_orientations}")

# Define the COMPLETE mapping
orientation_to_angle = {
    'front': 0.0,
    'left 3/4': -45.0,
    'left profile': -90.0,
    'right 3/4': 45.0,
    'right profile': 90.0,
    'up': 0.0,       # Treating as 0 angle - adjust if needed
    'down': 0.0,     # Treating as 0 angle - adjust if needed
    'back': 180.0    # Adjust if needed
}
# Ensure all found orientations are in the map
orientation_to_angle = {k: v for k, v in orientation_to_angle.items() if k in unique_orientations}
print(f"\nUsing angle mapping: {orientation_to_angle}")

# --- Create the target variable column ---
# Apply mapping directly to the original combined_df
combined_df['orientation_angle'] = combined_df['labels.orientation'].map(orientation_to_angle)

# --- Handle potential NaNs (though unlikely now) and select target Y ---
original_rows = len(combined_df)
combined_df.dropna(subset=['orientation_angle'], inplace=True)
rows_after_drop = len(combined_df)
if (original_rows - rows_after_drop) > 0:
    print(f"\nWarning: Dropped {original_rows - rows_after_drop} rows with NaN angles.")

# Define the target variable Y
Y_angle = combined_df['orientation_angle'].copy()
print(f"\nTarget variable 'orientation_angle' created. Shape: {Y_angle.shape}")
print(Y_angle.describe())
print(f"\nAngle value counts:\n{Y_angle.value_counts().sort_index()}")

Unique orientations confirmed: ['front' 'left 3/4' 'left profile' 'right 3/4' 'right profile' 'up' 'down'
 'back']

Using angle mapping: {'front': 0.0, 'left 3/4': -45.0, 'left profile': -90.0, 'right 3/4': 45.0, 'right profile': 90.0, 'up': 0.0, 'down': 0.0, 'back': 180.0}

Target variable 'orientation_angle' created. Shape: (206216,)
count    206216.000000
mean         22.919415
std          78.129978
min         -90.000000
25%           0.000000
50%           0.000000
75%          90.000000
max         180.000000
Name: orientation_angle, dtype: float64

Angle value counts:
-90.0     25698
-45.0     25648
 0.0      77028
 45.0     25658
 90.0     26160
 180.0    26024
Name: orientation_angle, dtype: int64


In [5]:
# (Run this in your Jupyter Notebook - Cell after angle mapping)
# Assumes 'combined_df' has 'orientation_angle' and Y_angle exists

print("--- Engineering Features: Firing Rates in Bins ---")

# --- Define Time Bins ---
bin_size_ms = 50  # 50ms bins
window_ms = 400   # First 400ms
n_bins = window_ms // bin_size_ms
bin_edges = np.arange(0, window_ms + 1, bin_size_ms)
bin_duration_s = bin_size_ms / 1000.0
print(f"Using {n_bins} bins of size {bin_size_ms}ms.")

# --- Identify Time Columns ---
time_cols_pattern = [f'time.{i}_{i+1}' for i in range(1, window_ms)]
time_cols_pattern = [col for col in time_cols_pattern if col in combined_df.columns]

if not time_cols_pattern:
    print("Error: Could not find relevant time columns.")
else:
    # Select spike data corresponding to rows with valid angles (using Y_angle.index)
    spike_data = combined_df.loc[Y_angle.index, time_cols_pattern]

    # --- Calculate Rates ---
    rate_feature_names = [f'rate_bin_{i*bin_size_ms}_{(i+1)*bin_size_ms}ms' for i in range(n_bins)]
    X_patterns = pd.DataFrame(index=spike_data.index, columns=rate_feature_names, dtype=float)

    print("Calculating firing rates per bin...")
    for i in range(n_bins):
        start_col_idx = bin_edges[i]
        end_col_idx = bin_edges[i+1]
        cols_in_bin = [f'time.{t}_{t+1}' for t in range(start_col_idx + 1, end_col_idx + 1)]
        cols_in_bin = [col for col in cols_in_bin if col in spike_data.columns]

        if cols_in_bin:
            X_patterns[rate_feature_names[i]] = spike_data[cols_in_bin].sum(axis=1) / bin_duration_s
        else:
            X_patterns[rate_feature_names[i]] = 0.0

    print("\nFiring rate features calculated.")
    print(f"Shape of pattern features X_patterns: {X_patterns.shape}")
    # print(X_patterns.head()) # Optional: view features

    # Ensure X and Y are finally aligned
    X_patterns.dropna(inplace=True) # Drop rows if any calculation failed
    Y_angle = Y_angle.loc[X_patterns.index]
    print(f"Final aligned shapes: X={X_patterns.shape}, Y={Y_angle.shape}")

    # Optional memory cleanup
    # import gc
    # del spike_data
    # gc.collect()

--- Engineering Features: Firing Rates in Bins ---
Using 8 bins of size 50ms.
Calculating firing rates per bin...

Firing rate features calculated.
Shape of pattern features X_patterns: (206216, 8)
Final aligned shapes: X=(206216, 8), Y=(206216,)


In [6]:
# (Run this in your Jupyter Notebook - Cell after feature engineering)
# Assumes X_patterns and Y_angle exist and are aligned

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

print("\n--- Preparing Data: Splitting and Scaling ---")

# Split data
X_train_pat, X_test_pat, Y_train_ang, Y_test_ang = train_test_split(
    X_patterns, Y_angle, test_size=0.3, random_state=42
)
print(f"Split data: X_train={X_train_pat.shape}, X_test={X_test_pat.shape}")

# Scale features
print("Scaling features using StandardScaler...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_pat)
X_test_scaled = scaler.transform(X_test_pat)
print("Scaling complete.")


--- Preparing Data: Splitting and Scaling ---
Split data: X_train=(144351, 8), X_test=(61865, 8)
Scaling features using StandardScaler...
Scaling complete.


In [7]:
# (Run this in your Jupyter Notebook - Cell after split/scale)
# Assumes X_train_scaled, Y_train_ang, X_test_scaled, Y_test_ang exist

from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

print("\n--- Training Ridge Regression (Predicting Angle from Patterns) ---")
alphas_ridge = [1, 10, 100, 1000, 10000, 100000] # Wider range for scaled data
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Use R2 scoring, as MAE/MSE depend on angle units (degrees)
ridge_cv_angle = RidgeCV(alphas=alphas_ridge, cv=kf, scoring='r2')
ridge_cv_angle.fit(X_train_scaled, Y_train_ang)

best_alpha_ridge_angle = ridge_cv_angle.alpha_
print(f"Best alpha found: {best_alpha_ridge_angle}")
print(f"Best CV R2 score: {ridge_cv_angle.best_score_:.4f}")

# Evaluate on Test Set
Y_pred_ridge_angle = ridge_cv_angle.predict(X_test_scaled)
mse_ridge_angle = mean_squared_error(Y_test_ang, Y_pred_ridge_angle)
mae_ridge_angle = mean_absolute_error(Y_test_ang, Y_pred_ridge_angle)
r2_ridge_angle = r2_score(Y_test_ang, Y_pred_ridge_angle)

print("\n--- Ridge Regression (Angle Prediction - Test Data Eval) ---")
print(f"Test MSE: {mse_ridge_angle:.4f}")
print(f"Test MAE: {mae_ridge_angle:.4f} (degrees, approx)")
print(f"Test R-squared: {r2_ridge_angle:.4f}") # <-- This is the key performance metric


--- Training Ridge Regression (Predicting Angle from Patterns) ---
Best alpha found: 10000
Best CV R2 score: 0.0013

--- Ridge Regression (Angle Prediction - Test Data Eval) ---
Test MSE: 6091.9345
Test MAE: 62.0856 (degrees, approx)
Test R-squared: 0.0016


In [9]:
# --- Cell: Lasso Regression (Predicting Angle from Patterns) - CORRECTED ---
# (Run this in your Jupyter Notebook)
# Assumes X_train_scaled, Y_train_ang, X_test_scaled, Y_test_ang exist
# Assumes X_patterns DataFrame exists for feature names

from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

print("\n--- Training Lasso Regression (Predicting Angle from Patterns) ---")
alphas_lasso = [0.001, 0.01, 0.1, 1.0, 10.0, 20.0] # Adjust range if needed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# *** CORRECTION: Removed scoring='r2' argument ***
lasso_cv_angle = LassoCV(alphas=alphas_lasso, cv=kf, random_state=42, max_iter=5000, n_jobs=-1, verbose=0)

lasso_cv_angle.fit(X_train_scaled, Y_train_ang)

best_alpha_lasso_angle = lasso_cv_angle.alpha_
print(f"Best alpha found: {best_alpha_lasso_angle}")

# Evaluate on Test Set
Y_pred_lasso_angle = lasso_cv_angle.predict(X_test_scaled)
mse_lasso_angle = mean_squared_error(Y_test_ang, Y_pred_lasso_angle)
mae_lasso_angle = mean_absolute_error(Y_test_ang, Y_pred_lasso_angle)
r2_lasso_angle = r2_score(Y_test_ang, Y_pred_lasso_angle)

print("\n--- Lasso Regression (Angle Prediction - Test Data Eval) ---")
print(f"Test MSE: {mse_lasso_angle:.4f}")
print(f"Test MAE: {mae_lasso_angle:.4f} (degrees, approx)")
print(f"Test R-squared: {r2_lasso_angle:.4f}") # <-- Key performance metric

# Examine Coefficients (Weights on firing rates in different time bins)
print("\n--- Lasso Coefficients (Angle Prediction) ---")
try:
    # Assumes X_patterns is the dataframe used to generate X_train_pat/X_test_pat before scaling
    coefficients_lasso_angle = pd.Series(lasso_cv_angle.coef_, index=X_patterns.columns)
    num_nonzero_coeffs_angle = (coefficients_lasso_angle != 0).sum()
    print(f"Number of features used (non-zero coefficients): {num_nonzero_coeffs_angle} out of {len(coefficients_lasso_angle)}")
    # print("Non-zero coefficients:")
    # print(coefficients_lasso_angle[coefficients_lasso_angle != 0].sort_values())
except NameError:
    print("Could not display coefficients (original feature names not found).")
except Exception as e:
    print(f"An error occurred displaying coefficients: {e}")
    print(f"An error occurred displaying coefficients: {e}")


--- Training Lasso Regression (Predicting Angle from Patterns) ---
Best alpha found: 0.1

--- Lasso Regression (Angle Prediction - Test Data Eval) ---
Test MSE: 6091.9527
Test MAE: 62.0870 (degrees, approx)
Test R-squared: 0.0016

--- Lasso Coefficients (Angle Prediction) ---
Number of features used (non-zero coefficients): 6 out of 8


In [10]:
# --- Cell: Tuned Random Forest (Predicting Angle from Patterns) ---
# (Run this in your Jupyter Notebook)
# Assumes X_train_scaled, Y_train_ang, X_test_scaled, Y_test_ang exist

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import randint
import numpy as np # Ensure numpy is imported if running standalone

print("\n--- Training Tuned Random Forest (Predicting Angle from Patterns) ---")
# Define parameter distribution
param_dist = {
    'n_estimators': randint(50, 200),        # Number of trees
    'max_depth': [10, 15, 20, 25, 30, None], # Test specific depths + None
    'min_samples_split': randint(5, 30),     # Min samples to split (adjusted)
    'min_samples_leaf': randint(3, 20)       # Min samples per leaf (adjusted)
}
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# Initialize Randomized Search
# Adjust n_iter or cv if needed for your system's performance
random_search_angle = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=15, # Number of parameter settings sampled (adjust as needed)
    cv=3,      # Use 3-fold CV (adjust as needed)
    scoring='r2', # Optimize for R-squared
    random_state=42,
    n_jobs=-1, # Use all cores for CV folds
    verbose=1  # Show progress
)

# Fit Randomized Search on the scaled training data
print("Starting Randomized Search for Random Forest...")
random_search_angle.fit(X_train_scaled, Y_train_ang)

print(f"Best parameters found: {random_search_angle.best_params_}")
print(f"Best CV R-squared score: {random_search_angle.best_score_:.4f}")

# Evaluate the best estimator found on the test set
best_rf_model_angle = random_search_angle.best_estimator_
Y_pred_rf_tuned_angle = best_rf_model_angle.predict(X_test_scaled)
r2_rf_tuned_angle = r2_score(Y_test_ang, Y_pred_rf_tuned_angle)
mse_rf_tuned_angle = mean_squared_error(Y_test_ang, Y_pred_rf_tuned_angle)
mae_rf_tuned_angle = mean_absolute_error(Y_test_ang, Y_pred_rf_tuned_angle)

print("\n--- Tuned Random Forest (Angle Prediction - Test Data Eval) ---")
print(f"Test MSE: {mse_rf_tuned_angle:.4f}")
print(f"Test MAE: {mae_rf_tuned_angle:.4f} (degrees, approx)")
print(f"Test R-squared: {r2_rf_tuned_angle:.4f}") # <-- Key performance metric

# Optional: Feature Importances
# import pandas as pd
# try:
#     importances_rf_angle = pd.Series(best_rf_model_angle.feature_importances_, index=X_patterns.columns)
#     print("\n--- Random Forest Feature Importances (Angle Prediction) ---")
#     print("Top Features:")
#     print(importances_rf_angle.sort_values(ascending=False).head(10))
# except NameError:
#      print("\nCould not display feature importances (original feature names not found).")


--- Training Tuned Random Forest (Predicting Angle from Patterns) ---
Starting Randomized Search for Random Forest...
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best parameters found: {'max_depth': None, 'min_samples_leaf': 17, 'min_samples_split': 23, 'n_estimators': 157}
Best CV R-squared score: -0.0015

--- Tuned Random Forest (Angle Prediction - Test Data Eval) ---
Test MSE: 6101.4221
Test MAE: 61.9841 (degrees, approx)
Test R-squared: 0.0001


In [None]:
# --- Cell: SVR with Randomized Search (Predicting Angle from Patterns) ---
# (Run this in your Jupyter Notebook)
# Assumes X_train_scaled, Y_train_ang, X_test_scaled, Y_test_ang exist

from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import uniform, loguniform # For parameter distributions
import numpy as np # Ensure numpy is imported

print("\n--- Training Tuned SVR (Predicting Angle from Patterns) ---")

# Define parameter distribution for Randomized Search
# Using RBF kernel by default, common for SVR
param_dist_svr = {
    'C': loguniform(1e-1, 1e3),      # Regularization parameter (log scale)
    'gamma': loguniform(1e-4, 1e-1), # Kernel coefficient (log scale) - sensitive!
    'epsilon': uniform(0.01, 0.5)    # Margin of tolerance
}
svr = SVR(kernel='rbf') # Specify RBF kernel

# Initialize Randomized Search
# Adjust n_iter or cv if needed for your system's performance
# SVR tuning can be slow! Start with low n_iter.
random_search_svr = RandomizedSearchCV(
    estimator=svr,
    param_distributions=param_dist_svr,
    n_iter=10, # Low number of iterations due to potential slowness
    cv=3,      # 3-fold CV
    scoring='r2', # Optimize for R-squared
    random_state=42,
    n_jobs=-1, # Use all cores for CV folds
    verbose=1  # Show progress
)

# Fit Randomized Search on the scaled training data
print("Starting Randomized Search for SVR...")
random_search_svr.fit(X_train_scaled, Y_train_ang)

print(f"Best parameters found: {random_search_svr.best_params_}")
print(f"Best CV R-squared score: {random_search_svr.best_score_:.4f}")

# Evaluate the best estimator found on the test set
best_svr_model = random_search_svr.best_estimator_
Y_pred_svr_tuned = best_svr_model.predict(X_test_scaled)
r2_svr_tuned = r2_score(Y_test_ang, Y_pred_svr_tuned)
mse_svr_tuned = mean_squared_error(Y_test_ang, Y_pred_svr_tuned)
mae_svr_tuned = mean_absolute_error(Y_test_ang, Y_pred_svr_tuned)

print("\n--- Tuned SVR (Angle Prediction - Test Data Eval) ---")
print(f"Test MSE: {mse_svr_tuned:.4f}")
print(f"Test MAE: {mae_svr_tuned:.4f} (degrees, approx)")
print(f"Test R-squared: {r2_svr_tuned:.4f}") # <-- Key performance metric


--- Training Tuned SVR (Predicting Angle from Patterns) ---
Starting Randomized Search for SVR...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
