## 2. Predict Total Number of Spikes Across All Neurons

####  Task: Given the identity/orientation of the face, predict the total number of spikes during a trial.
####  Goal: Test how much a stimulus drives global activity.
####  Fun twist: Could reveal which identities cause stronger overall brain activation.



- Inputs: face identities (1-25) and head orientations (1-8), as one-hot encoded variables
- Outputs: total spikes (continuous)

Methods:
1. Linear regression / multiple linear regression
2. Lasso / ridge regression
3. Random forest
4. Principal component regression (PCR)

Evaluation: k-fold cross-validation (CV)

In [1]:
pip install "numpy<2" matplotlib 

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Enable inline plotting for Jupyter Notebook
%matplotlib inline

In [3]:
# Define the dataset folder path (relative to the notebook's working directory)
folder_path = "Freiwald_Tsao_faceviews_AM_data_csv"

# List all CSV files. os.listdir returns names in arbitrary order, so sort them:
# otherwise the "first" file inspected below can differ from run to run.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

# Fail with a clear message instead of an IndexError on csv_files[0]
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in the folder: {folder_path}")

# Load one sample file to inspect the structure
sample_file = os.path.join(folder_path, csv_files[0])  # First CSV file (alphabetically)
df = pd.read_csv(sample_file)

# Display info
print("Dataset Loaded:", sample_file)
print(df.head())  # Show first few rows
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
df.info()  # Column details

Dataset Loaded: Freiwald_Tsao_faceviews_AM_data_csv/raster_data_bert_am_site070.csv
  site_info.monkey site_info.region  labels.stimID  labels.person  \
0             bert               am              1              1   
1             bert               am              1              1   
2             bert               am              1              1   
3             bert               am              1              1   
4             bert               am              1              1   

  labels.orientation labels.orient_person_combo  time.1_2  time.2_3  time.3_4  \
0              front                    front 1         0         0         0   
1              front                    front 1         0         0         0   
2              front                    front 1         0         0         0   
3              front                    front 1         0         0         0   
4              front                    front 1         0         0         0   

   time.4_5  .

In [4]:
# Define the dataset folder path
folder_path = "Freiwald_Tsao_faceviews_AM_data_csv"

# Find every file ending in .csv in the folder. glob does not guarantee any
# ordering, so the paths are sorted: the row order of combined_df (and with it
# every seeded sample/split drawn from it later) is then the same on every run.
all_csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

# Fail fast: the cells below all need combined_df, so continuing without any
# input files would only postpone the failure to a less obvious NameError.
if not all_csv_files:
    raise FileNotFoundError(f"No CSV files found in the folder: {folder_path}")
print(f"Found {len(all_csv_files)} CSV files to load.")

# Read each file into its own DataFrame; they are concatenated once at the end
list_of_dfs = []
for file_path in all_csv_files:
    try:
        df_single = pd.read_csv(file_path)
        # Record which file each row came from
        df_single['source_file'] = os.path.basename(file_path)
        list_of_dfs.append(df_single)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the others
        print(f"Error loading file {file_path}: {e}")

# Same reasoning as above: without at least one loaded frame nothing below can run
if not list_of_dfs:
    raise RuntimeError("No dataframes were loaded successfully.")

# Stack the per-file frames row-wise into one frame with a fresh 0..n-1 index
combined_df = pd.concat(list_of_dfs, axis=0, ignore_index=True)

# Display info about the combined DataFrame
print("\n--- Combined DataFrame ---")
print(f"Total rows: {len(combined_df)}")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" after the summary.
combined_df.info()
print("\nFirst 5 rows of combined data:")
print(combined_df.head())
# Optional: Check unique values in key columns again
# print("\nUnique persons in combined data:", combined_df['labels.person'].unique())
# print("Unique orientations in combined data:", combined_df['labels.orientation'].unique())

Found 193 CSV files to load.

--- Combined DataFrame ---
Total rows: 206216
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206216 entries, 0 to 206215
Columns: 807 entries, site_info.monkey to source_file
dtypes: int64(802), object(5)
memory usage: 1.2+ GB
None

First 5 rows of combined data:
  site_info.monkey site_info.region  labels.stimID  labels.person  \
0             bert               am              1              1   
1             bert               am              1              1   
2             bert               am              1              1   
3             bert               am              1              1   
4             bert               am              1              1   

  labels.orientation labels.orient_person_combo  time.1_2  time.2_3  time.3_4  \
0              front                    front 1         0         0         0   
1              front                    front 1         0         0         0   
2              front                    front

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Expects 'combined_df' (all loaded CSV rows) to exist already.

# 1. Build the target (Y): per-row spike total over the columns
#    time.1_2 ... time.399_400.
# NOTE(review): the sum is taken per row of combined_df, i.e. per row of a single
# source file - it is not pooled across files. Confirm this is the intended
# meaning of "total spikes".
print("Calculating total spikes for the first 400ms...")

# Generate the expected column names and keep only those actually present
time_cols_400ms = [
    name
    for name in (f'time.{start}_{start + 1}' for start in range(1, 400))
    if name in combined_df.columns
]

if time_cols_400ms:
    # Row-wise sum over the selected time columns, stored as a new column
    combined_df['total_spikes_400ms'] = combined_df[time_cols_400ms].sum(axis=1)
    print(f"Calculated 'total_spikes_400ms'. Head:\n{combined_df['total_spikes_400ms'].head()}")
else:
    # None of the expected columns exist, so no target column is created
    print("Error: No time columns found in the expected format 'time.X_Y' up to 400ms in combined_df.")

  

Calculating total spikes for the first 400ms...
Calculated 'total_spikes_400ms'. Head:
0    0
1    1
2    1
3    1
4    0
Name: total_spikes_400ms, dtype: int64


In [6]:
  # 2. Select Predictor Columns (X features) and Target (Y)
print("\nSelecting features and target...")
# Rows with a missing person or orientation label cannot be used as predictors,
# so they are removed from combined_df before features and target are taken.
combined_df = combined_df.dropna(subset=['labels.person', 'labels.orientation'])
# Target: the per-row spike total computed earlier
Y = combined_df.loc[:, 'total_spikes_400ms']
# Predictors: the two categorical label columns (encoded in the next step)
X_categorical = combined_df.loc[:, ['labels.person', 'labels.orientation']]
print("Selected features (X_categorical) and target (Y).")

   
    


Selecting features and target...
Selected features (X_categorical) and target (Y).


In [7]:
 # 3. Encode Categorical Predictors
print("\nEncoding categorical features...")
# Dense one-hot encoding: one indicator column per distinct person / orientation value.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(X_categorical)
X_encoded = encoder.transform(X_categorical)
feature_names = encoder.get_feature_names_out(X_categorical.columns)
# Wrap the array in a DataFrame that keeps X_categorical's row index, so rows
# can still be matched to Y by label later on.
X_encoded_df = pd.DataFrame(
    X_encoded,
    index=X_categorical.index,
    columns=feature_names,
)
print(f"Encoded features shape: {X_encoded_df.shape}")



Encoding categorical features...
Encoded features shape: (206216, 33)


In [8]:
# 4. Hold out 30% of the full encoded data as a test set (seeded for repeatability)
print("\nSplitting data into training and testing sets...")
# Inputs are the one-hot frame X_encoded_df and the matching target Y
(X_train, X_test,
 Y_train, Y_test) = train_test_split(
    X_encoded_df,
    Y,
    random_state=42,
    test_size=0.3,
)
print(f"X_train shape: {X_train.shape}, Y_train shape: {Y_train.shape}")
print(f"X_test shape: {X_test.shape}, Y_test shape: {Y_test.shape}")


# Optional: Clean up memory if the original combined_df is very large and no longer needed in full
# import gc
# del combined_df
# gc.collect()


Splitting data into training and testing sets...
X_train shape: (144351, 33), Y_train shape: (144351,)
X_test shape: (61865, 33), Y_test shape: (61865,)


In [9]:
# --- Draw a random subset of the rows ---
# sample_fraction is the share of rows kept (0.1 -> 10%, 0.2 -> 20%);
# tune it to the memory available.
sample_fraction = 0.2
print(f"Creating a {sample_fraction*100:.0f}% sample of the data...")

# Pick rows from the encoded features, then fetch the matching targets by
# index label so X and Y stay aligned row for row.
X_encoded_sample = X_encoded_df.sample(random_state=42, frac=sample_fraction)
Y_sample = Y.loc[X_encoded_sample.index]

print(f"Sampled data shape: X={X_encoded_sample.shape}, Y={Y_sample.shape}")

# --- 70/30 train/test split of the SAMPLE (not of the full data) ---
(X_train_sample, X_test_sample,
 Y_train_sample, Y_test_sample) = train_test_split(
    X_encoded_sample,
    Y_sample,
    random_state=42,
    test_size=0.3,
)

print(f"Sample split: X_train={X_train_sample.shape}, X_test={X_test_sample.shape}")

# --- NOW, re-run the model fitting code using these _sample variables ---
# e.g., lasso_cv_model.fit(X_train_sample, Y_train_sample)
# e.g., rf_model.fit(X_train_sample, Y_train_sample)

Creating a 20% sample of the data...
Sampled data shape: X=(41243, 33), Y=(41243,)
Sample split: X_train=(28870, 33), X_test=(12373, 33)


### Multiple Linear Regression

In [None]:
# (Run this in your Jupyter Notebook)
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Linear Regression ---
# Ordinary least squares on the one-hot encoded features, trained and evaluated
# on the sampled data (X_*_sample / Y_*_sample).

# Initialize the model
lr_model = LinearRegression()

# --- K-Fold Cross-Validation on Training Data ---
# 3 shuffled folds with a fixed seed; the same splitter is used for both metrics
# so MSE and R-squared are computed on identical folds.
kf = KFold(n_splits=3, shuffle=True, random_state=42)
cv_mse_scores = cross_val_score(lr_model, X_train_sample, Y_train_sample, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1) # Use n_jobs=-1 for potentially faster CV
cv_r2_scores = cross_val_score(lr_model, X_train_sample, Y_train_sample, cv=kf, scoring='r2', n_jobs=-1)

print("--- Linear Regression (Cross-Validation on Training Data) ---")
# scikit-learn reports MSE negated ("higher is better"), hence the sign flip below
print(f"Mean CV Negative MSE: {np.mean(cv_mse_scores):.4f} (+/- {np.std(cv_mse_scores):.4f})")
print(f"Mean CV MSE: {-np.mean(cv_mse_scores):.4f}")
print(f"Mean CV R-squared: {np.mean(cv_r2_scores):.4f} (+/- {np.std(cv_r2_scores):.4f})")

# --- Training on the Sampled Training Set & Evaluation on the Sampled Test Set ---
# The model is scored on X_test_sample / Y_test_sample, the hold-out part of the
# same split that produced X_train_sample. X_test / Y_test come from a separate
# split of the full data and can contain rows that are also in X_train_sample,
# so evaluating on them would leak training rows into the test score.
lr_model.fit(X_train_sample, Y_train_sample)
Y_pred_lr = lr_model.predict(X_test_sample)
mse_lr = mean_squared_error(Y_test_sample, Y_pred_lr)
mae_lr = mean_absolute_error(Y_test_sample, Y_pred_lr)
r2_lr = r2_score(Y_test_sample, Y_pred_lr)

print("\n--- Linear Regression (Evaluation on Test Data) ---")
print(f"Test MSE: {mse_lr:.4f}")
print(f"Test MAE: {mae_lr:.4f}")
print(f"Test R-squared: {r2_lr:.4f}")

### Ridge and Lasso