In [2]:
import pandas as pd
import glob
import os
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# Configuration
folder_path = '/Users/owlxshri/desktop/coral reef/mlpr-project/p1_shrijak'
output_csv = os.path.join(folder_path, 'combined_coral_data.csv')  # Output in same folder


def combine_whitespace_tables(txt_paths):
    """Read whitespace-delimited text tables and stack them into one DataFrame.

    Parameters
    ----------
    txt_paths : iterable of str
        Paths of the text files to read, in the order their rows should appear.

    Returns
    -------
    pandas.DataFrame
        The rows of every file concatenated with a fresh 0..n-1 index and a
        leading ``Source_File`` column holding the base name of the file each
        row came from. An empty DataFrame is returned when ``txt_paths`` is
        empty (``pd.concat`` would otherwise raise ``ValueError``).
    """
    frames = []
    for path in txt_paths:
        # sep=r'\s+' is the supported replacement for the deprecated
        # delim_whitespace=True keyword.
        frame = pd.read_csv(path, sep=r'\s+')
        frame.insert(0, 'Source_File', os.path.basename(path))
        frames.append(frame)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Sorted so the row order of the combined file does not depend on the order
# in which the filesystem happens to list the directory.
txt_files = sorted(glob.glob(os.path.join(folder_path, '*.txt')))

# Combine all data
combined_df = combine_whitespace_tables(txt_files)

if txt_files:
    # Save to CSV
    combined_df.to_csv(output_csv, index=False)
    print(f"Success! Combined {len(txt_files)} files into:\n{output_csv}")
else:
    # Nothing to combine: report it instead of failing inside pd.concat.
    print(f"No .txt files found in:\n{folder_path}")

Success! Combined 4 files into:
/Users/owlxshri/desktop/coral reef/mlpr-project/p1_shrijak/combined_coral_data.csv


In [4]:
# One-hot encode the comma-separated 'Genera' column and save the result.
file_path = "/Users/owlxshri/desktop/coral reef/mlpr-project/statisticalinf/finaldata.csv"
df = pd.read_csv(file_path)

# Turn each cell into a list of genus names.
# - fillna('') keeps a missing cell from raising AttributeError on .split
# - splitting on ',' and stripping accepts "A, B", "A,B" and "A ,  B" alike
# - empty fragments are dropped, so a missing cell becomes an empty list
df['Genera'] = (
    df['Genera']
    .fillna('')
    .astype(str)
    .str.split(',')
    .apply(lambda names: [name.strip() for name in names if name.strip()])
)

mlb = MultiLabelBinarizer()
# index=df.index keeps the indicator rows aligned with df in the concat below
# even if df does not carry a default 0..n-1 index.
one_hot = pd.DataFrame(
    mlb.fit_transform(df['Genera']),
    columns=mlb.classes_,
    index=df.index,
)

# Replace the original text column with one indicator column per genus.
df = pd.concat([df.drop(columns=['Genera']), one_hot], axis=1)

output_path = "/Users/owlxshri/desktop/coral reef/mlpr-project/statisticalinf/finaldata_onehot.csv"
df.to_csv(output_path, index=False)

print(f"One-hot encoded file saved to: {output_path}")


One-hot encoded file saved to: /Users/owlxshri/desktop/coral reef/mlpr-project/statisticalinf/finaldata_onehot.csv


In [None]:

# One-hot encode the comma-separated 'Genera' column, drop 'pCO2', and save.
file_path = "/Users/owlxshri/desktop/coral reef/mlpr-project/statisticalinf/finaldata.csv"
df = pd.read_csv(file_path)

# Turn each cell into a list of genus names.
# - fillna('') keeps a missing cell from raising AttributeError on .split
# - splitting on ',' and stripping accepts "A, B", "A,B" and "A ,  B" alike
# - empty fragments are dropped, so a missing cell becomes an empty list
df['Genera'] = (
    df['Genera']
    .fillna('')
    .astype(str)
    .str.split(',')
    .apply(lambda names: [name.strip() for name in names if name.strip()])
)

mlb = MultiLabelBinarizer()
# index=df.index keeps the indicator rows aligned with df in the concat below
# even if df does not carry a default 0..n-1 index.
one_hot = pd.DataFrame(
    mlb.fit_transform(df['Genera']),
    columns=mlb.classes_,
    index=df.index,
)

# Replace the original text column with one indicator column per genus.
df = pd.concat([df.drop(columns=['Genera']), one_hot], axis=1)

# errors='ignore' makes this a no-op when the file has no 'pCO2' column.
# Reassigning instead of inplace=True keeps the step a plain expression.
df = df.drop(columns=['pCO2'], errors='ignore')

output_path = "/Users/owlxshri/desktop/coral reef/mlpr-project/statisticalinf/finaldata_onehotafterdrop.csv"
df.to_csv(output_path, index=False)

print(f"One-hot encoded file (without pCO2) saved to: {output_path}")


One-hot encoded file (without pCO2) saved to: /Users/owlxshri/desktop/coral reef/mlpr-project/statisticalinf/finaldata_onehotafterdrop.csv


In [5]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


# One-hot encode the 'Reef Name' column of the Genera-encoded file and save.
file_path = "/Users/owlxshri/desktop/coral reef/mlpr-project/statisticalinf/finaldata_onehotafterdrop.csv"
df = pd.read_csv(file_path)

# dtype=int writes the indicators as 0/1. Without it, pandas >= 2.0 produces
# boolean columns that are saved as True/False, which would not match the 0/1
# indicator columns already present in the input file.
reef_dummies = pd.get_dummies(df['Reef Name'], prefix='Reef', dtype=int)

# Replace the original text column with one 'Reef_<name>' column per reef.
df = pd.concat([df.drop(columns=['Reef Name']), reef_dummies], axis=1)

output_path = "/Users/owlxshri/desktop/coral reef/mlpr-project/statisticalinf/finaldata_onehotafterdropreefname.csv"
df.to_csv(output_path, index=False)

print(f"Complete one-hot encoded file saved to: {output_path}")

Complete one-hot encoded file saved to: /Users/owlxshri/desktop/coral reef/mlpr-project/statisticalinf/finaldata_onehotafterdropreefname.csv
