In [4]:
import pandas as pd
import numpy as np

# Load datasets
df1 = pd.read_csv("Raw/dataset1.csv")
df2 = pd.read_csv("Raw/dataset2.csv")

# Preview
print("Dataset 1 head:")
display(df1.head())
print("Dataset 2 head:")
display(df2.head())


Dataset 1 head:


Unnamed: 0,start_time,bat_landing_to_food,habit,rat_period_start,rat_period_end,seconds_after_rat_arrival,risk,reward,month,sunset_time,hours_after_sunset,season
0,30/12/2017 18:37,16.0,rat,30/12/2017 18:35,30/12/2017 18:38,108,1,0,0,30/12/2017 16:45,1.870833,0
1,30/12/2017 19:51,0.074016,fast,30/12/2017 19:50,30/12/2017 19:55,17,0,1,0,30/12/2017 16:45,3.100833,0
2,30/12/2017 19:51,4.0,fast,30/12/2017 19:50,30/12/2017 19:55,41,0,1,0,30/12/2017 16:45,3.1075,0
3,30/12/2017 19:52,10.0,rat,30/12/2017 19:50,30/12/2017 19:55,111,1,0,0,30/12/2017 16:45,3.126944,0
4,30/12/2017 19:54,15.0,rat,30/12/2017 19:50,30/12/2017 19:55,194,1,0,0,30/12/2017 16:45,3.15,0


Dataset 2 head:


Unnamed: 0,time,month,hours_after_sunset,bat_landing_number,food_availability,rat_minutes,rat_arrival_number
0,26/12/2017 16:13,0,-0.5,20,4.0,0.0,0
1,26/12/2017 16:43,0,0.0,28,4.0,0.0,0
2,26/12/2017 17:13,0,0.5,25,4.0,0.0,0
3,26/12/2017 17:43,0,1.0,71,4.0,0.0,0
4,26/12/2017 18:13,0,1.5,44,3.753857,0.0,0


In [5]:
def clean_columns(df):
    df.columns = (
        df.columns.astype(str)
                  .str.strip()
                  .str.lower()
                  .str.replace(r"[^\w\s]", "", regex=True)
                  .str.replace(r"\s+", "_", regex=True)
    )
    return df

df1 = clean_columns(df1)
df2 = clean_columns(df2)


In [6]:
for col in df1.select_dtypes(include="object").columns:
    df1[col] = df1[col].astype(str).str.strip()

for col in df2.select_dtypes(include="object").columns:
    df2[col] = df2[col].astype(str).str.strip()

    

In [7]:
# Dataset1
for col in ["start_time","rat_period_start","rat_period_end","sunset_time"]:
    if col in df1.columns:
        df1[col] = pd.to_datetime(df1[col], errors="coerce", dayfirst=True)

# Dataset2
if "time" in df2.columns:
    df2["time"] = pd.to_datetime(df2["time"], errors="coerce", dayfirst=True)


In [8]:
# Dataset1 numeric columns
for col in ["bat_landing_to_food","seconds_after_rat_arrival","risk","reward","hours_after_sunset"]:
    if col in df1.columns:
        df1[col] = pd.to_numeric(df1[col], errors="coerce")

# Dataset2 numeric columns
for col in ["hours_after_sunset","bat_landing_number","food_availability","rat_minutes","rat_arrival_number"]:
    if col in df2.columns:
        df2[col] = pd.to_numeric(df2[col], errors="coerce")

        

In [9]:
df1 = df1.drop_duplicates().reset_index(drop=True)
df2 = df2.drop_duplicates().reset_index(drop=True)



In [10]:
# Risk / Reward must be 0 or 1
for col in ["risk","reward"]:
    if col in df1.columns:
        df1.loc[~df1[col].isin([0,1]), col] = np.nan

# Remove negative values
for col in ["bat_landing_to_food","seconds_after_rat_arrival","hours_after_sunset"]:
    if col in df1.columns:
        df1.loc[df1[col] < 0, col] = np.nan

for col in ["hours_after_sunset","rat_minutes"]:
    if col in df2.columns:
        df2.loc[df2[col] < 0, col] = np.nan

# Cap hours_after_sunset to 24h
for df in [df1, df2]:
    if "hours_after_sunset" in df.columns:
        df.loc[df["hours_after_sunset"] > 24, "hours_after_sunset"] = np.nan

        
        

In [11]:
df1.to_csv("dataset1_clean.csv", index=False)
df2.to_csv("dataset2_clean.csv", index=False)

print("✅ Cleaning done. Files saved as dataset1_clean.csv and dataset2_clean.csv")



✅ Cleaning done. Files saved as dataset1_clean.csv and dataset2_clean.csv


In [11]:

df1 = pd.read_csv("dataset1_clean.csv")
df2 = pd.read_csv("dataset2_clean.csv")
print(df1.head())   # first 5 rows of dataset1_clean
print(df2.head())   # first 5 rows of dataset2_clean



            start_time  bat_landing_to_food habit     rat_period_start  \
0  2017-12-30 18:37:00            16.000000   rat  2017-12-30 18:35:00   
1  2017-12-30 19:51:00             0.074016  fast  2017-12-30 19:50:00   
2  2017-12-30 19:51:00             4.000000  fast  2017-12-30 19:50:00   
3  2017-12-30 19:52:00            10.000000   rat  2017-12-30 19:50:00   
4  2017-12-30 19:54:00            15.000000   rat  2017-12-30 19:50:00   

        rat_period_end  seconds_after_rat_arrival  risk  reward  month  \
0  2017-12-30 18:38:00                      108.0   1.0     0.0      0   
1  2017-12-30 19:55:00                       17.0   0.0     1.0      0   
2  2017-12-30 19:55:00                       41.0   0.0     1.0      0   
3  2017-12-30 19:55:00                      111.0   1.0     0.0      0   
4  2017-12-30 19:55:00                      194.0   1.0     0.0      0   

           sunset_time  hours_after_sunset  season  
0  2017-12-30 16:45:00            1.870833       0  
1  2

In [12]:
df1.head()   # Will display a nice table in Jupyter
df2.head()


Unnamed: 0,time,month,hours_after_sunset,bat_landing_number,food_availability,rat_minutes,rat_arrival_number
0,2017-12-26 16:13:00,0,,20,4.0,0.0,0
1,2017-12-26 16:43:00,0,0.0,28,4.0,0.0,0
2,2017-12-26 17:13:00,0,0.5,25,4.0,0.0,0
3,2017-12-26 17:43:00,0,1.0,71,4.0,0.0,0
4,2017-12-26 18:13:00,0,1.5,44,3.753857,0.0,0
