# Chicken Network Group Project

Members: Nino, Silvia, Dugan

### 1) Upload chicken data (use Apache Spark to deal with large dataset easier)

In [1]:
# upload chicken_data.csv to the kaggle environment
# we use kaggle because it gives 30 hours of GPU usage a week and is more stable than google colab

In [2]:
import pandas as pd

# Load the main interaction data
df = pd.read_csv("/kaggle/input/private-chicken-dataset/chicken_data.csv")

In [3]:
df.rename(columns={
    "V1": "chicken1_id",
    "V2": "chicken2_id",
    "V3": "contact_duration",
    "V4": "antenna",
    "time":"timestamp",
    "time_": "mean_interaction"
},inplace=True)

In [4]:
df['date'] = pd.to_datetime(df['date'])

In [5]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

le_antenna = LabelEncoder()
df["antenna_id"] = le_antenna.fit_transform(df["antenna"])

In [6]:
df.dtypes

chicken1_id                 object
chicken2_id                 object
contact_duration           float64
antenna                     object
timestamp                  float64
mean_interaction            object
date                datetime64[ns]
antenna_id                   int64
dtype: object

In [7]:
df.head(5)          # Show first 5 rows

Unnamed: 0,chicken1_id,chicken2_id,contact_duration,antenna,timestamp,mean_interaction,date,antenna_id
0,CA0000001756,CA0000001306,129.0,P3:EE:H3:A8,1499870000.0,2017-07-12T14:26:27.750Z,2017-07-12,19
1,CA0000001306,CA0000001306,129.0,P3:EE:H3:A8,1499870000.0,2017-07-12T14:26:33Z,2017-07-12,19
2,CA0000001306,CA0000001756,454.0,P3:EE:H3:A8,1499870000.0,2017-07-12T14:27:11.750Z,2017-07-12,19
3,CA0000001306,CA0000001306,129.0,P3:EE:H3:A8,1499870000.0,2017-07-12T14:27:17Z,2017-07-12,19
4,CA0000001306,CA0000001756,377.0,P3:EE:H3:A8,1499870000.0,2017-07-12T14:27:55.750Z,2017-07-12,19


In [8]:
print(df.isnull().sum())

chicken1_id         43
chicken2_id         65
contact_duration    28
antenna             28
timestamp           28
mean_interaction    28
date                28
antenna_id           0
dtype: int64


### 2) Handling productivity data

In [9]:
target = pd.read_csv("/kaggle/input/private-hen-productivity-target-labels/production_net.csv", sep=";")

In [10]:
target.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0.1,Unnamed: 0,date,density,nodes_n,crude_clus_coeff,assortativity_degree,path,path_distance,centr_degree_agg,centr_degree,...,Outdoor.Maximum.temperature..C.,Vapour.pressure.9am..kilopascal.,Vapour.pressure.3pm..kilopascal.,Windspeed..m.s.,Comments,flock_date,feed_per_bird,water_per_bird,Relative.humidity.9am,Relative.humidity.3pm
0,1,2017-02-27,141544631712491,542,571740028886168,19590224341695,207517171290012,282864075524019,152355553130393,593270627715519,...,2844,149,162,422,,27/02/2017,109,201,741,392
1,2,2017-03-01,288063936989633,558,525552386306049,697787176634342,172002470994768,701750927720139,250535703943939,991106992786497,...,3088,206,167,367,,1/03/2017,122,215,722,373
2,3,2017-04-25,478294837754647,584,581421673749422,116500750354104,152185201719965,550950472926633,266130548180173,169412462698842,...,2573,16,132,257,,25/04/2017,116,180,786,367
3,4,2017-07-12,691757642694306,532,735592735017446,479125214359517,130824235730569,557696220229681,212197159565581,155312008835648,...,1819,87,87,321,,12/07/2017,135,157,728,383
4,5,2017-07-22,100201005025126,200,759982253771074,399767987213646,204519857735625,211120800997929,191256281407035,108331658291457,...,1927,7,7,169,,22/07/2017,117,165,805,30


In [11]:
target.columns

Index(['Unnamed: 0', 'date', 'density', 'nodes_n', 'crude_clus_coeff',
       'assortativity_degree', 'path', 'path_distance', 'centr_degree_agg',
       'centr_degree', 'centr_clo', 'centr_eigen', 'centr_betw', 'modularity',
       'mod_', 'size_', 'day', 'pk', 'Farm.ID', 'Date', 'Year', 'Week.day',
       'Data.source', 'Shed.name', 'Shed.number', 'Flock.ID',
       'Production.stage', 'Breed.name', 'Age.of.hens..week.',
       'Age.of.hens..days.', 'Hours.of.light', 'Range.access.out',
       'Range.access.in', 'Loose.Birds.caught', 'Ammonium.level',
       'Mortality....Nestbox', 'Mortality...Ground', 'Mortality.hens...Range',
       'Mortality.hens...Cull', 'Total.mortality', 'Mortality....',
       'Cummulative.mortality....', 'Total.feed.consumption.silo.A..kg.',
       'Total.feed.consumption.Silo.B..kg.', 'Feed.consumption.hen.day..g.',
       'Water.consumption.hen.day..ml.', 'Indoor.Minimum.temperature..C.',
       'Indoor.Maximum.temperature..C.', 'Type.of.ventilation', 'Cl

In [19]:
egg_info = target[["date", "Total.eggs","Number.of.hens","Laying.rate...."]]
egg_info.head() # Laying Rate = (Total Eggs / Number of Hens) * 100

Unnamed: 0,date,Total.eggs,Number.of.hens,Laying.rate....
0,2017-02-27,36931,38891,9496027359
1,2017-03-01,36654,38886,9426014504
2,2017-04-25,35752,38586,9265536723
3,2017-07-12,33954,37840,8973044397
4,2017-07-22,33296,37720,8827147402


In [20]:
eggs_per_day_df = egg_info[["date", "Laying.rate...."]].rename(columns={"Laying.rate....": "productivity rate"})

In [21]:
eggs_per_day_df['date'] = pd.to_datetime(eggs_per_day_df['date'])

In [22]:
eggs_per_day_df['productivity rate'] = eggs_per_day_df['productivity rate'].str.replace(',', '.').astype(float)

In [23]:
eggs_per_day_df.dtypes

date                 datetime64[ns]
productivity rate           float64
dtype: object

In [24]:
eggs_per_day_df

Unnamed: 0,date,productivity rate
0,2017-02-27,94.960274
1,2017-03-01,94.260145
2,2017-04-25,92.655367
3,2017-07-12,89.730444
4,2017-07-22,88.271474
...,...,...
112,2017-05-23,90.541844
113,2017-04-03,94.540757
114,2017-05-24,92.220396
115,2017-04-04,90.915904


### 2.5) Incorporate goal of making prediction a day later

In [25]:
# making each date one day earlier
eggs_per_day_df["date"] = eggs_per_day_df["date"] - pd.Timedelta(days=1)  # subtract one day
eggs_per_day_df

Unnamed: 0,date,productivity rate
0,2017-02-26,94.960274
1,2017-02-28,94.260145
2,2017-04-24,92.655367
3,2017-07-11,89.730444
4,2017-07-21,88.271474
...,...,...
112,2017-05-22,90.541844
113,2017-04-02,94.540757
114,2017-05-23,92.220396
115,2017-04-03,90.915904


In [35]:
print(df.shape)
df = df[df["date"] != "2017-12-04"]
df.shape

(48245971, 8)


(48104132, 8)

### 3) Clean data

In [36]:
# Clean interaction data
df = df.drop_duplicates()
df = df.replace(["N/A", "NaN", "", " "], pd.NA)
df = df.dropna()

In [37]:
print(df.isnull().sum())

chicken1_id         0
chicken2_id         0
contact_duration    0
antenna             0
timestamp           0
mean_interaction    0
date                0
antenna_id          0
dtype: int64


In [38]:
df.shape

(48104016, 8)

### 4) Group data by day to build one graph for each unique day

Graph consists of nodes (unique chickens), edges (interactions), and edge features (time, duration, and location of interaction)

In [39]:
df_filtered = df[df["date"].isin(eggs_per_day_df["date"])]

In [40]:
# Split into dictionary where key = date and value = group (as a DataFrame, keeping the 'date' column)
date_to_pandas = {
    date: group.reset_index(drop=True)
    for date, group in df_filtered.groupby("date")
}

In [41]:
date_to_pandas

{Timestamp('2017-03-01 00:00:00'):         chicken1_id   chicken2_id  contact_duration      antenna  \
 0      CA0000001804  CA0000001317              89.0  P3:EE:H3:A7   
 1      CA0000001804  CA0000002125             104.0  P3:EE:H3:A7   
 2      CA0000001599  CA0000002125               3.0  P3:EE:H3:A7   
 3      CA0000001599  CA0000001804              99.0  P3:EE:H3:A7   
 4      CA0000001317  CA0000001599               7.0  P3:EE:H3:A7   
 ...             ...           ...               ...          ...   
 96995  CA0000001394  CA0000001384             150.0  P3:EB:H1:A3   
 96996  CA0000001394  CA0000001870             398.0  P3:EB:H1:A3   
 96997  CA0000001394  CA0000001549             440.0  P3:EB:H1:A3   
 96998  CA0000001394  CA0000001984             320.0  P3:EB:H1:A3   
 96999  CA0000001394  CA0000001439              75.0  P3:EB:H1:A3   
 
           timestamp          mean_interaction       date  antenna_id  
 0      1.488408e+09      2017-03-01T22:33:53Z 2017-03-01       

### 5) Convert each day's info to graph format

In [42]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.5.1+cu124.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.5.1+cu124.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu124.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu124/torch_scatter-2.1.2%2Bpt25cu124-cp311-cp311-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt25cu124
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu124.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu124/torch_sparse-0.6.18%2Bpt25cu124-cp311-cp311-linux_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt25cu124
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.met

In [43]:
import torch
from torch_geometric.data import Data

def create_graph_from_day(df_day_pd, labels_pd):
    # Step 1: Extract node IDs
    chickens = set(df_day_pd['chicken1_id'].dropna().astype(str)) \
           .union(set(df_day_pd['chicken2_id'].dropna().astype(str)))
    node_ids = sorted(list(chickens))
    node_to_idx = {node: i for i, node in enumerate(node_ids)}
    num_nodes = len(node_ids)

    # Step 2: Build edge list and edge features
    edge_list = []
    edge_feats = []

    for _, row in df_day_pd.iterrows():
        src = node_to_idx[row['chicken1_id']]
        dst = node_to_idx[row['chicken2_id']]

        # Undirected edges
        edge_list.append([src, dst])
        edge_list.append([dst, src])

        # Select all columns except date and chicken IDs
        feat = row.drop(labels=['chicken1_id', 'chicken2_id', 'date', 'mean_interaction', 'antenna']).values.astype(float)
        edge_feats.append(feat)
        edge_feats.append(feat)

    # Step 3: Convert to PyTorch tensors
    edge_index = torch.tensor(edge_list, dtype=torch.long).T
    edge_attr = torch.tensor(edge_feats, dtype=torch.float)
    x = torch.eye(num_nodes, dtype=torch.float)  # One-hot encoded node features

    # Step 4: Target label y
    day = df_day_pd['date'].iloc[0]
    y_val = labels_pd.loc[labels_pd['date'] == day, 'productivity rate'].values[0]
    y = torch.tensor([y_val], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

In [44]:
date_to_graph = {}

# Loop through each date and convert to graph using loaded_data
for date, df_day_pd in date_to_pandas.items():

    # Create graph for the day
    graph = create_graph_from_day(df_day_pd, eggs_per_day_df)
    date_to_graph[date] = graph  # Store in dictionary

  edge_attr = torch.tensor(edge_feats, dtype=torch.float)


In [45]:
# to save the graphs on kaggle

import os

save_dir = "/kaggle/working/graph_data"
os.makedirs(save_dir, exist_ok=True)

for i, (date, graph) in enumerate(date_to_graph.items()):
    file_path = os.path.join(save_dir, f"graph_{i}.pt")
    torch.save(graph, file_path)

In [46]:
# to load the saved graphs later

load_dir = "/kaggle/working/graph_data"
loaded_graphs = {}

if os.path.exists(load_dir):
    for filename in os.listdir(load_dir):
        if filename.endswith(".pt"):
            date = filename.split("_")[1].replace(".pt", "")
            file_path = os.path.join(load_dir, filename)
            loaded_graphs[date] = torch.load(file_path)

  loaded_graphs[date] = torch.load(file_path)


In [47]:
loaded_graphs

{'106': Data(x=[427, 427], edge_index=[2, 277986], edge_attr=[277986, 3], y=[1]),
 '83': Data(x=[505, 505], edge_index=[2, 104016], edge_attr=[104016, 3], y=[1]),
 '73': Data(x=[395, 395], edge_index=[2, 246566], edge_attr=[246566, 3], y=[1]),
 '109': Data(x=[430, 430], edge_index=[2, 295144], edge_attr=[295144, 3], y=[1]),
 '82': Data(x=[531, 531], edge_index=[2, 378094], edge_attr=[378094, 3], y=[1]),
 '20': Data(x=[624, 624], edge_index=[2, 445516], edge_attr=[445516, 3], y=[1]),
 '12': Data(x=[716, 716], edge_index=[2, 348140], edge_attr=[348140, 3], y=[1]),
 '93': Data(x=[464, 464], edge_index=[2, 267620], edge_attr=[267620, 3], y=[1]),
 '21': Data(x=[624, 624], edge_index=[2, 465392], edge_attr=[465392, 3], y=[1]),
 '55': Data(x=[595, 595], edge_index=[2, 510152], edge_attr=[510152, 3], y=[1]),
 '100': Data(x=[426, 426], edge_index=[2, 277242], edge_attr=[277242, 3], y=[1]),
 '89': Data(x=[466, 466], edge_index=[2, 325352], edge_attr=[325352, 3], y=[1]),
 '39': Data(x=[586, 586],

In [48]:
import shutil

shutil.make_archive("/kaggle/working/graph_data_zip", 'zip', "/kaggle/working/graph_data")

'/kaggle/working/graph_data_zip.zip'

In [49]:
import os
print(os.listdir("/kaggle/working"))

['.virtual_documents', 'graph_data', 'graph_data_zip.zip']


### 6) Create GAT Model for Graph Regression

GAT model takes graph as input, applies GAT layers to learn node embeddings, aggregates (mean pooling or attention pooling) to a graph level embedding, and passes that to fully connected layers to predict a single value (productivity)

### 7) Train the model

Split dataset int 80/20 train/test and use MSE loss as we're working with a regression problem.

Use Adam optimizer and track validation loss

Maybe consider early stopping to prevent overfitting

### 8) Evaluate and Fine-tune

Report RMSE and MAE metrics

Try different GAT settings: number of heads, depth

Try different pooling strategies

### 9) Deployment Function

Function to predict productivity given the input of a new day's graph