# automated results

Here are some quick analyses to look at the data so far :)

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from datetime import datetime

# Load the datasets
morpho_id = pd.read_csv('./ckmr_morpho_id.csv')
mosquito_data = pd.read_csv('./ckmr_morpho_id-mosquito_data.csv')

# Convert date columns to datetime where appropriate
morpho_id['SubmissionDate'] = pd.to_datetime(morpho_id['SubmissionDate'])
morpho_id['initialise-date_collected'] = pd.to_datetime(morpho_id['initialise-date_collected'])

# Join the datasets: mosquito_data.PARENT_KEY points at morpho_id.KEY
df_morpho = pd.merge(
    mosquito_data,
    morpho_id,
    left_on='PARENT_KEY',
    right_on='KEY',
    how='inner',
    suffixes=('_mosquito', '_morph')
)

# Drop the columns that are not used in the analyses below
df_morpho = df_morpho.drop(columns=['note', 'morph_id_other',
      'PARENT_KEY', 'KEY_mosquito', 'initialise-collect_bool',
       'end_note', 'meta-instanceID', 'KEY_morph', 'SubmitterID',
       'SubmitterName', 'AttachmentsPresent', 'AttachmentsExpected', 'Status',
       'ReviewState', 'DeviceID', 'Edits'])

# Remove training day data (identified by the submission date)
TRAINING_DAY = '2025-02-24'
df_morpho['Date'] = df_morpho.SubmissionDate.dt.date.astype(str)
df_morpho = df_morpho.query("Date != @TRAINING_DAY")

# Build one long table of trap records: location, trap type and barcode.
# UVLT forms carry an indoor and an outdoor barcode per row, so they are
# melted into one row per trap; the sampling form has a single barcode and
# is labelled 'prokopack'.
required_cols_uvlt = ['initialise-gps-Latitude', 'initialise-gps-Longitude', 'collection_end-barcode_outdoor', 'collection_end-barcode_indoor']
df_uvlt = pd.read_csv("./ckmr_uvlt.csv")
df_uvlt = df_uvlt[required_cols_uvlt]
df_uvlt.columns = ['latitude', 'longitude', 'outdoor-uvlt', 'indoor-uvlt']
required_cols_asp = ['initialise-gps-Latitude', 'initialise-gps-Longitude', 'collection_end-barcode']
df_asp = pd.read_csv("./ckmr_sampling.csv")
df_asp = df_asp[required_cols_asp]
df_asp.columns = ['latitude', 'longitude', 'barcode']
df_asp = df_asp.dropna()
df_uvlt = df_uvlt.melt(id_vars=['latitude', 'longitude'], value_vars=['indoor-uvlt', 'outdoor-uvlt'], var_name='type', value_name='barcode')
df_asp = df_asp.assign(type='prokopack')

df_households = pd.concat([df_uvlt, df_asp])

df_morpho = df_morpho.rename(columns={'initialise-barcode':'barcode'})

# pandas matches null merge keys against each other, so household rows without
# a barcode are left out of the join; otherwise a mosquito record with a
# missing barcode would be attached to every barcode-less trap row.
households_with_barcode = df_households.dropna(subset=['barcode'])

# The join below is an inner join, so say so when it discards records.
n_unmatched = int((~df_morpho['barcode'].isin(households_with_barcode['barcode'])).sum())
if n_unmatched:
    print(f"Warning: {n_unmatched} mosquito records have no matching household barcode and are excluded")

# NOTE(review): the join keys are every column name the two frames share,
# presumably just 'barcode' -- confirm, then pass on='barcode' explicitly.
df_morpho = df_morpho.merge(households_with_barcode)

print(f"Number of total Anophelines recorded: {len(df_morpho)}")

Number of total Anophelines recorded: 332


Are there any duplicates, i.e. cases where an Eppendorf tube has been scanned twice?

In [2]:
# Sample IDs that occur on more than one record. The IDs are read from the
# index of the counts, which holds the values in every pandas version
# (the column names produced by to_frame().reset_index() changed in pandas 2.0).
sample_id_counts = df_morpho['sample_id'].value_counts()
dups = sample_id_counts[sample_id_counts > 1].index
df_morpho[df_morpho['sample_id'].isin(dups)][['morph_id', 'Date', 'mosquito_sex', 'sample_id']]

Unnamed: 0,morph_id,Date,mosquito_sex,sample_id
37,anopheles_funestus,2025-02-28,female,ep0000915142
60,anopheles_funestus,2025-02-27,male,ep0000915142
311,anopheles_funestus,2025-02-25,male,ep0000916071
312,anopheles_funestus,2025-02-25,male,ep0000916071


How many mosquitoes of each species are there in total, once the records with duplicated sample IDs are removed?

In [3]:
# Remove every record whose sample_id occurs more than once: all copies are
# dropped, not just the extras. The IDs are read from the index of the counts,
# which holds the values in every pandas version.
sample_id_counts = df_morpho['sample_id'].value_counts()
dups = sample_id_counts[sample_id_counts > 1].index
df_morpho = df_morpho[~df_morpho['sample_id'].isin(dups)]

df_morpho.morph_id.value_counts().to_frame()

Unnamed: 0_level_0,count
morph_id,Unnamed: 1_level_1
anopheles_funestus,312
anopheles_gambiae_sl,9
other,4
anopheles_coustani,3


### Plotting species catch by date

In [4]:
# Number of records for each (submission date, species) pair
df_morpho_day_counts = (
    df_morpho
    .groupby(['Date', 'morph_id'])
    .size()
    .reset_index(name='Count')
    .rename(columns={'morph_id': 'Species'})
)

# One bar per species, stacked by date
fig_species = px.bar(
    df_morpho_day_counts,
    x='Species',
    y='Count',
    color='Date',
    barmode='stack',
    template='simple_white',
    title='Distribution of Mosquito Species',
)

fig_species.show()

### Trap comparison

Let's plot the per-day (or per-night!) catches per trap type.

In [5]:
# Trapping effort = number of trap records per trap type. Only rows with a
# barcode are counted, so that every trap type is counted by the same rule
# (prokopack rows with missing values were already dropped when df_households
# was built, but melted UVLT rows without a barcode were not).
# NOTE(review): df_households is not filtered by date, so training-day effort
# may still be in the denominator while training-day catches are excluded -- confirm.
trap_counts = df_households.dropna(subset=['barcode']).groupby('type').size()

# Catch per trap type and species, divided by the effort of the same trap type
catch_counts = df_morpho.groupby(['type', 'morph_id']).size()
samples_per_trap_effort = (
    catch_counts
    .div(trap_counts, level='type')
    .to_frame('mean_catch_per_trap')
    .reset_index()
)

fig_traps = px.bar(
    samples_per_trap_effort,
    x='type',
    y='mean_catch_per_trap',
    color='morph_id',
    barmode='stack',
    title='Mean catch per single trapping effort',
    template='simple_white'
)

fig_traps.show()

### Males vs Females

Let's plot the sex ratio we are finding in *Anopheles funestus*.

In [6]:
# Sex breakdown of the records identified as Anopheles funestus
funestus_data = df_morpho.loc[df_morpho['morph_id'] == 'anopheles_funestus']
sex_count = (
    funestus_data['mosquito_sex']
    .value_counts()
    .rename_axis('Sex')
    .reset_index(name='Count')
)

# Pie chart with a fixed colour for each sex
fig_sex = px.pie(
    sex_count,
    names='Sex',
    values='Count',
    color='Sex',
    color_discrete_map={'female': 'pink', 'male': 'blue'},
    title='Sex Distribution of Anopheles funestus',
    template='simple_white',
)

fig_sex.show()

And show some summary statistics:

In [7]:

# Statistical summary of the data.
# value_counts() is converted with .to_dict() so the counts are plain Python
# ints; dict(series) keeps NumPy scalars, which print as np.int64(...) on
# NumPy >= 2.
collection_dates = df_morpho['initialise-date_collected']
summary_stats = {
    'Total mosquitoes': len(df_morpho),
    'Number of households': df_morpho['initialise-household-id'].nunique(),
    'Number of collectors': df_morpho['initialise-collector'].nunique(),
    'Collection date range': f"{collection_dates.min().date()} to {collection_dates.max().date()}",
    'Species distribution': df_morpho['morph_id'].value_counts().to_dict(),
    'Overall sex distribution': df_morpho['mosquito_sex'].value_counts().to_dict(),
    # funestus_data is the Anopheles funestus subset built in the sex-ratio cell
    'Anopheles funestus sex distribution': funestus_data['mosquito_sex'].value_counts().to_dict()
}

print("\nSummary Statistics:\n")
for key, value in summary_stats.items():
    print(f"{key}: {value}")


Summary Statistics:

Total mosquitoes: 328
Number of households: 71
Number of collectors: 6
Collection date range: 2025-02-25 to 2025-02-28
Species distribution: {'anopheles_funestus': 312, 'anopheles_gambiae_sl': 9, 'other': 4, 'anopheles_coustani': 3}
Overall sex distribution: {'female': 209, 'male': 119}
Anopheles funestus sex distribution: {'female': 198, 'male': 114}


### Plot counts on a map

Let's plot these data on a map to look at where we are sampling and where we are getting high numbers.

In [8]:
# Number of records per trap location and trap type; the count is stored in
# the 'morph_id' column and drives the marker size.
df_counts = (
    df_morpho
    .groupby(['latitude', 'longitude', 'type'])['morph_id']
    .count()
    .reset_index()
)

fig = px.scatter_map(
    df_counts,
    lat="latitude",
    lon="longitude",
    color="type",
    size='morph_id',
    zoom=13,
    height=550,
)
fig

With satellite background:

In [9]:

# Same map as above, drawn over satellite imagery
fig = px.scatter_map(
    df_counts,
    lat="latitude",
    lon="longitude",
    color="type",
    size='morph_id',
    zoom=13,
    height=550,
    map_style='satellite',
)
fig

### Great work team!👏 👏 👏