In [1]:
from dataclasses import dataclass, fields, is_dataclass
from typing import Dict, Any, Type, Callable, Optional


def csv_row(file_path: Optional[str] = None) -> Callable[[Type[Any]], Type[Any]]:
    """
    Decorator factory that lets a dataclass be constructed from a CSV row dictionary.

    The decorated class (turned into a dataclass first if it is not one already) gets:

    * an ``__init__(self, row)`` taking one mapping whose keys match the field names;
      extra keys are ignored,
    * a ``to_dict()`` method returning ``{field name: value}``,
    * the class attributes ``is_csv_row = True`` and ``file_path``.

    Must be applied with parentheses: ``@csv_row()`` or ``@csv_row("rows.csv")``.

    :param file_path: Optional default CSV path stored on the class as ``file_path``.
    :return: The class decorator.
    :raises TypeError: Raised by the returned decorator when applied to a non-class.
    """
    # Local import: MISSING is not among the names imported at the top of the notebook.
    from dataclasses import MISSING

    def decorator(cls: Type[Any]) -> Type[Any]:
        if not isinstance(cls, type):
            raise TypeError("Expected a class, got {}".format(type(cls).__name__))
        if not is_dataclass(cls):
            cls = dataclass(cls)

        original_init = cls.__init__
        # Only fields accepted by the generated __init__ may be forwarded to it.
        init_fields = [f for f in fields(cls) if f.init]

        def __init__(self, row: Dict[str, Any]):
            """
            Initialize the dataclass with data from a CSV row.

            Keys missing from ``row`` fall back to the field's default or
            default_factory; required fields without a value become ``None``.
            """
            kwargs = {}
            for f in init_fields:
                if f.name in row:
                    kwargs[f.name] = row[f.name]
                elif f.default is MISSING and f.default_factory is MISSING:
                    # Required field absent from the row: keep the lenient None fallback.
                    kwargs[f.name] = None
                # Otherwise omit the key so the dataclass applies its own default.
            original_init(self, **kwargs)

        cls.__init__ = __init__

        def to_dict(self) -> Dict[str, Any]:
            """
            Convert the dataclass back to a dictionary suitable for writing to a CSV file.
            """
            return {f.name: getattr(self, f.name) for f in fields(self)}

        cls.to_dict = to_dict

        # Marker read by the module-level is_csv_row() helper.
        cls.is_csv_row = True

        cls.file_path = file_path

        return cls
    return decorator

def is_csv_row(cls: Type[Any]) -> bool:
    """
    Report whether ``cls`` carries the ``is_csv_row`` marker set by the 'csv_row' decorator.

    Returns ``False`` when the attribute is absent, otherwise the attribute's value.
    """
    marker = getattr(cls, "is_csv_row", False)
    return marker

In [2]:
from typing import List
import pandas as pd
from functools import lru_cache

class CSVLoader:
    """
    Read a delimited file with pandas and turn each (filtered) row into an
    instance of a row class whose constructor accepts a single dict.
    """

    def __init__(self, dataclass, file_path: Optional[str] = None, sep: str = "\t",
                 max_scenario_id: Optional[int] = 286, reviewer_ids=(3,)):
        """
        Initialize with a dataclass type and a file path.
        :param dataclass: The dataclass type to convert rows into.
        :param file_path: The CSV file to read from. Falls back to the class's
            ``file_path`` attribute when omitted.
        :param sep: Column separator passed to ``pandas.read_csv``.
        :param max_scenario_id: Keep only rows whose 'Scenario ID' is <= this value;
            ``None`` disables the filter.
        :param reviewer_ids: Keep only rows whose 'Reviewer ID' is in this collection;
            ``None`` disables the filter.
        :raises ValueError: If no file path is given and the class provides none.
        """
        self.dataclass = dataclass
        if file_path is None:
            # The class attribute may exist but be None (e.g. a bare ``@csv_row()``),
            # so test the value rather than mere presence.
            file_path = getattr(dataclass, "file_path", None)
        if file_path is None:
            raise ValueError("No file path provided.")
        self.file_path = file_path
        self.sep = sep
        self.max_scenario_id = max_scenario_id
        self.reviewer_ids = None if reviewer_ids is None else list(reviewer_ids)
        # Per-instance cache; avoids a global lru_cache that would keep instances alive.
        self._df = None

    def __load_df(self) -> pd.DataFrame:
        """
        Load the CSV file into a DataFrame, apply the row filters, and cache the result.
        """
        if self._df is None:
            df = pd.read_csv(self.file_path, sep=self.sep, header=0, skip_blank_lines=True)
            if self.max_scenario_id is not None:
                df = df[df['Scenario ID'] <= self.max_scenario_id]
            if self.reviewer_ids is not None:
                df = df[df['Reviewer ID'].isin(self.reviewer_ids)]
            self._df = df
        return self._df

    def read(self) -> List:
        """
        Read the CSV file and return a list of dataclass instances.
        Each row in the DataFrame is converted into an instance of the specified dataclass.
        Column names are normalised to lower snake_case (spaces and hyphens become '_').
        """
        df = self.__load_df()
        dict_rows = df.to_dict(orient='records')
        dict_rows = [
            {k.lower().replace(" ", "_").replace("-", "_"): v for k, v in row.items()}
            for row in dict_rows
        ]
        return [self.dataclass(row) for row in dict_rows]
    

In [3]:
@csv_row()
class ResultRow:
    """One annotation result row, constructed from a dict via the ``csv_row``-generated initializer."""
    # NOTE(review): these values are summed per criterion downstream, so they are
    # presumably 0/1 flags; values read from the CSV may also arrive as floats/NaN — confirm.
    essential: int
    singular: int
    complete: int
    integrous: int
    high_quality: int

In [4]:
# Load the resolved annotation rows for the mixed-quality (MQ) and high-quality (HQ) files.
mixed_quality_seed_results = CSVLoader(
    ResultRow,
    file_path="data/Annotation - Resolved MQ(286) - analysis.csv",
    sep=",",
).read()
high_quality_seed_results = CSVLoader(
    ResultRow,
    file_path="data/Annotation - Resolved HQ(286) - analysis.csv",
    sep=",",
).read()

In [5]:
#mixed_quality_seed_results

In [6]:
from collections import Counter
from typing import List, Dict
from dataclasses import fields
from math import isnan

def count_quality_categories(results: "List[ResultRow]") -> Dict[str, int]:
    """
    Sum every dataclass field over a list of result rows.

    Missing values (``None`` or NaN) are skipped; other float values are truncated
    to ``int`` before being added. Every field of the rows appears in the result,
    with 0 when all of its values were missing, so callers can index by field name.

    :param results: Dataclass instances (e.g. ``ResultRow``) to aggregate.
    :return: Mapping of field name to the summed value; empty for an empty input.
    """
    counts = Counter()
    for result in results:
        # Use the instance's own fields so the helper is not tied to one row class.
        for f in fields(result):
            counts.setdefault(f.name, 0)
            value = getattr(result, f.name)
            if value is None:
                continue
            if isinstance(value, float):
                if isnan(value):
                    continue
                value = int(value)
            counts[f.name] += value
    return dict(counts)

In [7]:
# Tally, per criterion, how many rows met it in each of the two result sets.
mixed_quality_seed_count, high_quality_seed_count = map(
    count_quality_categories,
    (mixed_quality_seed_results, high_quality_seed_results),
)

In [8]:
# Display the per-criterion totals computed for the high-quality result set.
high_quality_seed_count

{'essential': 242,
 'singular': 218,
 'complete': 257,
 'integrous': 233,
 'high_quality': 184}

In [9]:
#mixed_quality_seed_count["essential"] += 61

In [27]:
from scipy.stats.contingency import odds_ratio
from scipy.stats import fisher_exact



# def fisher(control, treatment, field_name: str, n_total_control: int = 286, n_total_treatment: Optional[int] = None) -> float:
def fisher(n_control, n_treatment, n_total_control: int = 286, n_total_treatment: Optional[int] = None) -> float:
    """
    Run Fisher's exact test on a 2x2 "criterion met / not met" table and print a report.

    The report consists of a Markdown table of the counts, the odds ratio with its
    95% confidence interval (both from ``scipy.stats.contingency.odds_ratio``) and
    the p-value from ``scipy.stats.fisher_exact``.

    :param n_control: Number of control items that met the criterion.
    :param n_treatment: Number of treatment items that met the criterion.
    :param n_total_control: Total number of control items.
    :param n_total_treatment: Total number of treatment items; defaults to
        ``n_total_control`` when omitted.
    :return: The p-value of Fisher's exact test.
    """
    if n_total_treatment is None:
        n_total_treatment = n_total_control
    n_treatment_not_met = n_total_treatment - n_treatment
    n_control_not_met = n_total_control - n_control
    # 2x2 table: columns are (treatment, control); rows are (met, not met).
    table = [
        [n_treatment, n_control],
        [n_treatment_not_met, n_control_not_met],
    ]
    _, pvalue = fisher_exact(table)
    _odds_ratio = odds_ratio(table)
    # Header and delimiter rows must have the same number of cells to render as a table.
    print("|    | Treatment | Control |")
    print("| --- | --- | --- |")
    print(f"| Met | {n_treatment} | {n_control} | ")
    print(f"| Not met | {n_treatment_not_met} | {n_control_not_met} | ")
    ci = _odds_ratio.confidence_interval(confidence_level=0.95)
    print(f"OR = {_odds_ratio.statistic}")
    print(f"OR (95% CI) = ({ci[0]:.2f}, {ci[1]:.2f})")
    # A p-value is never exactly zero; report very small values as a bound instead of "0.000".
    p_text = f"{pvalue:.3f}" if pvalue >= 0.001 else "<0.001"
    print(f"P-value: {p_text}")
    return float(pvalue)

In [28]:
# Per-criterion comparison: the mixed-quality counts are passed as control,
# the high-quality counts as treatment.
for field in fields(ResultRow):
    n_mixed, n_high = (
        mixed_quality_seed_count[field.name],
        high_quality_seed_count[field.name],
    )
    print(field.name)
    fisher(n_control=n_mixed, n_treatment=n_high)

essential
|    | Treatment | Control |  |
| --- | --- | --- |
| Met | 242 | 234 | 
| Not met | 44 | 52 | 
OR = 1.2217937515249462
OR (95% CI) = (0.77, 1.95)
P-value: 0.434
singular
|    | Treatment | Control |  |
| --- | --- | --- |
| Met | 218 | 163 | 
| Not met | 68 | 123 | 
OR = 2.4153505410195963
OR (95% CI) = (1.66, 3.53)
P-value: 0.000
complete
|    | Treatment | Control |  |
| --- | --- | --- |
| Met | 257 | 262 | 
| Not met | 29 | 24 | 
OR = 0.8120880176539667
OR (95% CI) = (0.44, 1.49)
P-value: 0.564
integrous
|    | Treatment | Control |  |
| --- | --- | --- |
| Met | 233 | 136 | 
| Not met | 53 | 150 | 
OR = 4.834440625363863
OR (95% CI) = (3.27, 7.22)
P-value: 0.000
high_quality
|    | Treatment | Control |  |
| --- | --- | --- |
| Met | 184 | 117 | 
| Not met | 102 | 169 | 
OR = 2.601138718713788
OR (95% CI) = (1.83, 3.71)
P-value: 0.000


In [29]:
# Pooled comparison over 461 items per arm.
# NOTE(review): 184 and 117 equal the high_quality "Met" counts printed above;
# the provenance of 175, 105 and the total of 461 is not shown in this notebook — confirm.
total = 461
hm = 175 + 184  # treatment "met" count
mm = 105 + 117  # control "met" count
fisher(n_control=mm, n_treatment=hm, n_total_control=total)


|    | Treatment | Control |  |
| --- | --- | --- |
| Met | 359 | 222 | 
| Not met | 102 | 239 | 
OR = 3.783343303600302
OR (95% CI) = (2.82, 5.10)
P-value: 0.000
