In [None]:
import pandas as pd

# processing original raw dataset for tabulation in my C++ library.
df = pd.read_csv("./data/maine.csv")
# df.value_counts(col="Precinct")
rank_cols = [col for col in df.columns if "Choice" in col]
df = df[rank_cols]

def clean_name(name):
    if pd.isna(name) or name.strip() == "":
        return None
    name = name.strip()
    if name.lower() in ["undervote", "overvote"]:
        return name.lower()
    return name.split(",")[0].split()[-1]

df = df.applymap(clean_name)
df = df.dropna(how="all")
unique_candidates = sorted(set(
    cell for cell in df.values.flatten()
    if pd.notna(cell) and cell.strip() != ""
))
ballots = df.apply(lambda row: ",".join(cell for cell in row if pd.notna(cell) and cell.strip() != ""), axis=1)

# Write output
with open("./data/converted_ballots.csv", "w", newline="\n") as f:
    f.write(",".join(unique_candidates) + "\n")
    for ballot in ballots:
        f.write(ballot + "\n")


TypeError: DataFrame.value_counts() got an unexpected keyword argument 'col'

In [10]:
def pivot_method_outcomes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converts wide-form method outcomes (Method, Candidate1, Score1, ...) 
    to long-form (Method, Candidate, Score).
    """
    long_rows = []
    for _, row in df.iterrows():
        method = row['Method']
        for j in range(1, len(row), 2):
            candidate = row[j]
            score = row[j+1] if j+1 < len(row) else None
            if pd.notna(candidate) and pd.notna(score):
                long_rows.append({
                    "Method": method,
                    "Candidate": candidate,
                    "Score": float(score)
                })
    return pd.DataFrame(long_rows)

import pandas as pd

def normalize_method_scores(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize scores per method (row-wise), for either:
    - wide-form: Method, Candidate1, Score1, Candidate2, Score2, ...
    - long-form: Method, Candidate, Score

    Returns: long-form DataFrame with columns:
        Method, Candidate, Score, NormalizedScore
    """
    if 'Candidate' in df.columns and 'Score' in df.columns:
        # Long form: normalize directly
        def normalize_group(group):
            min_score = group['Score'].min()
            max_score = group['Score'].max()
            range_score = max_score - min_score if max_score != min_score else 1.0
            group['NormalizedScore'] = (group['Score'] - min_score) / range_score
            return group

        return df.groupby('Method', group_keys=False).apply(normalize_group)

    elif 'Method' in df.columns:
        # Wide form: pivot first
        long_rows = []
        for _, row in df.iterrows():
            method = row['Method']
            candidates = []
            scores = []
            for j in range(1, len(row), 2):
                candidate = row[j]
                score = row[j+1] if j+1 < len(row) else None
                if pd.notna(candidate) and pd.notna(score):
                    candidates.append(candidate)
                    scores.append(float(score))
            if not scores:
                continue
            min_score = min(scores)
            max_score = max(scores)
            range_score = max_score - min_score if max_score != min_score else 1.0
            for candidate, score in zip(candidates, scores):
                normalized_score = (score - min_score) / range_score
                long_rows.append({
                    "Method": method,
                    "Candidate": candidate,
                    "Score": score,
                    "NormalizedScore": normalized_score
                })
        return pd.DataFrame(long_rows)

    else:
        raise ValueError("Unrecognized format. Must contain 'Method' column and either wide candidate/score pairs or long-form 'Candidate' and 'Score'.")


def write_method_outcomes(df: pd.DataFrame, path: str) -> None:
    """
    Writes a DataFrame to the given CSV file path.
    """
    df.to_csv(path, index=False)

import pandas as pd
wide_df = pd.read_csv("./data/method_outcomes.csv")

# Get pivoted long-form
#pivoted_df = pivot_method_outcomes(wide_df)

# Normalize the long-form data
normalized_df = normalize_method_scores(wide_df)

# Save results
#write_method_outcomes(w, "./data/method_outcomes_long.csv")
write_method_outcomes(normalized_df, "./data/method_outcomes_normalized2.csv")

  candidate = row[j]
  score = row[j+1] if j+1 < len(row) else None


In [3]:
import pandas as pd

# Load your raw data
df = pd.read_csv("data/converted_ballots.csv", header=None)

# Rename columns based on rank
df.columns = ['Rank_1', 'Rank_2', 'Rank_3', 'Rank_4', 'Rank_5', 'Rank_6']

# Add Ballot ID (starting from 1)
df['Ballot_ID'] = df.index + 1

# Melt to long format
df_long = df.melt(id_vars='Ballot_ID', 
                  var_name='Rank', 
                  value_name='Candidate')

# Convert Rank_1 → 1, Rank_2 → 2, ...
df_long['Rank'] = df_long['Rank'].str.extract('(\d)').astype(int)

# Optional: filter out undervotes/overvotes
df_cleaned = df_long[~df_long['Candidate'].isin(['undervote', 'overvote'])]

# Save to CSV for Tableau
df_cleaned.to_csv("ballot_cleaned_for_tableau.csv", index=False)


  df_long['Rank'] = df_long['Rank'].str.extract('(\d)').astype(int)
  df = pd.read_csv("data/converted_ballots.csv", header=None)
