In [4]:
import pandas as pd

def process_file(file_path):
    df = pd.read_csv(file_path)

    df_false = df[df['Reclassified'] == False]

    reason_counts = df_false['Explanation'].value_counts()

    print(f"Results for {file_path}:")
    print("Unique Reasons with Counts:")
    for reason, count in reason_counts.items():
        print(f"{reason}: {count}")
    print("\n")

process_file("tippecanoe/Combined_Reconciliation_2023_2.csv")

Results for tippecanoe/Combined_Reconciliation_2023_2.csv:
Unique Reasons with Counts:
LAND: 31
Value per GRM per IC 6-1.1-4-39(b): 24
Insufficant Sales: 15
Invalid Sale-No Consideration: 11
SIGNIFCANT CHANGES: 10
EXCLUDED - UNDER $10,000: 10
Relocation: 10
no money amount: 10
Maybe: 9
Part Of Porfolio Sale: 9
Commissioners Sale: 8
SOLD TO TENANT: 6
EXCLUDED - SALE PRICE INCLUDES 10 PARCELS: 6
SALE INCLUDED NON-REAL PROPERTY, VALUE NOT DISCLOSED: 6
1/2 NON ADJ PARCEL TR: 5
EXCLUDED - HAS SINCE SOLD: 5
SIGNIFICATE CHANGES: 5
EXCLUDED - SOLD FOR $10,000: 5
AMMF IS TAKING TITLE TO PROPERTY AND WILL THEN TRANSFER TO NEW OWNER - NOT A SALE: 5
Buyer not knowledgeable: 5
V-V: 4
EXCLUDED - 6 PARCLES IN 1 SALE PRICE: 4
2 DAYS ON MARKET: 4
HIGHER END HOME: 4
Res And Ci Sold Together: 4
Problem Deed: 4
ONLY 3 DAYS ON MARKET: 4
2/2 NON ADJ PARCEL TR: 4
INSUFFICIANT SALES: 4
ONE SALE PRICE FOR 4 NON0CONTIG PROPERTIES: 4
INVALID; PENDED AT LISTING: 3
delevoper sale: 3
Valid Sale- ComImp: 3
SIGNFICAN

In [5]:
process_file("tippecanoe/Combined_Reconciliation_2024_2.csv")

Results for tippecanoe/Combined_Reconciliation_2024_2.csv:
Unique Reasons with Counts:
No house included in sale: 28
Invalid-Pending at listing: 16
Relocation: 15
INSUFFICANT SALES: 9
per legislation, lowest of 3 approaches to value was used: 7
land order done this year: 6
TEMPORARY TRANSFER: 6
Per legislation, lowest of three approaches used: 5
Percent Complete: 5
Dulicate Sale: 5
Insufficant Imp Comm Sale for a Study: 4
Sale involved PP: 4
House Incomplete: 4
Support land: 4
Insufficant Sales for Imp Ind Study: 4
RELOCATION SERVICES: 4
599 sale.: 4
Invalid-New Contruction: 4
Value set by legislation: 4
* INVALID - SOLD BEFORE LISTED *: 3
RELOCATION SALE: 3
* INVALID - LAND ONLY *: 3
Habitat home sale.: 3
SOLD SIGNIFICANTLY HIGH THAN ASKING PRIVE: 3
INVALID; PENDED AT LIST: 2
RESURVEY: 2
GOING CONCERN VALUE INCLUDED: 2
Invalid-Sold at listing: 2
TEAR DOWN: 2
LAND ONLY: 2
Invalid Sale-No Consideration: 2
Same buyer seller in two months: 2
Per legislation, lowest of three approaches to 

In [17]:
import re
from Levenshtein import distance as levenshtein_distance

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = text.strip()
    return text

def dynamic_threshold(text_length):
    if text_length <= 10:
        return 3
    elif text_length <= 20:
        return 6
    else:
        return 12

def cluster_reasons_dynamic_threshold(reasons):
    reasons = [normalize_text(reason) for reason in reasons]
    groups = []
    visited = set()
    for reason in reasons:
        if reason in visited:
            continue
        reason_length = len(reason)
        threshold = dynamic_threshold(reason_length)
        current_group = [reason]
        visited.add(reason)
        for other_reason in reasons:
            if other_reason not in visited:
                other_length = len(other_reason)
                pair_threshold = min(threshold, dynamic_threshold(other_length))
                if levenshtein_distance(reason, other_reason) <= pair_threshold:
                    current_group.append(other_reason)
                    visited.add(other_reason)
        groups.append(current_group)
    return groups

def print_clusters_with_multiple_items(clusters):
    clusters_with_multiple_items = [cluster for cluster in clusters if len(cluster) > 1]
    print(f"Number of clusters with multiple reasons: {len(clusters_with_multiple_items)}")
    for i, cluster in enumerate(clusters_with_multiple_items):
        print(f"Cluster {i + 1}:")
        for reason in cluster:
            print(f"  - {reason}")
        print()

In [18]:
df_2023 = pd.read_csv("tippecanoe/Combined_Reconciliation_2023_2.csv")

df_2023_false = df_2023[df_2023['Reclassified'] == False]
reasons_2023 = df_2023_false['Explanation'].dropna().unique()
clusters_2023 = cluster_reasons_dynamic_threshold(reasons_2023)

print("Clusters of Similar Reasons:")
for i, cluster in enumerate(clusters_2023):
    print(f"Cluster {i + 1}: {cluster}")

Clusters of Similar Reasons:
Cluster 1: ['adj property owner']
Cluster 2: ['valued per ic 611442']
Cluster 3: ['value per grm per ic 611439b']
Cluster 4: ['interior not all complete at time of sale']
Cluster 5: ['mutiproperty sale']
Cluster 6: ['price recuction']
Cluster 7: ['last lot in subdivision']
Cluster 8: ['one of the last remaining lots selling for premium']
Cluster 9: ['invalid  mostly land sale', 'invalid  portfoloio sale']
Cluster 10: ['invalid  future valid sale same year']
Cluster 11: ['invalid  large amount of land']
Cluster 12: ['home vp sold for land']
Cluster 13: ['invalid  excess res']
Cluster 14: ['structures not habitable']
Cluster 15: ['delevoper sale']
Cluster 16: ['temporary transfer']
Cluster 17: ['new owner is building a home']
Cluster 18: ['invalid unresolved problems with deedwork']
Cluster 19: ['foundation issues present at time of sale']
Cluster 20: ['invalid  non market sale per dan', 'invalid  nonmarket transaction']
Cluster 21: ['invalid  valid land']
Cl

In [19]:
print("Clusters of Similar Reasons with Multiple Items:")
print_clusters_with_multiple_items(clusters_2023)

Clusters of Similar Reasons with Multiple Items:
Number of clusters with multiple reasons: 79
Cluster 1:
  - invalid  mostly land sale
  - invalid  portfoloio sale

Cluster 2:
  - invalid  non market sale per dan
  - invalid  nonmarket transaction

Cluster 3:
  - invalid  letter of condemation
  - invalid saleno consideration
  - invalid  no consideration

Cluster 4:
  - sold to tenant
  - sold to gvt entity

Cluster 5:
  - sale not submitted from auditors office in time to be validated before 2024
  - sale not submitted from auditors office in time to be validated before 2025

Cluster 6:
  - put house in fair cond
  - put house in vp cond

Cluster 7:
  - adjusted land influence
  - adjusted land influence factor

Cluster 8:
  - new building since sale
  - sold building to tenant

Cluster 9:
  - sold twince in one year
  - sold two times  one year
  - sold two times in one year
  - sold 3 time in one year

Cluster 10:
  - purchased land wbillboard
  - purchased from landlord

Cluster 1

In [20]:
df_2024 = pd.read_csv("tippecanoe/Combined_Reconciliation_2024_2.csv")

df_2024_false = df_2024[df_2024['Reclassified'] == False]
reasons_2024 = df_2024_false['Explanation'].dropna().unique()
clusters_2024 = cluster_reasons_dynamic_threshold(reasons_2024)

print("Clusters of Similar Reasons:")
for i, cluster in enumerate(clusters_2024):
    print(f"Cluster {i + 1}: {cluster}")

Clusters of Similar Reasons:
Cluster 1: ['property is part of a mh park will use the lowest of 3 approaches to value']
Cluster 2: ['sale price includes work still to be completed tls']
Cluster 3: ['relocation sale', 'relocation services', 'dulicate sale']
Cluster 4: ['destressed sale']
Cluster 5: ['land order done this year']
Cluster 6: ['137  filed']
Cluster 7: ['house relisted for higher value']
Cluster 8: ['quick sale']
Cluster 9: ['house needs work']
Cluster 10: ['price reduction']
Cluster 11: ['temporary transfer']
Cluster 12: ['specialty property']
Cluster 13: ['invalid  boarded up']
Cluster 14: ['invalidlandfuture bld', 'invalid  ptrivate sale']
Cluster 15: ['value must be lowest of three ind cost is not accurate']
Cluster 16: ['sold for the land', 'sold to tenant']
Cluster 17: ['residential building lot']
Cluster 18: ['originally marked invalid  non market sale']
Cluster 19: ['sale did not include new mh moved in']
Cluster 20: ['right of way grant']
Cluster 21: ['land and lake 

In [21]:
print("Clusters of Similar Reasons with Multiple Items:")
print_clusters_with_multiple_items(clusters_2024)

Clusters of Similar Reasons with Multiple Items:
Number of clusters with multiple reasons: 25
Cluster 1:
  - relocation sale
  - relocation services
  - dulicate sale

Cluster 2:
  - invalidlandfuture bld
  - invalid  ptrivate sale

Cluster 3:
  - sold for the land
  - sold to tenant

Cluster 4:
  - seller from out of state needed to liquidate
  - seller moved out of state needed to liquidate

Cluster 5:
  - going concern value included
  - going concern included in sale

Cluster 6:
  - guardians deed
  - warranty deed

Cluster 7:
  - out of state sale
  - out of state seller

Cluster 8:
  - sellers will remain living there
  - seller will remain in property

Cluster 9:
  - sale included equipment
  - sale involved 2 homes

Cluster 10:
  - invalid saleno consideration
  - invalidnew contruction
  - gift or no consideration

Cluster 11:
  - invesment property
  - lakefront property

Cluster 12:
  - invalid  sale of hwy land
  - invalid sold at listing
  - invalid grade is wrong
  - inva