In [52]:
import os
import pandas as pd

# Count how often each value of the second column occurs across all workbooks.
# Keys are the cell values, values are the number of rows carrying that value.
value_counts = {}

# Folder holding the .xlsx workbooks to scan
folder_path = 'C:/Users/41763/Desktop/pmcintyr.github.io/recensements'

# os.listdir() returns names in arbitrary order. Sorting fixes the order in which
# values are first seen, so the ranking of equally frequent values is reproducible.
for filename in sorted(os.listdir(folder_path)):
    # "~$name.xlsx" files are lock files that Excel creates next to an open
    # workbook; they end in .xlsx but cannot be parsed as spreadsheets.
    if not filename.endswith(".xlsx") or filename.startswith("~$"):
        continue
    file_path = os.path.join(folder_path, filename)
    df = pd.read_excel(file_path)

    # Second column of the sheet, selected by position.
    # NOTE(review): presumably this is the 'nom_rue' column used by the later
    # cells - confirm against the workbooks.
    column_values = df.iloc[:, 1]

    # value_counts() skips missing cells (NaN), so an empty cell can never be
    # counted as a value and end up in the street-name dictionary.
    for value, occurrences in column_values.value_counts(sort=False).items():
        value_counts[value] = value_counts.get(value, 0) + int(occurrences)

# One row per distinct value with its total number of occurrences
result_df = pd.DataFrame(list(value_counts.items()), columns=['Value', 'Count'])

# Most frequent values first. mergesort is stable, so values with the same count
# keep their first-seen order instead of being shuffled by the default quicksort.
result_df = result_df.sort_values('Count', ascending=False, kind='mergesort')

# Values that must never become dictionary entries, even when they are frequent.
values_to_filter = [
    "pre", "martherey", "st jean", "laurent", "de bourg", "francois",
    "annee de naissance", "pierre", "rue du pont", "chauerau", "chailly",
    "marteray", "fre", "halle", "jn francois", "derriere", "id id", "laurens",
    "grand s jean", "grand jean", "devant", "cheneau", "cile de vant",
    "st laurens", "et", "cile derriere", "du marche", "patud", "francoise",
    "gd st jean", "hallede st laurent", "haurent", "grand fs jean", "st laur",
    "d halle", "walle", "jn jean", "grand", "paud", "frand chene",
    "cite derrier", "moulins de pepinet", "rue de martheray", "cheneau bourg",
    "marthony", "marthorey", "jennes", "grand f jean", "salud", "de francois",
    "du sont", "cuchy", "sejan", "le pont", "duz re", "montee de st laurent",
    "marthe", "chemin de bourg", "place du pont", "marthery", "f pierre",
    "martherey e", "dre", "de pierre", "la barre", "grand fr jean",
    "d etienne", "valud", "st fran", "chavanne", "ft pierre", "grand sn jean",
    "aisse", "cite devriere", "id .", "luite dri", "no de leur recepisse",
    "cile derrier", "flaurent", "della barre", "mererie", "halle de laurent",
    "du re", "chaucrau .", "l walle", "marthere", "theneau de bourg",
    "e aisse", "rue de francois", "cete derriere", "no des", "pennes",
    "marthoray", "etienne", "marberay", "st laurant", "cite derric", "ctraz",
    "chaucraie", "bourge", "ste pierre", "theneau bourg", "halle de s laurt",
    "slaurent", "petit gjean", "montee de st monte", "lite de vans",
    "l hopital", "rue du pre", "de mercerie", "pont", "monorier", "nerie",
    "no es", "monte st laurent", "uve", "grangeneuve", "sdu", "dean", "ler",
    "luchy", "de st laurent", "no de", "grand sjean", "tre", "cite dessons",
    "monbe clauron", "rapaz", "pre du marche", "sallaz", "grotte",
    "monblesson", "halle f laurent so", "veuf", "marherey", "chaueran",
    "st martin", "martberty", "challes st laurens", "chaz", "boston",
    "plaurent", "de marthenay", "cite ederriere", "petit - jean",
    "hallede tlaurent",
]

# Keep only the rows whose value is not on the exclusion list
is_excluded = result_df['Value'].isin(values_to_filter)
result_df_filtered = result_df[~is_excluded]

# The dictionary is the 100 most frequent remaining values
ranked_values = result_df_filtered.sort_values('Count', ascending=False)
dictionary = ranked_values.head(100)

# Persist the dictionary as CSV, without the index column
dictionary.to_csv("dictionary.csv", index=False)

In [53]:
from Levenshtein import distance

# Empty frame that fixes the column layout of the combined result
result_columns = ['nom_rue', 'Suggested_Street_Name', 'Levenshtein_Distance']
final_result = pd.DataFrame(columns=result_columns)

def find_closest_match(word, dictionary):
    """Return the dictionary entry closest to `word` and its edit distance.

    Both `word` and every entry of ``dictionary['Value']`` are converted with
    ``str()`` before being compared with ``distance``.

    Parameters:
        word: the raw value to look up.
        dictionary: an object whose ``'Value'`` item iterates over the candidates.

    Returns:
        A ``(keyword, distance)`` tuple. When several candidates share the
        smallest distance, the first one encountered wins. If there are no
        candidates the result is ``(None, inf)``.
    """
    target = str(word)
    best_keyword, best_distance = None, float('inf')
    for candidate in dictionary['Value']:
        candidate_distance = distance(target, str(candidate))
        # Only a strictly smaller distance replaces the current best,
        # so ties keep the earlier candidate.
        if candidate_distance >= best_distance:
            continue
        best_keyword, best_distance = candidate, candidate_distance
    return best_keyword, best_distance

# Suggest a dictionary street name for every row of every workbook in the folder.
# Per-workbook results are collected in a list and concatenated once at the end:
# DataFrame.append() was removed in pandas 2.0 and copies the whole frame on each call.
processed_frames = []

# A suggestion depends only on the text of the street name, so each distinct
# name is matched against the dictionary once and then reused.
suggestion_cache = {}

# Sorted so that the row order of the combined result is reproducible
for filename in sorted(os.listdir(folder_path)):
    # "~$name.xlsx" files are lock files that Excel creates next to an open
    # workbook; they end in .xlsx but cannot be parsed as spreadsheets.
    if not filename.endswith(".xlsx") or filename.startswith("~$"):
        continue
    file_path = os.path.join(folder_path, filename)
    df = pd.read_excel(file_path)

    # New columns for the suggested keyword and its Levenshtein distance
    df['Suggested_Street_Name'] = ''
    df['Levenshtein_Distance'] = 0

    # TODO: "." and "·|" mean "same as the previous line"; make the suggestion
    # system understand that (only "." and "·" are handled below).
    for index, row in df.iterrows():
        street_name = row['nom_rue']
        if street_name in (".", "·"):
            if index > 0:  # Check if it's not the first row
                # Read from df rather than from a saved row so that a run of
                # ditto marks inherits the name already copied into the previous row.
                street_name = df.loc[index - 1, 'nom_rue']
                if street_name in (".", "·"):
                    print("Problem with dots is here")
                else:
                    # Replace the ditto mark with the inherited street name
                    df.loc[index, 'nom_rue'] = street_name
            else:
                print("Problem with the first row having '.' or '·'")

        # find_closest_match() compares str(street_name), so that is the cache key
        cache_key = str(street_name)
        if cache_key not in suggestion_cache:
            suggestion_cache[cache_key] = find_closest_match(street_name, dictionary)
        closest_keyword, min_distance = suggestion_cache[cache_key]
        df.at[index, 'Suggested_Street_Name'] = closest_keyword
        df.at[index, 'Levenshtein_Distance'] = min_distance

    # Keep only the columns needed for the combined result
    df_temp = df[['nom_rue', 'Suggested_Street_Name', 'Levenshtein_Distance']]
    processed_frames.append(df_temp)

# With no workbook found, final_result keeps the empty frame created earlier in this cell
if processed_frames:
    final_result = pd.concat(processed_frames, ignore_index=True)

# Closest matches first
final_result = final_result.sort_values('Levenshtein_Distance')

final_result.to_excel("distances.xlsx", index=False)

In [54]:
# Manual corrections for raw names whose automatic suggestion is wrong.
# Keys are raw `nom_rue` values, values are the street name to suggest instead;
# the comment on each entry records the automatic match that is being replaced.
suggestion_overrides = {
    'fre': 'du pre',             # was matched to "barre"
    'tre': 'du pre',             # was matched to "barre"
    'gs jean': 'grand st jean',  # was matched to "jean"
    'veuve': 'grange veuve',     # was matched to "vennes"
    'chaz': 'etraz',             # was matched to "jean"
    'pieur': 'st pierre',        # was matched to "palud"
    'veuf': 'grange veuve',      # was matched to "jean"
    'francois': 'st francois',   # was matched to "francs"
    'halle': 'l halle',          # was matched to "chally"
}

# Raw names whose rows are dropped because no street name can be suggested for them.
unmappable_names = [
    'rue', 'ruc',   # were matched to "barre" and "ouchy" but should be deleted
    'boston',       # should map to "bosson", not "bourg", but "bosson" is not in the dictionary
    'et', 'id id',  # cannot be mapped to a street name
    'grand',        # was matched to "jean" but should be deleted
]

# TODO: "vichy" is matched to "ouchy" but should be removed
# (if it does not appear in the dictionary). Not implemented yet.

# Drop the unmappable rows. The explicit copy makes the result an independent
# frame, so the .loc assignments below do not raise SettingWithCopyWarning.
is_unmappable = final_result['nom_rue'].isin(unmappable_names)
final_result = final_result[~is_unmappable].copy()

# Apply the overrides. Only Suggested_Street_Name is rewritten; Levenshtein_Distance
# is left untouched, so on corrected rows it still refers to the automatic match.
for raw_name, corrected_name in suggestion_overrides.items():
    final_result.loc[final_result['nom_rue'] == raw_name, 'Suggested_Street_Name'] = corrected_name

# Overwrite the spreadsheet with the corrected suggestions
final_result.to_excel("distances.xlsx", index=False)

In [55]:
# Number of rows for each Levenshtein distance, ordered by distance
distance_column = final_result['Levenshtein_Distance']
statistics = distance_column.value_counts().sort_index()
print("Top 10 Levenshtein minimum distance row counts")
print(statistics[0:10])

# How many rows were matched with a distance of at most 3
within_three_edits = distance_column <= 3
num_rows_levenshtein_3 = sum(within_three_edits)
print("Number of rows with Levenshtein distance <= 3:", num_rows_levenshtein_3)

# Size of the combined result
total_rows = final_result.shape[0]
print("Total number of rows:", total_rows)

# How often each dictionary keyword was suggested, most frequent first
keyword_statistics = final_result['Suggested_Street_Name'].value_counts()

print("Top 10 suggested keyword values:")
print(keyword_statistics[0:10])

# The ten most frequently suggested street names
top_suggested_street_names = keyword_statistics.head(10).index.tolist()

# For each of them, the four raw street names that were mapped to it most often,
# stored as (raw name, occurrences) pairs
top_original_street_names = {
    suggested: list(
        final_result.loc[final_result['Suggested_Street_Name'] == suggested, 'nom_rue']
        .value_counts()
        .head(4)
        .items()
    )
    for suggested in top_suggested_street_names
}

# Report the raw names behind every top suggestion
for suggested, ranked_originals in top_original_street_names.items():
    print(f"For the suggested street name '{suggested}':")
    print("Top 4 original street names and their occurrences:")
    for rank, (raw_name, count) in enumerate(ranked_originals, start=1):
        print(f"{rank}. {raw_name}, Occurrences: {count}")
    print("----")


Top 10 Levenshtein minimum distance row counts
0    34740
1    10540
2    11432
3    15729
4     8172
5     4127
6     2868
7     2283
8     1593
9     1106
Name: Levenshtein_Distance, dtype: int64
Number of rows with Levenshtein distance <= 3: 72441
Total number of rows: 95348
Top 10 suggested keyword values:
martheray        5467
bourg            5367
du pre           5000
st laurent       4433
jean             3902
ouchy            2890
palud            2800
st francois      2800
grand st jean    2758
st pierre        2577
Name: Suggested_Street_Name, dtype: int64
For the suggested street name 'martheray':
Top 4 original street names and their occurrences:
1. martheray, Occurrences: 1477
2. martherey, Occurrences: 1437
3. marteray, Occurrences: 265
4. rue de martheray, Occurrences: 94
----
For the suggested street name 'bourg':
Top 4 original street names and their occurrences:
1. bourg, Occurrences: 3878
2. de bourg, Occurrences: 824
3. bourge, Occurrences: 56
4. bourgeois, Occurre