In [13]:
import os
import pandas as pd

# Build the street-name dictionary: the 100 most frequent values found in the
# second column of the census workbooks, after removing known noise values.

# Folder holding the census workbooks (*.xlsx).
# NOTE(review): absolute, machine-specific path -- this cell only runs on a
# machine with this exact layout; consider a path relative to the repository.
folder_path = 'C:/Users/41763/Desktop/pmcintyr.github.io/recensements'

# Sorted so the workbooks are always read in the same order, whatever order
# the operating system happens to list them in.
xlsx_paths = [
    os.path.join(folder_path, filename)
    for filename in sorted(os.listdir(folder_path))
    if filename.endswith(".xlsx")
]

# Second column (selected by position) of every workbook, stacked into one Series.
second_columns = [pd.read_excel(file_path).iloc[:, 1] for file_path in xlsx_paths]
if second_columns:
    all_values = pd.concat(second_columns, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty Series so the
    # rest of the cell still produces (empty) outputs.
    all_values = pd.Series(dtype=object)

# Count the occurrences of each value.
# sort=False: ordering is done explicitly below with a stable sort.
# dropna=False: blank cells are counted like any other value.
# NOTE(review): confirm whether blank cells should be excluded from the dictionary.
occurrences = all_values.value_counts(sort=False, dropna=False)

# Plain {value: count} mapping, kept under this name for any later cell that reads it.
value_counts = occurrences.to_dict()

# Convert the counts to a DataFrame, most frequent value first.
# mergesort is stable, so values with equal counts keep a reproducible order
# and the top-100 cut below does not change from run to run.
result_df = pd.DataFrame(list(value_counts.items()), columns=['Value', 'Count'])
result_df = result_df.sort_values('Count', ascending=False, kind='mergesort')

# Values to filter out
values_to_filter = ["pre","martherey","st jean","laurent","de bourg","francois","annee de naissance","pierre","rue du pont","chauerau","chailly","marteray","fre","halle","jn francois","derriere","id id","laurens","grand s jean","grand jean","devant","cheneau","cile de vant","st laurens","et","cile derriere","du marche","patud","francoise","gd st jean","hallede st laurent","haurent","grand fs jean","st laur","d halle","walle","jn jean","grand","paud","frand chene","cite derrier","moulins de pepinet","rue de martheray","cheneau bourg","marthony","marthorey","jennes","grand f jean","salud","de francois","du sont","cuchy","sejan","le pont","duz re","montee de st laurent","marthe","chemin de bourg","place du pont","marthery","f pierre","martherey e","dre","de pierre","la barre","grand fr jean","d etienne","valud","st fran","chavanne","ft pierre","grand sn jean","aisse","cite devriere","id .","luite dri","no de leur recepisse","cile derrier","flaurent","della barre","mererie","halle de laurent","du re","chaucrau .","l walle","marthere","theneau de bourg","e aisse","rue de francois","cete derriere","no des","pennes","marthoray","etienne","marberay","st laurant","cite derric","ctraz","chaucraie","bourge","ste pierre","theneau bourg","halle de s laurt","slaurent","petit gjean","montee de st monte","lite de vans","l hopital","rue du pre","de mercerie","pont","monorier","nerie","no es","monte st laurent","uve","grangeneuve","sdu","dean","ler","luchy","de st laurent","no de","grand sjean","tre","cite dessons","monbe clauron","rapaz","pre du marche","sallaz","grotte","monblesson","halle f laurent so","veuf","marherey","chaueran","st martin","martberty","challes st laurens","chaz","boston","plaurent","de marthenay","cite ederriere","petit - jean","hallede tlaurent"]

# Drop the noise values; row order (most frequent first) is preserved.
result_df_filtered = result_df[~result_df['Value'].isin(values_to_filter)]

# Keep the 100 most frequent remaining values as the keyword dictionary.
dictionary = result_df_filtered.head(100)

# Save the dictionary to a CSV file
dictionary.to_csv("task1.csv", index=False)


# Dictionary issues:
# - "chaz" is probably a misreading of "etraz" rather than a distinct keyword => added "chaz" to values_to_filter
# - "boston" should be "bosson" => added "boston" to values_to_filter
# - "plaurent" should be "st laurent" rather than the keyword for "place de st laurent" => added "plaurent" to values_to_filter and removed "place t laurent" from it

In [14]:
from Levenshtein import distance

# Start from an empty result table: the raw street name, the dictionary
# keyword suggested for it, and the edit distance between the two.
result_columns = ['nom_rue', 'Suggested_Street_Name', 'Levenshtein_Distance']
final_result = pd.DataFrame(columns=result_columns)

# Nearest-keyword lookup used to suggest a street name
def find_closest_match(word, dictionary):
    """Return the dictionary keyword closest to ``word`` and its distance.

    Parameters
    ----------
    word : object
        Value to match; it is converted with ``str()`` before comparison.
    dictionary : mapping or DataFrame
        Object whose ``'Value'`` entry iterates over the candidate keywords.

    Returns
    -------
    tuple
        ``(keyword, distance)`` for the closest keyword, where ``keyword`` is
        the original (unconverted) entry. ``(None, float('inf'))`` when there
        are no candidates.
    """
    target = str(word)
    best = (None, float('inf'))
    for candidate in dictionary['Value']:
        gap = distance(target, str(candidate))
        # Strict "<" keeps the earliest keyword when several are equally close.
        if gap < best[1]:
            best = (candidate, gap)
    return best

# Suggest a dictionary keyword for every street name in every census workbook.

# find_closest_match only looks at str(street_name), so its result is cached
# per distinct spelling instead of being recomputed for every row.
match_cache = {}

# One frame per workbook, concatenated once at the end. (DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.)
matched_frames = []

# Sorted so the workbooks are processed in a reproducible order.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".xlsx"):
        continue
    file_path = os.path.join(folder_path, filename)
    df = pd.read_excel(file_path)

    suggestions = []
    distances = []
    # The street name is read from the second column by position.
    # NOTE(review): the exported column is 'nom_rue'; presumably that is the
    # second column of every workbook -- confirm.
    # NOTE(review): blank cells are matched as the string "nan"; confirm
    # whether they should be skipped instead.
    for street_name in df.iloc[:, 1]:
        cache_key = str(street_name)
        if cache_key not in match_cache:
            match_cache[cache_key] = find_closest_match(street_name, dictionary)
        closest_keyword, min_distance = match_cache[cache_key]
        suggestions.append(closest_keyword)
        distances.append(min_distance)

    # Add the suggestion and its Levenshtein distance as new columns
    df['Suggested_Street_Name'] = suggestions
    df['Levenshtein_Distance'] = distances

    # Keep only the columns needed in the final result
    matched_frames.append(df[['nom_rue', 'Suggested_Street_Name', 'Levenshtein_Distance']])

if matched_frames:
    final_result = pd.concat(matched_frames, ignore_index=True)
else:
    # No workbook found: pd.concat raises on an empty list, so build the
    # empty result table explicitly.
    final_result = pd.DataFrame(columns=['nom_rue', 'Suggested_Street_Name', 'Levenshtein_Distance'])

# Sort by distance, best matches first; mergesort is stable, so rows with the
# same distance keep their workbook order.
final_result = final_result.sort_values('Levenshtein_Distance', kind='mergesort')

# Save the final DataFrame to an Excel file
final_result.to_excel("distances.xlsx", index=False)


In [17]:
# Manual clean-up of the automatic suggestions.
# TODO: update Levenshtein_Distance for manually corrected suggestions (only
#       Suggested_Street_Name is changed below, so the distance still refers
#       to the automatic suggestion).
# TODO: "." means "same as the previous row"; teach that to the suggestion step.
# TODO: "rue" and "ruc" are matched to "barre" and "ouchy" but should be removed.
# TODO: "vichy" is matched to "ouchy" but should be removed (if it does not
#       appear in the dictionary).
# NOTE(review): none of the three removals above is implemented in this cell.

# nom_rue values whose rows are removed:
#   "." and ".|" : matched to "ais", which is 3 letters long, i.e. within the
#                  maximum allowed Levenshtein distance of 3
#   "grand"      : matched to "jean"
# NOTE(review): the spelling "·|" (middle dot) was also reported but is not
# dropped here -- confirm whether it occurs in the data.
rows_to_drop = ['.', '.|', 'grand']

# nom_rue value -> corrected suggestion (trailing comment: automatic suggestion)
manual_suggestions = {
    'fre': 'du pre',             # was "barre"
    'tre': 'du pre',             # was "barre"
    'gs jean': 'grand st jean',  # was "jean"
    'veuve': 'grange veuve',     # was "vennes"
    'chaz': 'etraz',             # was "jean"
    'pieur': 'st pierre',        # was "palud"
    'veuf': 'grange veuve',      # was "jean"
    'francois': 'st francois',   # was "francs"
    'halle': 'l halle',          # was "chally"
}

# .copy() gives an independent frame, so the .loc assignment below does not
# raise SettingWithCopyWarning.
keep_row = ~final_result['nom_rue'].isin(rows_to_drop)
final_result = final_result[keep_row].copy()

# Overwrite the suggestion on every row whose raw name has a manual correction.
to_correct = final_result['nom_rue'].isin(list(manual_suggestions))
final_result.loc[to_correct, 'Suggested_Street_Name'] = (
    final_result.loc[to_correct, 'nom_rue'].map(manual_suggestions)
)

# Overwrite the Excel export with the corrected table.
final_result.to_excel("distances.xlsx", index=False)

In [26]:
# Row counts per Levenshtein distance, ordered by distance (smallest first);
# only the first ten distances are printed.
statistics = (
    final_result['Levenshtein_Distance']
    .value_counts()
    .sort_index()
)
print("Top 10 Levenshtein minimum distance row counts")
print(statistics.head(10))

# How many rows have a suggestion within a distance of 3
within_three = final_result['Levenshtein_Distance'] <= 3
num_rows_levenshtein_3 = within_three.sum()
print("Number of rows with Levenshtein distance <= 3:", num_rows_levenshtein_3)

# Size of the whole result table
total_rows = final_result.shape[0]
print("Total number of rows:", total_rows)

# How often each keyword was suggested, most frequent first
keyword_statistics = final_result['Suggested_Street_Name'].value_counts()

print("Top 10 suggested keyword values:")
print(keyword_statistics.head(10))

Top 10 Levenshtein minimum distance row counts
0    31856
1     9383
2    10006
3    15896
4    10683
5     4741
6     6153
7     2061
8     1473
9     1044
Name: Levenshtein_Distance, dtype: int64
Number of rows with Levenshtein distance <= 3: 67141
Total number of rows: 95808
Top 10 suggested keyword values:
bourg            9284
jean             6602
martheray        4972
du pre           4736
st laurent       4282
palud            2707
ouchy            2643
st francois      2533
grand st jean    2490
st pierre        2354
Name: Suggested_Street_Name, dtype: int64
