In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz

def combined_similarity(name1, name2, vectorizer):
    """
    Score how alike two names are as a weighted blend of three measures.

    The result is ``0.5 * cosine + 0.3 * ratio + 0.2 * jaccard`` where:

    * cosine  -- cosine similarity between ``vectorizer.transform([name1])``
      and ``vectorizer.transform([name2])``;
    * ratio   -- ``fuzz.ratio`` of the lower-cased names, divided by 100;
    * jaccard -- size of the intersection over size of the union of the
      sets of characters of the lower-cased names.

    Parameters
    ----------
    name1, name2 : str
        The two names to compare.
    vectorizer : object
        Must expose ``transform``; ``match_lists`` passes a fitted
        ``TfidfVectorizer``.

    Returns
    -------
    float
        The weighted similarity score.
    """
    lowered1 = name1.lower()
    lowered2 = name2.lower()

    cosine_sim = cosine_similarity(vectorizer.transform([name1]), vectorizer.transform([name2]))[0][0]
    levenshtein_sim = fuzz.ratio(lowered1, lowered2) / 100.0

    chars1 = set(lowered1)
    chars2 = set(lowered2)
    all_chars = chars1 | chars2
    # Two empty names share no characters and have an empty union; score the
    # overlap as 0.0 rather than dividing by zero.
    jaccard_sim = len(chars1 & chars2) / len(all_chars) if all_chars else 0.0

    return (0.5 * cosine_sim) + (0.3 * levenshtein_sim) + (0.2 * jaccard_sim)

def match_lists(valeur_list1, valeur_list2, threshold):
    """
    Greedily pair names from two lists, taking the best-scoring pairs first.

    Every (list 1, list 2) pair is scored with ``combined_similarity`` using a
    character n-gram (2 to 4) ``TfidfVectorizer`` fitted on ``valeur_list1``.
    Pairs are then visited from highest to lowest score; a pair is kept when
    its score is at least ``threshold`` and neither of its names has already
    been used on its own side. Names are tracked by value, so a name repeated
    within one list is paired at most once.

    Parameters
    ----------
    valeur_list1, valeur_list2 : list of str
        The names to pair up.
    threshold : float
        Minimum score for a pair to be accepted.

    Returns
    -------
    tuple
        ``(best_matches_df, unmatched_list1, unmatched_list2)``: a DataFrame
        with columns "Company (List 1)", "Best Match (List 2)" and
        "Similarity Score", followed by the names of each list that ended up
        in no pair. The same three results are also printed.
    """
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
    # Fit only when there is at least one pair to score: fitting on an empty
    # list raises, and without pairs the vectorizer is never used.
    if len(valeur_list1) > 0 and len(valeur_list2) > 0:
        vectorizer.fit(valeur_list1)

    # Compute all match scores first
    all_match_scores = [
        (company1, company2, combined_similarity(company1, company2, vectorizer))
        for company1 in valeur_list1
        for company2 in valeur_list2
    ]

    # Sort by similarity score descending to ensure best matches are assigned first
    all_match_scores.sort(key=lambda x: x[2], reverse=True)

    # Track used names per list. A single shared set would let a name taken on
    # one side block the identical string on the other side, and would also
    # hide that blocked name from the "unmatched" output.
    best_matches = []
    matched_list1 = set()
    matched_list2 = set()
    for company1, company2, score in all_match_scores:
        if score >= threshold and company1 not in matched_list1 and company2 not in matched_list2:
            best_matches.append((company1, company2, score))
            matched_list1.add(company1)
            matched_list2.add(company2)

    # Convert to DataFrame
    best_matches_df = pd.DataFrame(best_matches, columns=["Company (List 1)", "Best Match (List 2)", "Similarity Score"])

    # Identify remaining unmatched companies
    unmatched_list1 = [company for company in valeur_list1 if company not in matched_list1]
    unmatched_list2 = [company for company in valeur_list2 if company not in matched_list2]

    print("Best Matched Companies and Their Scores:")
    print(best_matches_df)
    print("\nUnmatched Companies from List 1:")
    print(unmatched_list1)
    print("\nUnmatched Companies from List 2:")
    print(unmatched_list2)

    return best_matches_df, unmatched_list1, unmatched_list2

def remove_high_similarity(df, threshold=0.55):
    """
    Return the rows of ``df`` whose "Similarity Score" is at most ``threshold``.

    Rows scoring strictly above the threshold are left out of the result.
    """
    keep_mask = df["Similarity Score"].le(threshold)
    return df.loc[keep_mask]

def concat_and_save_csv(df1, df2, filename="merged_matches.csv"):
    """
    Stack the name columns of two match DataFrames and write them to a CSV file.

    Only "Company (List 1)" and "Best Match (List 2)" are taken from each
    frame (``df1`` rows first, then ``df2``). They are written to ``filename``
    under the headers "COMPANY" and "BEST MATCH", without the index, and a
    confirmation line is printed.
    """
    pair_columns = ["Company (List 1)", "Best Match (List 2)"]
    stacked = pd.concat([frame[pair_columns] for frame in (df1, df2)])
    stacked.columns = ["COMPANY", "BEST MATCH"]
    stacked.to_csv(filename, index=False)
    print(f"Merged DataFrame saved as {filename}")

In [2]:
import pandas as pd

# Paths of the three CSV files that feed the first list of names
file1_path = 'csv/continu.csv'
file2_path = 'csv/fixing.csv'
file3_path = 'csv/repartition_S.csv'

# Read each file into its own DataFrame
df1, df2, df3 = (pd.read_csv(path) for path in (file1_path, file2_path, file3_path))

# Chain the "VALEUR" column of every frame, in file order, into one flat list
valeur_list1 = [valeur for frame in (df1, df2, df3) for valeur in frame["VALEUR"].tolist()]

In [3]:
# Read the company/sector table
file_path = "csv/companies_sectors.csv"  # Change this if needed
df = pd.read_csv(file_path)

# Keep every row whose SECTOR is not exactly 'Hors Cote'
df = df.loc[df['SECTOR'].ne('Hors Cote')]

# NOTE: the filtered rows are written back to file_path itself, replacing the
# file that was just read (no separate output file is created)
df.to_csv(file_path, index=False)

# Collect the "VALEUR" column as a plain Python list
valeur_list2 = df["VALEUR"].tolist()

In [4]:
# The two lists of company names to pair up
list1, list2 = valeur_list1, valeur_list2

# threshold=0: any pair scoring at least 0 is eligible to be kept
best_matches_df, unmatched_list1, unmatched_list2 = match_lists(
    valeur_list1=list1,
    valeur_list2=list2,
    threshold=0,
)

Best Matched Companies and Their Scores:
      Company (List 1) Best Match (List 2)  Similarity Score
0                  TPR                 TPR          1.000000
1      CARTHAGE CEMENT     CARTHAGE CEMENT          1.000000
2               UNIMED              UNIMED          1.000000
3          EURO-CYCLES         EURO-CYCLES          1.000000
4           SOTIPAPIER          SOTIPAPIER          1.000000
..                 ...                 ...               ...
69                 SAH           SAH LILAS          0.522609
70    PLAC. TSIE-SICAF  PLACEM. DE TUNISIE          0.441454
71                 ICF              I.C.F.          0.351000
72             BH BANK                  BH          0.329752
73  ATELIER MEUBLE INT                 SAM          0.090333

[74 rows x 3 columns]

Unmatched Companies from List 1:
[]

Unmatched Companies from List 2:
[]


In [5]:
filename = "csv/crx.csv"

# Keep only the two name columns, give them upper-case headers, and write them out
merged_df = best_matches_df[["Company (List 1)", "Best Match (List 2)"]].rename(
    columns={"Company (List 1)": "COMPANY", "Best Match (List 2)": "BEST MATCH"}
)
merged_df.to_csv(filename, index=False)
print(f"Merged DataFrame saved as {filename}")

Merged DataFrame saved as csv/crx.csv
