In [14]:
import pandas as pd
import numpy as np
import jdatetime as jd
import re
from bidi.algorithm import get_display
import arabic_reshaper
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.style import use
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

use("ggplot")
def remove_excess_spaces(input_string):
    # Use re.sub() to replace multiple consecutive spaces with a single space
    result_string = re.sub(r'\s+', ' ', input_string)
    
    # Strip leading and trailing spaces
    result_string = result_string.strip()
    
    return result_string


def clean_string(input_string):
    # Define a regular expression pattern to match special characters (including Unicode)
    special_chars_pattern = r'[^\w\s]+'
    
    # Use re.sub() to replace matched special characters with spaces
    result_string = re.sub(special_chars_pattern, ' ', input_string)
    
    # Define a regular expression pattern to match digits (numbers)
    numbers_pattern = r'\d+'
    
    # Use re.sub() to remove matched digits from the result
    result_string = re.sub(numbers_pattern, '', result_string)
    
    return remove_excess_spaces(result_string.replace("_"," ").replace("V","").replace("ي","ی").replace("ك","ک").replace("آ","ا"))

def remove_words(input_string, word_list):
    for word in word_list:
        input_string = input_string.replace(word,"")
    return input_string

def cleaning(input_string, word_list):
    input_string = remove_words(input_string, word_list)
    return clean_string(input_string)





def make_farsi_text(x):
    reshaped_text = arabic_reshaper.reshape(x)
    farsi_text = get_display(reshaped_text)
    return farsi_text

def anote(ax1):
    for p in ax1.patches:
        h, w, x = p.get_height(), p.get_width(), p.get_x()
        xy = (x + w / 2., h / 2)
        text = f'{h:0.2f}'
        ax1.annotate(text=text, xy=xy, ha='center', va='center',fontsize=7, rotation=90)

def change_postfixes(name, postfix_list):
    for postfix in postfix_list:
        new_postfix = postfix.strip()
        name.replace(postfix, new_postfix)
    return name

In [15]:
def matcher(sales):
    postfix_list = [
        " زاده",
        " پور",
        " نژاد",
        " فر",

    ]
    word_list = [
    "دکتر",
    "داروخانه",
    "شبانه",
    "روزی",
    "روزانه",
    "بیمارستان",
    "درمانگاه",
    "گروه",
    "مرکزی",
    "مرکز",
    "درمانی",
    "پزشکی"]
    pharmas = sales["Customer"].apply(clean_string)
    pharmas = pharmas.apply(lambda w: remove_words(w, word_list))
    pharmas = pharmas.apply(lambda name: change_postfixes(name, postfix_list))
    sales["CustomerName_"] = pharmas
    
    pharmas = sales["Address"]
    pharmas = sales["Address"].fillna("").apply(clean_string)
    pharmas = pharmas.apply(lambda w: remove_words(w, word_list))
    sales["CustomerAddress_"] = pharmas
    
    vec_name = TfidfVectorizer()
    vec_name.fit(sales["CustomerName_"].to_list())
    vec_address = TfidfVectorizer()
    vec_address.fit(sales["CustomerAddress_"].to_list())
    results = []
    
    for prov in sales.Province.unique():
        subset = sales[sales.Province == prov].copy()
        names = list(subset.CustomerName_)
        address = list(subset.CustomerAddress_)
        name_vec = vec_name.transform(names)
        address_vec =  vec_address.transform(address)
        scores_name = cosine_similarity(name_vec,name_vec)
        scores_address = cosine_similarity(address_vec, address_vec)
        scores = 0.5*(scores_name + scores_address)
        scores = scores -(2*np.eye(scores.shape[0]))
        scores = np.triu(scores)
        idxs = scores.argmax(1)
        scores = np.array([scores[i,idxs[i]] for i in range(len(idxs))])
        keys_name = []
        values_name = []
        keys_address = []
        values_address = []
        for i in range(len(names)):
            if scores[i]>=0.5:
    
                key_name = names[i]
                value_neame = names[idxs[i]]
                key_address = address[i]
                value_address = address[idxs[i]]
                if key_name not in values_name:
                    keys_name.append(key_name)
                    values_name.append(value_neame)
    
                if key_address not in values_address:
                    keys_address.append(key_address)
                    values_address.append(value_address)
        matched_name = dict(zip(keys_name,values_name))
        matched_address = dict(zip(keys_address, values_address))
        subset.CustomerName_.replace(matched_name, inplace=True)
        subset.CustomerAddress_.replace(matched_address, inplace=True)
        results.append(subset)
    results = pd.concat(results,axis=0)
    results = results.groupby(["Province", "CustomerName_","CustomerAddress_"]).agg({"Total Sales":"sum",
                                                                          "Days from Last Sale":"min",
                                                                          "SDate":"max"}).reset_index()
    results = results.rename(columns={"CustomerName_":"Customer", "CustomerAddress_":"Address"})
    return results

In [16]:
sales = pd.read_csv("Cetronax.csv").dropna(axis=0)
sales = sales.groupby(["Province", "Customer","Address"]).agg({"Total Sales":"sum",
                                                                          "Days from Last Sale":"min",
                                                                          "SDate":"max"}).reset_index()
print(len(sales))
sales.to_excel("crude_results.xlsx")
for i in range(5):
    sales = matcher(sales)
sales.to_excel("results.xlsx")
print(len(sales))

6162
3052


In [17]:
sales

Unnamed: 0,Province,Customer,Address,Total Sales,Days from Last Sale,SDate
0,آذربایجان شرقی,ابوطالبی تبریز خیابان باغشمال,تبریز خیابان باغمشال مابین خیابان اشکان و ورزش...,1308,218,14011215
1,آذربایجان شرقی,اثنی عشر,توانیر فلکه گلپارک,18,124,14020318
2,آذربایجان شرقی,احدیان تبریز,تبریز خ شهیدمنتظری چهارسوق,40,184,14020120
3,آذربایجان شرقی,اذرابادگان احسان رادین مهر,تبریز خیابان مفتح اول همت اباد نرسیده به ایستگ...,68,23,14020626
4,آذربایجان شرقی,اذربایجان بناب,بناب خیابان شهید مطهری جنب بانک ملی,76,113,14020329
...,...,...,...,...,...,...
3047,یزد,پژوهشکده علوم تولید مثل یزد,یزد صفاییه بلوار بوعلی پژوهشکده علوم تولید مثل,0,237,14011126
3048,یزد,چمرانیزد,خیابان فرخی,11202,91,14020420
3049,یزد,کفیری یزد,یزد بلوار دشتی بعد از چهار راه امیر,110,663,14000925
3050,یزد,یحیی زاده,ارکان خ صدر اباد از طرف میدان انارابتدای خ انا...,48,96,14020415


In [13]:
"abcd".replace("a","k")

'kbcd'