In [52]:
import pandas as pd
import re
from IPython.display import display, HTML

In [53]:
pd.set_option('display.max_colwidth', -1)

## Columns names
estab_name = "Establishment Name"
cleaned_synon = "Cleansing Synonyms"

In [54]:
def read_file(exten, file = None, column = None):
    df = None
    if exten == 'csv':
        df = pd.read_csv('data/' + file, encoding = 'utf-8')
    elif exten == 'excel':
        df = pd.read_excel('data/' + file, column, encoding = 'utf-8')
    return df

def export_to_file(exten, df, file_name):
    if exten == 'excel':
        writer = pd.ExcelWriter(file_name)
        df.to_excel(writer,'Sheet1')
        writer.save()
    elif exten == 'csv':
        df.to_csv(file_name, encoding='utf-8', index=False)

### DQ Reading and summary

In [55]:
data = read_file("excel", "34K_Cleansing_source.xlsx", "data")

In [56]:
data.describe(include="all")

Unnamed: 0,Establishment Name,English Synonyms,Arabic Synonyms,Cleansing Synonyms
count,34407,1180,1191,0.0
unique,33338,279,359,
top,مطعم رستم,There is no Synonym available,There is no Synonym available,
freq,2,902,831,
mean,,,,
std,,,,
min,,,,
25%,,,,
50%,,,,
75%,,,,


## Cleansing

In [57]:
## Keep the original data
data_clean = data.copy()

### Remove English words

In [58]:
data_clean[estab_name] = data_clean[estab_name].str.replace("[a-zA-Z]+", '', regex=True)
data.describe(include="all")

Unnamed: 0,Establishment Name,English Synonyms,Arabic Synonyms,Cleansing Synonyms
count,34407,1180,1191,0.0
unique,33338,279,359,
top,مطعم رستم,There is no Synonym available,There is no Synonym available,
freq,2,902,831,
mean,,,,
std,,,,
min,,,,
25%,,,,
50%,,,,
75%,,,,


### Remove special chars

In [59]:
data_clean[cleaned_synon] = data_clean[estab_name].str.replace("[^\u0621-\u06FF]+", ' ', regex=True)
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: x.strip() if len(x.strip()) > 1 else x)

data_clean.describe()

Unnamed: 0,Establishment Name,English Synonyms,Arabic Synonyms,Cleansing Synonyms
count,34407.0,1180,1191,34407.0
unique,33104.0,279,359,31769.0
top,,There is no Synonym available,There is no Synonym available,
freq,70.0,902,831,364.0


In [60]:
display(data_clean[1120:1140])

Unnamed: 0,Establishment Name,English Synonyms,Arabic Synonyms,Cleansing Synonyms
1120,البحبي لمقاولات البناء,There is no Synonym available,There is no Synonym available,البحبي لمقاولات البناء
1121,الخياط العربي المميز للرجال,There is no Synonym available,There is no Synonym available,الخياط العربي المميز للرجال
1122,المحترف لنسخ المفاتيح,There is no Synonym available,There is no Synonym available,المحترف لنسخ المفاتيح
1123,الدار العقارية (ش.م.ع) - ياس مول - البدالة8,There is no Synonym available,There is no Synonym available,الدار العقارية ش م ع ياس مول البدالة
1124,البدر الجديد لقطع غيار السيارات,There is no Synonym available,There is no Synonym available,البدر الجديد لقطع غيار السيارات
1125,الذهب للصرافة - فرع ابو ظبي,There is no Synonym available,There is no Synonym available,الذهب للصرافة فرع ابو ظبي
1126,المخبز المصري,There is no Synonym available,There is no Synonym available,المخبز المصري
1127,المخلط للعطور (ذ.م.م),There is no Synonym available,There is no Synonym available,المخلط للعطور ذ م م
1128,المخلط للعطور (ش.ذ.م.م),There is no Synonym available,There is no Synonym available,المخلط للعطور ش ذ م م
1129,المداد للسيارات الجديدة والمستعملة,,,المداد للسيارات الجديدة والمستعملة


### Remove abbreviations

In [61]:
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: re.sub("\s+ش ذ م م(\s+|$)", ' ', x))
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: re.sub("\s+ذ م م(\s+|$)", ' ', x))
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: re.sub("\s+ذ م(\s+|$)", ' ', x))
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: re.sub("\s+ش م ع(\s+|$)", ' ', x))
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: re.sub("\s+(الفرع).*$", ' ', x))
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: re.sub("\s+(فرع).*$", ' ', x))
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: re.sub("\s+(مركز الاتصال).*$", ' ', x))
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: re.sub("\s+(مركز الاتص).*$", ' ', x))
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: re.sub("\s+(البدالة).*$", ' ', x))
data_clean[cleaned_synon] = data_clean[cleaned_synon].apply(lambda x: re.sub("\s+(اهلاً بك في اتصالات).*$", ' ', x))

data_clean.describe()

Unnamed: 0,Establishment Name,English Synonyms,Arabic Synonyms,Cleansing Synonyms
count,34407.0,1180,1191,34407.0
unique,33104.0,279,359,31409.0
top,,There is no Synonym available,There is no Synonym available,
freq,70.0,902,831,364.0


In [62]:
display(data_clean[1120:1140])

Unnamed: 0,Establishment Name,English Synonyms,Arabic Synonyms,Cleansing Synonyms
1120,البحبي لمقاولات البناء,There is no Synonym available,There is no Synonym available,البحبي لمقاولات البناء
1121,الخياط العربي المميز للرجال,There is no Synonym available,There is no Synonym available,الخياط العربي المميز للرجال
1122,المحترف لنسخ المفاتيح,There is no Synonym available,There is no Synonym available,المحترف لنسخ المفاتيح
1123,الدار العقارية (ش.م.ع) - ياس مول - البدالة8,There is no Synonym available,There is no Synonym available,الدار العقارية ياس مول
1124,البدر الجديد لقطع غيار السيارات,There is no Synonym available,There is no Synonym available,البدر الجديد لقطع غيار السيارات
1125,الذهب للصرافة - فرع ابو ظبي,There is no Synonym available,There is no Synonym available,الذهب للصرافة
1126,المخبز المصري,There is no Synonym available,There is no Synonym available,المخبز المصري
1127,المخلط للعطور (ذ.م.م),There is no Synonym available,There is no Synonym available,المخلط للعطور
1128,المخلط للعطور (ش.ذ.م.م),There is no Synonym available,There is no Synonym available,المخلط للعطور
1129,المداد للسيارات الجديدة والمستعملة,,,المداد للسيارات الجديدة والمستعملة


### Remove duplicates

In [63]:
data_clean.drop_duplicates(subset = {estab_name, cleaned_synon}, inplace = True)
data_clean.describe(include="all")

Unnamed: 0,Establishment Name,English Synonyms,Arabic Synonyms,Cleansing Synonyms
count,33104,1180,1191,33104.0
unique,33104,279,359,31409.0
top,لابيل للشوكولا والزهور - فرع 4,There is no Synonym available,There is no Synonym available,
freq,1,902,831,126.0


In [64]:
display(data_clean[:20])

Unnamed: 0,Establishment Name,English Synonyms,Arabic Synonyms,Cleansing Synonyms
0,كنتاكي,KFC,كي اف سي,كنتاكي
1,بيتزا هت,There is no Synonym available,There is no Synonym available,بيتزا هت
2,شركة الامارات للوجبات السريعة (ذ.م.م) - ماكدونالدز,MAC,ماك,شركة الامارات للوجبات السريعة ماكدونالدز
3,هارديز,There is no Synonym available,There is no Synonym available,هارديز
4,برجر كينج - تكساس تشيكن - فرع فيرست فود سيرفيسيز (ذ.م.م),There is no Synonym available,There is no Synonym available,برجر كينج تكساس تشيكن
5,مطعم الفروج الطازج,Al Farooj Fresh Chicken Restaurant,مطعم الفروج الطازج,مطعم الفروج الطازج
6,مطعم الخروف الذهبي,There is no Synonym available,There is no Synonym available,مطعم الخروف الذهبي
7,مصرف ابوظبي الاسلامي - مركز الاتصال - البدالة,ADIB Call Center,اديب بنك,مصرف ابوظبي الاسلامي
8,وزارة الداخلية,There is no Synonym available,الداخلية,وزارة الداخلية
9,مطعم بحر الامارات,There is no Synonym available,There is no Synonym available,مطعم بحر الامارات


## Remove synonyms = estab name

In [65]:
data_clean = data_clean.copy()
data_clean[cleaned_synon][data_clean[estab_name] == data_clean[cleaned_synon]] = None
display(HTML(data_clean[:500].to_html()))

Unnamed: 0,Establishment Name,English Synonyms,Arabic Synonyms,Cleansing Synonyms
0,كنتاكي,KFC,كي اف سي,
1,بيتزا هت,There is no Synonym available,There is no Synonym available,
2,شركة الامارات للوجبات السريعة (ذ.م.م) - ماكدونالدز,MAC,ماك,شركة الامارات للوجبات السريعة ماكدونالدز
3,هارديز,There is no Synonym available,There is no Synonym available,
4,برجر كينج - تكساس تشيكن - فرع فيرست فود سيرفيسيز (ذ.م.م),There is no Synonym available,There is no Synonym available,برجر كينج تكساس تشيكن
5,مطعم الفروج الطازج,Al Farooj Fresh Chicken Restaurant,مطعم الفروج الطازج,
6,مطعم الخروف الذهبي,There is no Synonym available,There is no Synonym available,
7,مصرف ابوظبي الاسلامي - مركز الاتصال - البدالة,ADIB Call Center,اديب بنك,مصرف ابوظبي الاسلامي
8,وزارة الداخلية,There is no Synonym available,الداخلية,
9,مطعم بحر الامارات,There is no Synonym available,There is no Synonym available,


## Testing

### Export to CSV

In [66]:
export_to_file("csv", data_clean, "data/34K_Cleansing_source_output.csv")