# DATA QUALITY - Fruital census

# Library

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import pandas_udf
from pyspark.sql.functions import udf
from typing import List, Dict, Optional, Callable, Union
import re
import string
import pyspark
import shapely
import pandas as pd
import geopandas as gpd
import nltk
import unidecode
from shapely.geometry import Polygon, Point
from geopandas import GeoDataFrame
import os
import fastparquet 
from nltk import ngrams
from ngram import NGram
from textdistance import damerau_levenshtein
from textdistance import jaro_winkler
from textdistance import sorensen_dice
from textdistance import jaccard
from textdistance import overlap
from textdistance import ratcliff_obershelp 


# Matching preparation functions

**Function tools**

In [5]:
def function_vectorizer(input_function: Callable) -> Callable:
    """Lift a scalar function so that it is applied element-wise to iterables.

    The returned wrapper takes one iterable per positional argument of
    ``input_function`` (such as columns of a Spark DataFrame), walks them in
    lockstep and calls ``input_function`` once per position.
    The output is always a pandas Series to ensure compliance with Spark
    DataFrames.

    Arguments:
        input_function {Callable} -- The function to apply to each element.

    Returns:
        Callable -- The vectorized function (i.e. acting on each element of
        the iterables and returning a pandas Series of the results).
    """

    def vectorized_function(*args):
        results = []
        # zip stops at the shortest iterable, exactly one call per aligned position.
        for elements in zip(*args):
            results.append(input_function(*elements))
        return pd.Series(results)

    return vectorized_function


**Text processing**

In [6]:
# Extra tokens removed from outlet names on top of the language stop words
# (passed as `word_blacklist` to make_text_prep_func).
name_column_blacklist = ["cafe", "cf", "restaurant", "estaurant", "rest", "ag", "ste", "café", "snack", "hotel", "sarl", "rotisserie", "marrakech"]
# Regex pattern -> replacement pairs applied to cleaned names: drop apostrophes,
# drop 5-digit runs, collapse runs of whitespace into a single space.
name_column_regex_replace = {r"\'": "", r"\d{5}": "", r"\s+": " "}
# No address-specific tokens are blacklisted.
address_column_blacklist = []
# Regex pattern -> replacement pairs applied to cleaned addresses: drop apostrophes,
# collapse whitespace, and abbreviate "avenu " / "boulevard " to "av " / "bd ".
address_column_regex_replace = {r"\'": "", r"\s+": " ", "avenu ": "av ", "boulevard ": "bd "}

In [7]:
# Snowball stemmer was chosen in favor of Porter Stemmer which is a bit more aggressive and tends to remove too much from a word
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("stopwords")
 
# unidecode is the library needed for ASCII folding
from unidecode import unidecode
import string
# langdetect provides language detection (imported below as `detect`): more info here: https://pypi.org/project/langdetect/
import re
 
import pyspark.sql.functions as F
from typing import List, Dict, Optional, Callable
from langdetect import detect

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [8]:
# Cleaning configuration (same values as the configuration cell above).
name_column_blacklist = ["cafe", "cf", "restaurant", "estaurant", "rest", "ag", "ste", "café", "snack", "hotel", "sarl", "rotisserie", "marrakech"]
name_column_regex_replace = {r"\'": "", r"\d{5}": "", r"\s+": " "}
address_column_blacklist = []
address_column_regex_replace = {r"\'": "", r"\s+": " ", "avenu ": "av ", "boulevard ": "bd "}

# NLTK resources are built once and shared by every call to make_text_prep_func
# instead of being re-created for each row.
_TEXT_PREP_RESOURCES = {}


def _get_text_prep_resources():
    """Return the cached (French stop-word set, French Snowball stemmer) pair."""
    if not _TEXT_PREP_RESOURCES:
        try:
            french_stop_words = stopwords.words("french")
        except LookupError:
            # NLTK raises LookupError when a corpus is not installed locally:
            # download the resources once and retry.
            nltk.download("punkt")
            nltk.download("stopwords")
            french_stop_words = stopwords.words("french")
        _TEXT_PREP_RESOURCES["stop_words"] = frozenset(french_stop_words)
        _TEXT_PREP_RESOURCES["stemmer"] = SnowballStemmer(language="french")
    return _TEXT_PREP_RESOURCES["stop_words"], _TEXT_PREP_RESOURCES["stemmer"]


def make_text_prep_func(row, word_blacklist, regex_replace, colonne):
    """Normalize the text held in ``row[colonne]`` for fuzzy matching.

    Steps: lower-case, replace punctuation with spaces, tokenize, remove French
    stop words and the blacklisted words, stem with the French Snowball
    stemmer, fold to ASCII, apply the regex replacements, strip.

    French resources are always used. An earlier version called langdetect's
    ``detect`` but then unconditionally overwrote the outcome with the French
    stop words and stemmer, so the detection never influenced the result while
    still being able to raise on text without detectable features; the call
    has therefore been removed.

    Arguments:
        row -- Mapping-like object (e.g. a DataFrame row) indexed by column name.
        word_blacklist -- Iterable of extra tokens to remove.
        regex_replace -- Dict of regex pattern -> replacement, applied in order
            to the ASCII-folded text.
        colonne -- Name of the column to clean.

    Returns:
        str -- The cleaned text; "" when the cell is None, NaN/NA or empty.
    """
    s = row[colonne]

    # Missing cells (None, float NaN, pd.NA) are treated like empty text;
    # other non-string values are cleaned through their string form.
    if not isinstance(s, str):
        if s is None or s is pd.NA or (isinstance(s, float) and s != s):
            return ""
        s = str(s)
    if s == "":
        return ""

    french_stop_words, stemmer = _get_text_prep_resources()
    stop_words = french_stop_words | set(word_blacklist)

    s = s.lower()
    # Every punctuation character becomes a space so that it splits tokens.
    s_clean = s.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))
    s_tokens = word_tokenize(s_clean)
    s_tokens_stemmed = [stemmer.stem(word) for word in s_tokens if word not in stop_words]
    s_ascii = unidecode(" ".join(s_tokens_stemmed))

    for regex, replace in regex_replace.items():
        s_ascii = re.sub(regex, replace, s_ascii)
    return s_ascii.strip()



In [9]:
address_column_blacklist

[]

**geospatial function**

In [38]:
def haversine_distance(row):
    """Great-circle distance, in metres, between the two points stored on a row.

    The first point is read from ``row.R_place_longitude`` /
    ``row.R_place_latitude`` and the second from ``row.L_LONGITUDE`` /
    ``row.L_LATITUDE``; the coordinates are converted from degrees to radians
    before applying the haversine formula with a sphere radius of 6371000 m.

    Returns:
        The absolute distance rounded to 2 decimals.
    """
    sphere_radius_m = 6371000

    # Degrees -> radians, in the order (lon A, lat A, lon B, lat B).
    lon_a, lat_a, lon_b, lat_b = (
        np.radians(coordinate)
        for coordinate in (row.R_place_longitude, row.R_place_latitude, row.L_LONGITUDE, row.L_LATITUDE)
    )
    delta_lon = lon_b - lon_a
    delta_lat = lat_b - lat_a

    # Haversine term: hav(dlat) + cos(lat A) * cos(lat B) * hav(dlon).
    haversine_term = np.sin(delta_lat/2)**2 + np.cos(lat_a) * np.cos(lat_b) * np.sin(delta_lon/2)**2
    central_angle = 2 * np.arcsin(np.sqrt(haversine_term))

    return abs(round(central_angle * sphere_radius_m, 2))
  
# haversine_distance_sdf = F.pandas_udf(function_vectorizer(haversine_distance),"double")

In [11]:
def sdf_to_gdf(sdf: pyspark.sql.dataframe.DataFrame,
               longitude: str = "longitude",
               latitude: str = "latitude",
               crs: str = "epsg:4326"
              ) -> GeoDataFrame:
  """Collect a Spark DataFrame to pandas and wrap it as a point GeoDataFrame.

  Arguments:
      sdf -- Spark DataFrame; it is fully collected with ``toPandas()``.
      longitude -- Name of the column used as x coordinate.
      latitude -- Name of the column used as y coordinate.
      crs -- Coordinate reference system assigned to the geometry.

  Returns:
      GeoDataFrame -- The collected rows with one point geometry per row.
  """
  pandas_frame = sdf.toPandas()
  point_geometry = gpd.points_from_xy(pandas_frame[longitude], pandas_frame[latitude])
  return gpd.GeoDataFrame(pandas_frame, geometry=point_geometry, crs=crs)

**Text similarity**

In [12]:
def compound_similarity(row,col1,col2):
    """Mean of seven string-similarity scores between two columns of a row.

    ``None`` values are treated as empty strings. When both strings are empty
    the score is 0. Otherwise the result is the unweighted mean of the
    normalized Damerau-Levenshtein, Jaro-Winkler, Sorensen-Dice, Jaccard,
    overlap and Ratcliff-Obershelp similarities plus the bigram (N=2) NGram
    comparison.

    Arguments:
        row -- Mapping-like object indexed by column name.
        col1 -- Name of the first text column.
        col2 -- Name of the second text column.

    Returns:
        float -- The averaged similarity score.
    """
    left, right = ("" if value is None else value for value in (row[col1], row[col2]))

    # Two empty strings carry no evidence of a match.
    if left == "" and right == "":
        return 0.

    textdistance_metrics = (
        damerau_levenshtein,
        jaro_winkler,
        sorensen_dice,
        jaccard,
        overlap,
        ratcliff_obershelp,
    )
    scores = [metric.normalized_similarity(left, right) for metric in textdistance_metrics]
    scores.append(NGram.compare(left, right, N=2))
    return np.mean(scores)

# Data :CMD Golden

**cmd dataset**

In [13]:
# Load the customer invoice workbook and keep only the id, coordinates, name and address columns.
cmd = pd.read_excel("D:/data_quality/data/customer_invoice_tizi_ouzou.xlsx") [["Client", "LONGITUDE", "LATITUDE", "Nom", "Adresse"]]

In [14]:
# Rename the customer id to CUSTOMER_COD, the key used to merge with the golden-id list below.
cmd=cmd.rename(columns={"Client":"CUSTOMER_COD"})
cmd

Unnamed: 0,CUSTOMER_COD,LONGITUDE,LATITUDE,Nom,Adresse
0,72011,4.103267,36.607317,MANOUCHE AMAR,RUE TALA ATHMANE
1,72009,4.080755,36.620868,SADAOUI HAMID,RUE AMIROUCHE
2,72000,4.087680,36.617363,SARL TOUTE ELLA,RUE IGHIL HAMOU
3,71999,4.043520,36.695920,BOUZERD ABDENOUR,BENI DOUALA CENTRE
4,71998,4.178637,36.768272,DLIM SAID,RESIDENCE HADJILI BOUKHALFA
...,...,...,...,...,...
5572,29040,3.740273,36.612807,MDN/74° B.I.A (FOYER)TIZI-OUZOU,TIZI-OUZOU W-TIZI-OUZOU
5573,29012,4.317823,36.758588,MDN/12°G.I.R(FOYER) FREHA,FREHA
5574,29002,3.739110,36.623918,MDN/10°B.P.C(TAFSA),AZZAZGA W-TIZI-OUZOU
5575,28996,2.816000,36.472308,MDN/74 B.I.A (ORDINAIRE)TIZI-OUZOU,TIZI-OUZOU


**cmd golden Id**

In [15]:
# Read the customer subset file and keep only its CUSTOMER_COD column.
# NOTE(review): absolute, user-specific path — the notebook only runs on this machine as written.
cmd_golden_uri="C:/Users/Salif SAWADOGO/OneDrive - EQUATORIAL COCA-COLA BOTTLING COMPANY S.L/dynamic segmentation/matching/output/horeca_tz_customer_subset.csv"
cmd_golden_ids= pd.read_csv(cmd_golden_uri) [["CUSTOMER_COD"]]

In [16]:
cmd_golden_ids

Unnamed: 0,CUSTOMER_COD
0,56341
1,38014
2,38861
3,54921
4,37832
...,...
193,37785
194,38635
195,41385
196,54782


**merge datasets**

In [17]:
# Inner merge (pandas default): keep only the customers whose CUSTOMER_COD appears in the golden-id list.
cmd_golden = cmd.merge(cmd_golden_ids, on="CUSTOMER_COD") 
cmd_golden

Unnamed: 0,CUSTOMER_COD,LONGITUDE,LATITUDE,Nom,Adresse
0,67250,3.842823,36.537640,FOUNES HAKIM,OUACIF CENTRE
1,65126,4.059238,36.702653,COFFEE TIME,BVD LES BOUZARD KRIM BELKACEM
2,65115,3.959820,36.723993,RABHI SAMIR,COMMUNE AIN ZAOUIA CENTRE
3,65113,4.366767,36.666388,SADJI YOUCEF,MAKOUDA CENTRE
4,65061,3.955950,36.734243,AMAR CHERGUI,RUE DES FRERES MAMMERI
...,...,...,...,...,...
193,37785,4.208080,36.678827,CHEBAH RABAH,YAKOURENE CENTRE
194,37777,4.206327,36.526737,AGOUAZI KAMEL,YAKOURENE CENTRE
195,37775,4.206720,36.528201,DARANI SOUFIANE,YAKOURENE CENTRE
196,37766,4.337410,36.748385,TIACHTINE RACHID,TAZAGHARTE AZAZGA


**CLEAN string for Analysis**

In [18]:
# Row-wise cleaning of the "Adresse" column with the address blacklist and regex replacements.
cmd_golden["ADRESSE_CLEAN"]=cmd_golden.apply(lambda p:make_text_prep_func(p, address_column_blacklist, address_column_regex_replace,"Adresse"),axis=1)

In [19]:
# Row-wise cleaning of the "Nom" column with the name blacklist and regex replacements.
cmd_golden["NOM_CLEAN"]=cmd_golden.apply(lambda p:make_text_prep_func(p, name_column_blacklist, name_column_regex_replace,"Nom"),axis=1)

# Data: Google Maps restaurants

**Reading**

In [20]:
google_restaurant ="restaurant_250m.xlsx"

In [24]:
# `tripadvisor_data_uri` is not defined anywhere in this notebook (NameError on
# a fresh kernel); the workbook to read is the one named in the previous cell.
google_restaurant_uri = "restaurant_250m.xlsx"
google_restaurant = pd.read_excel(google_restaurant_uri)
google_restaurant

Unnamed: 0.1,Unnamed: 0,business_type,place_name,place_id,place_latitude,place_longitude,place_address,place_circle,place_user_ratings_total,place_rating,place_types
0,0,restaurant,El Gargabia,ChIJHc8k6jh1jRIRLWmmBvncSRQ,36.910246,4.431294,"N24, Azeffoun",2,3.0,5.0,"['restaurant', 'food', 'point_of_interest', 'e..."
1,1,restaurant,El Gargabia,ChIJHc8k6jh1jRIRLWmmBvncSRQ,36.910246,4.431294,"N24, Azeffoun",3,3.0,5.0,"['restaurant', 'food', 'point_of_interest', 'e..."
2,2,restaurant,La terrasse du bateau,ChIJFwvyn6-VjRIRbJk8CtjKvrA,36.902533,4.148954,"W43X+2H8, Iflissen",28,1.0,5.0,"['restaurant', 'food', 'point_of_interest', 'e..."
3,3,restaurant,Panorama restaurant,ChIJ959ZiX6ZjRIRtlQrSCmb6Jw,36.904278,4.296403,N24,49,10.0,4.1,"['restaurant', 'food', 'point_of_interest', 'e..."
4,4,restaurant,AZEFFOUN,ChIJ____uBJ1jRIRUqOFlI-Qhb0,36.903226,4.419013,PORT D'AZEFFOUN,51,110.0,4.4,"['restaurant', 'food', 'point_of_interest', 'e..."
...,...,...,...,...,...,...,...,...,...,...,...
814,814,restaurant,GÎte Thaletat,ChIJixfijzxTjBIRVoAQ5_hs4tQ,36.497225,4.235737,"F6WP+V7X, Ouacifs",19513,21.0,3.6,"['restaurant', 'food', 'point_of_interest', 'e..."
815,815,restaurant,L'escale de djurdjura,ChIJke2BrhlTjBIRnZF3X6sV700,36.495591,4.246338,"N30, Iboudraren",19516,,,"['restaurant', 'food', 'point_of_interest', 'e..."
816,816,restaurant,Restaurant Talwit,ChIJb3QqcNY1jBIRWDeIcll0_kI,36.491111,3.988154,"N30B, Boghni",19753,,,"['restaurant', 'food', 'point_of_interest', 'e..."
817,817,restaurant,Caféteria,ChIJd6qc8-ozjBIRPvZBki2vT4U,36.476472,3.968891,"FXG9+HHJ, Aït Koufi",20159,1.0,3.0,"['bar', 'restaurant', 'food', 'point_of_intere..."


**clean  addresses and names**

In [25]:
# Clean the place names of the frame being enriched; the previous code applied
# the function to `ta`, a name that is never defined in this notebook.
google_restaurant["name_CLEAN"] = google_restaurant.apply(
    lambda p: make_text_prep_func(p, name_column_blacklist, name_column_regex_replace, "place_name"),
    axis=1,
)

In [26]:
# Clean the place addresses of the frame being enriched; the previous code applied
# the function to `ta`, a name that is never defined in this notebook.
google_restaurant["address_CLEAN"] = google_restaurant.apply(
    lambda p: make_text_prep_func(p, address_column_blacklist, address_column_regex_replace, "place_address"),
    axis=1,
)

In [34]:
google_restaurant.columns

Index(['Unnamed: 0', 'business_type', 'place_name', 'place_id',
       'place_latitude', 'place_longitude', 'place_address', 'place_circle',
       'place_user_ratings_total', 'place_rating', 'place_types', 'name_CLEAN',
       'address_CLEAN'],
      dtype='object')

## similarity analysis

**cross join Google maps data and cmd golden**

In [31]:
def match_join(l_sdf,
               l_id,
               l_lon,
               l_lat,
               l_name,
               l_addr,
               r_sdf,
               r_id,
               r_lon,
               r_lat,
               r_name,
               r_addr,
               distance_threshold_m,
               minimal = True,
               l_extra_cols = ("Nom", "Adresse"),
               r_extra_cols = ("place_name", "place_address"),
              ):
  """Cross join a slice of the left frame with a slice of the right frame.

  Every left row is paired with every right row. Left columns are prefixed
  with "L_" and right columns with "R_". The input frames are not modified.

  Arguments:
      l_sdf, r_sdf -- pandas DataFrames to pair.
      l_id, l_lon, l_lat, l_name, l_addr -- Left column names to keep.
      r_id, r_lon, r_lat, r_name, r_addr -- Right column names to keep.
      distance_threshold_m -- Accepted for interface compatibility; not used
          by the join itself.
      minimal -- Accepted for interface compatibility; not used.
      l_extra_cols -- Additional left columns carried along (defaults to the
          raw name/address columns that were previously hard-coded).
      r_extra_cols -- Additional right columns carried along (same remark).

  Returns:
      DataFrame -- len(l_sdf) * len(r_sdf) rows, "L_*" columns then "R_*".
  """
  l_columns = [l_id, l_lon, l_lat, l_name, l_addr, *l_extra_cols]
  r_columns = [r_id, r_lon, r_lat, r_name, r_addr, *r_extra_cols]

  # add_prefix/assign return new frames, so no value is ever set on a slice.
  # The constant "key" column turns the merge into a cross join; it cannot
  # collide with a data column because those are all prefixed.
  l_slice = l_sdf[l_columns].add_prefix("L_").assign(key=1)
  r_slice = r_sdf[r_columns].add_prefix("R_").assign(key=1)

  # `columns=` is used because the positional axis argument of drop() is no
  # longer accepted by pandas 2.
  inner_joined = l_slice.merge(r_slice, on="key").drop(columns="key")
  return inner_joined

In [35]:
# Pair every golden customer with every Google place. The trailing 2000 is passed as
# distance_threshold_m; match_join does not filter on it, so the result is a full cross join.
matched = match_join(cmd_golden, "CUSTOMER_COD", "LONGITUDE", "LATITUDE", "NOM_CLEAN", "ADRESSE_CLEAN", google_restaurant, "place_id", "place_longitude", "place_latitude", "name_CLEAN", "address_CLEAN", 2000)

In [36]:
matched.nunique()

L_CUSTOMER_COD       198
L_LONGITUDE          198
L_LATITUDE           198
L_NOM_CLEAN          198
L_ADRESSE_CLEAN      166
L_Nom                198
L_Adresse            171
R_place_id           593
R_place_longitude    591
R_place_latitude     591
R_name_CLEAN         546
R_address_CLEAN      459
R_place_name         571
R_place_address      460
dtype: int64

**Compute distance between Google Maps outlets and cmd golden outlets**

In [39]:
# Haversine distance (metres) of each customer/place pair, computed row by row.
matched["dist_m"] = matched.apply(haversine_distance, axis=1)

In [40]:
matched.shape

(162162, 15)

In [None]:
#matched=matched.loc[matched["dist_m"]<=2000]
matched["dist_m"].hist()

In [41]:
distance_threshold_m=2000

**distance similarity**

In [42]:
# Linear distance score: 1 at 0 m, 0 at the threshold, and negative beyond it
# (the score is not clipped, see the values in the table displayed below).
matched["dist_similarity"] =(distance_threshold_m - matched["dist_m"])/distance_threshold_m

In [43]:
matched


Unnamed: 0,L_CUSTOMER_COD,L_LONGITUDE,L_LATITUDE,L_NOM_CLEAN,L_ADRESSE_CLEAN,L_Nom,L_Adresse,R_place_id,R_place_longitude,R_place_latitude,R_name_CLEAN,R_address_CLEAN,R_place_name,R_place_address,dist_m,dist_similarity
0,67250,3.842823,36.537640,foun hakim,ouac centr,FOUNES HAKIM,OUACIF CENTRE,ChIJHc8k6jh1jRIRLWmmBvncSRQ,4.431294,36.910246,el gargabi,n24 azeffoun,El Gargabia,"N24, Azeffoun",66838.15,-32.419075
1,67250,3.842823,36.537640,foun hakim,ouac centr,FOUNES HAKIM,OUACIF CENTRE,ChIJHc8k6jh1jRIRLWmmBvncSRQ,4.431294,36.910246,el gargabi,n24 azeffoun,El Gargabia,"N24, Azeffoun",66838.15,-32.419075
2,67250,3.842823,36.537640,foun hakim,ouac centr,FOUNES HAKIM,OUACIF CENTRE,ChIJFwvyn6-VjRIRbJk8CtjKvrA,4.148954,36.902533,terr bateau,w43x 2h8 iflissen,La terrasse du bateau,"W43X+2H8, Iflissen",48895.38,-23.447690
3,67250,3.842823,36.537640,foun hakim,ouac centr,FOUNES HAKIM,OUACIF CENTRE,ChIJ959ZiX6ZjRIRtlQrSCmb6Jw,4.296403,36.904278,panoram,n24,Panorama restaurant,N24,57414.22,-27.707110
4,67250,3.842823,36.537640,foun hakim,ouac centr,FOUNES HAKIM,OUACIF CENTRE,ChIJ____uBJ1jRIRUqOFlI-Qhb0,4.419013,36.903226,azeffoun,port azeffoun,AZEFFOUN,PORT D'AZEFFOUN,65497.29,-31.748645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162157,37757,4.305728,36.574091,sagh youcef,ru colonel amirouch azazg,SAGHI YOUCEF,RUE COLONEL AMIROUCHE AZAZGA,ChIJixfijzxTjBIRVoAQ5_hs4tQ,4.235737,36.497225,git thaletat,f6wp v7x ouac,GÎte Thaletat,"F6WP+V7X, Ouacifs",10590.38,-4.295190
162158,37757,4.305728,36.574091,sagh youcef,ru colonel amirouch azazg,SAGHI YOUCEF,RUE COLONEL AMIROUCHE AZAZGA,ChIJke2BrhlTjBIRnZF3X6sV700,4.246338,36.495591,escal djurdjur,n30 iboudraren,L'escale de djurdjura,"N30, Iboudraren",10215.06,-4.107530
162159,37757,4.305728,36.574091,sagh youcef,ru colonel amirouch azazg,SAGHI YOUCEF,RUE COLONEL AMIROUCHE AZAZGA,ChIJb3QqcNY1jBIRWDeIcll0_kI,3.988154,36.491111,talw,n30b boghn,Restaurant Talwit,"N30B, Boghni",29836.86,-13.918430
162160,37757,4.305728,36.574091,sagh youcef,ru colonel amirouch azazg,SAGHI YOUCEF,RUE COLONEL AMIROUCHE AZAZGA,ChIJd6qc8-ozjBIRPvZBki2vT4U,3.968891,36.476472,cafeteri,fxg9 hhj ait kouf,Caféteria,"FXG9+HHJ, Aït Koufi",31995.73,-14.997865


**addresses and names similarities**

In [44]:
# Averaged string similarity between the cleaned left/right names, then between the cleaned addresses.
matched["name_similarity"]=matched.apply(lambda p:compound_similarity(p,"L_NOM_CLEAN","R_name_CLEAN"),axis=1)
matched["address_similarity"]=matched.apply(lambda p:compound_similarity(p,"L_ADRESSE_CLEAN","R_address_CLEAN"),axis=1)

**similarity overall**

In [45]:
# Weighted overall score: 0.15 * name + 0.80 * distance + 0.05 * address (weights sum to 1).
# NOTE(review): the rationale for these weights is not stated in the notebook.
matched["similarity"]= matched["name_similarity"]*0.15 +  matched["dist_similarity"]*0.8+matched["address_similarity"]*0.05

**Rank by Customer ID**

In [46]:
# Dense rank of the candidates within each customer; rank 1 is the highest similarity.
matched["rank"]=matched.groupby(by="L_CUSTOMER_COD")["similarity"].rank("dense", ascending=False)

In [47]:
(matched["rank"]<=15).value_counts()

False    158119
True       4043
Name: rank, dtype: int64

In [48]:
matched_filter=matched.loc[matched["rank"]<=15]

In [49]:
matched_filter.to_excel("C:/Users/Salif SAWADOGO/OneDrive - EQUATORIAL COCA-COLA BOTTLING COMPANY S.L/dynamic segmentation/matching/output/manual_match_google.xlsx")

In [None]:
matched.L_CUSTOMER_COD.nunique()

In [None]:
cmd_golden.CUSTOMER_COD.nunique()

In [None]:
# Tizi Ouzou shape (TZ.shp) to filter for
tizi_ouzou = gpd.read_file(os.path.join("C:/Users/Salif SAWADOGO/OneDrive - EQUATORIAL COCA-COLA BOTTLING COMPANY S.L/dynamic segmentation/urbanicty/Tizi ouzou shapefile", "TZ.shp")) 

In [None]:
# NOTE(review): `temp` is only assigned in a later cell of this notebook, so this
# cell fails on a fresh top-to-bottom run — confirm the intended right-hand frame.
sub_set = gpd.sjoin(tizi_ouzou,temp, op="intersects")

In [None]:
from matplotlib import pyplot as plt
fig, ax = plt.subplots(figsize=(10, 10)) 
sub_set = gpd.sjoin(tizi_ouzou,ta_gdf, op="intersects")
sub_set.plot(ax=ax, color='darkred', lw=0.5)
tizi_ouzou.geometry.boundary.plot(color=None,edgecolor='k',linewidth = 1,ax=ax) 
gpd.GeoDataFrame(sub_set, 
                 geometry=gpd.points_from_xy(sub_set["location_lon"],sub_set["location_lat"]), 
                 crs="epsg:4326").plot(ax=ax,marker="o",color="red")

In [None]:
# NOTE(review): the line that loads `algeria` is commented out, so `algeria` must
# already exist in the kernel for this cell to run — restore the load or confirm its source.
#algeria=gpd.read_file("D:dynamic segmentation/algeria census/data/algeria_administrative_level_data/dza_admbnda_adm1_unhcr_20200120.shp")

# Index the administrative areas by their ADM1_EN name so they can be selected by label.
data_fruital=algeria.set_index("ADM1_EN")

# Area names to keep; .loc with a list raises KeyError if any label is missing from the index.
# NOTE(review): verify each spelling against the ADM1_EN values of the shapefile.
fruital=["Alger",'Tizi Ouzou','Boumerdes','Blida','Medea','Tipaza','Bouira',"Bordj Bou Arrer",'Ain-Defla','Djelfa','Ghardaia','Laghouat','Tamanrasset',"M'Sila",'Chlef','Ouargla']
data_fruital=data_fruital.loc[fruital]
data_fruital=data_fruital.reset_index()

In [None]:
from matplotlib import pyplot as plt
fig, ax = plt.subplots(figsize=(10, 10)) 
sub_set = gpd.sjoin(algeria,ta_gdf, op="intersects")
sub_set.plot(ax=ax, color='None', lw=0.5)
c=data_fruital.plot(column='ADM1_EN',
                ax=ax,color="darkred")
algeria.geometry.boundary.plot(color=None,edgecolor='k',linewidth = 1,ax=ax) 
sub_set2=sub_set.merge(temp, how='inner')
#gpd.GeoDataFrame(sub_set, 
#                 geometry=gpd.points_from_xy(sub_set["location_lon"],sub_set["location_lat"]), 
#                 crs="epsg:4326").plot(ax=ax,marker="o",color="red",column="id")
for x, y, label in zip(sub_set2.geometry.centroid.x, sub_set2.geometry.centroid.y, sub_set2["count poi horeca"]):
    ax.annotate(label, xy=(x, y), xytext=(1, 1),textcoords="offset points")

In [None]:
temp=sub_set.groupby("ADM1_EN")['id'].count().\
    reset_index().\
    rename(columns={"id":"count poi horeca"}).\
    sort_values(by="count poi horeca",ascending=False)

In [None]:
sub_set

In [None]:
# Build a point GeoDataFrame (EPSG:4326) from the location_lon / location_lat columns of `ta`.
# NOTE(review): `ta` is not defined in any visible cell, and `ta_gdf` is used by earlier cells — confirm source and cell order.
ta_gdf = gpd.GeoDataFrame(ta, geometry=gpd.points_from_xy(ta["location_lon"], ta["location_lat"]), crs="epsg:4326")

In [None]:
sub_set.shape

In [None]:
import pandas
data = pandas.read_csv("D:/data_quality/data/customer_invoice_tizi_ouzou.csv")

In [None]:
# Coarse channel -> detailed 'Détail Canal' labels. Labels that are not listed keep
# their original value. 'Frui-CREMERIE' is listed once, under HORECA: it used to be
# repeated under OTHER as well, where it could never match because the HORECA
# replacement had already been applied.
channel_groups = {
    'AG': ['Frui-ALIMENTATION GE', 'Frui-SUPERETTE ET LI'],
    'HORECA': ['Frui-CREMERIE', 'Frui-RESTAURANT / RO', 'Frui-CAFE/CAFETERIA/', 'Frui-NIGHT CLUB', 'Frui-HOTELS'],
    'SNACK': ['Frui-FAST FOOD / PIZ', 'Frui-PIZZERIA'],
    'OTHER': ['Frui-DOUCHE', 'Frui-BUREAUX DE TABA', 'Frui-MOUKASSIRAT', "Frui-PATISSERIES", 'Frui-FOYER', "Frui-LOISIR", 'Frui-SALLE DES FETES', 'Frui-MDN', "Frui-loisir", "Frui-ADMINISTRATION", "Frui-CYBER CAFE"],
}
# Flatten to a label -> channel lookup and apply it in a single pass.
channel_lookup = {label: channel for channel, labels in channel_groups.items() for label in labels}
data['CHANNEL_CUSTOM'] = data['Détail Canal'].replace(channel_lookup)

In [None]:
data=data.loc[~data["Classification client"].isin(["Platinum","Prestigieux"])]
data=data.loc[~data["CHANNEL_CUSTOM"].isin(["AG"])]


pandas.crosstab(data["Classification client"], data.CHANNEL_CUSTOM, margins=True)

## SIMILARITY ANALYSIS   : features importances