In [1]:
# This notebook updates the current buyer-name clusters.
# It assumes an existing name-to-cluster mapping, loaded from cluster_mapping_04192018.csv.
# It then applies that mapping plus any manual corrections, and writes a new file with the resulting cluster for each row.
%matplotlib inline
import re
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt

import datetime
from dateutil import parser
from geopandas import GeoDataFrame
import pandas as pd
import numpy as np
from shapely.geometry import Point

from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN



In [2]:
# Read the existing cluster mapping from disk; the next cell reads its
# 'Buyer Name' and 'cluster' columns.
cluster_mapping_path = '/Users/sergiocamelo/Dropbox/Sergio-Joann/Names/Decisions/cluster_mapping_04192018.csv'
data_clusters = pd.read_csv(cluster_mapping_path)

In [3]:
# Build the lookup table `clusters`: normalised buyer name -> cluster id.
names = data_clusters['Buyer Name'].values
keys = data_clusters['cluster'].values

# Normalise every name in one pass: cast to str, lower-case, trim the ends and
# collapse each run of whitespace to a single space.
# The pattern is a raw string because '\s' in a plain literal is an invalid
# escape sequence (a warning on modern Python).
# Note: str() turns a missing value (NaN) into the literal key 'nan'.
names_l = [re.sub(r'\s+', ' ', str(raw_name).lower().strip()) for raw_name in names]

# Alias of names_l; not used by the visible cells, kept so any other cell that
# reads it keeps working.
names_s = names_l

# When two raw names normalise to the same key, the later row wins (same as
# the previous index-based loop).
clusters = dict(zip(names_l, keys))

In [4]:
# Load the buyer dataset and compare its buyer names against the existing mapping.
buyer_data = pd.read_csv('/Users/sergiocamelo/Dropbox/Sergio-Joann/20180412_Unilever_Main_Buyer_Progress_Data-CLEAN.csv')
buyer_data = buyer_data[['Member ID','Buyer Type','Buyer Name','Plot Nr']].copy()
# index=str relabels the row index with its string form.
# NOTE(review): 'FFB Last Sold Date' is not among the four columns selected
# above, so its entry in this mapping currently has no effect.
buyer_data = buyer_data.rename(index=str, columns={'Member ID':"id", 'Buyer Type':"type", "Buyer Name":"buyer_name", 'Plot Nr':"plot", 'FFB Last Sold Date':"last_sold"})

# Same normalisation as the mapping keys: str -> lower -> strip -> collapse
# whitespace. Raw string avoids the invalid '\s' escape in a plain literal.
all_names = [re.sub(r'\s+', ' ', str(raw_name).lower().strip())
             for raw_name in buyer_data['buyer_name'].values]

# Build each set once instead of rebuilding it for every difference.
dataset_name_set = set(all_names)
mapped_name_set = set(names_l)

# Names that occur in the dataset but have no entry in the mapping.
print(list(dataset_name_set - mapped_name_set))
# Names that have a mapping entry but do not occur in the dataset.
print(list(mapped_name_set - dataset_name_set))

# NOTE(review): this keeps the original value (mapping names absent from the
# dataset). If "new names" is meant to be dataset names that are not mapped
# yet, the operands should be swapped -- confirm the intent before relying on it.
new_names = list(mapped_name_set - dataset_name_set)

['irwan simamora (spd) (nhr)', 'skip', 'danru', 'tidak tahu nama do, toke pak jukri (pks nhr)', 'irwan simamora (sbd)(nhr)', 'ggk ( gagok)', 'yp', 'sbd ( pabrik nhr )', 'ngamprah', 'do rgn', 'lagino', 'tj(pabrik skip)', 'irwan simamora (spd)(nhr)', 'saritok', 'gagok( ggk)', 'ramp blok e', 'tj rudi', 'sijul', 'cv. 3ym', 'ggk (gagok)', 'koperasi danau rambai', 'danau rambai', 'rgn', 'buah sendiri', 'do', 'bim', 'akhhi', 'kud sember bahagia', 'gagok (ggk)', 'do rgn (suroso)', 'pn ( pelangi nusantara)', 'sup do yp', 'ggk udin', 'pt bim', 'tj ( dandru )', 'tratama', 'mustar', 'asiong', 'sarito', 'pt. nat', 'nuriyanto', 'pt.harpena', 'nhr. skip', 'pabrik bim', 'pt. nad', 'spb tj', 'tinus', 'metto', 'yp(pabrik nhr)', 'sk (irwan)', 'jon. s', 'koperasi sejahtera', 'yp sei akar (pt.nhr siberida)', 'cv 3 ym', 'udin ggk', 'jhon simarmata', 'satpam pabrik (tj)', 'abp', 'sukat abadi', 'kud', '(null)', 'rj', 'darmin', 'pt. arvena', 'sk 01', 'do samson', 'pt. inecda', 'nts', 'jl', 'koperasi sumber bah

In [5]:
# For the moment I am only considering 3 - Middleman/Agent

In [6]:
# For each row of the buyers file, I get the cluster and then construct a dataset with this
def cluster_row(row, mapping=None):
    """Return the cluster id for one buyer row, or NaN when the name is unmapped.

    The value of ``row['buyer_name']`` is cast to ``str``, lower-cased,
    stripped, and has every run of whitespace collapsed to a single space
    before it is looked up.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'buyer_name'`` (e.g. a DataFrame row).
    mapping : dict, optional
        Normalised name -> cluster id. Defaults to the notebook-level
        ``clusters`` dictionary, so ``df.apply(cluster_row, axis=1)`` keeps
        working unchanged.

    Returns
    -------
    The mapped cluster id, or ``float('nan')`` if the name is not a key.
    """
    if mapping is None:
        mapping = clusters
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    name = re.sub(r'\s+', ' ', str(row['buyer_name']).lower().strip())
    return mapping.get(name, float('nan'))

# Evaluate cluster_row once per row (axis=1), then attach the result as the
# 'cluster' column.
row_clusters = buyer_data.apply(cluster_row, axis=1)
buyer_data['cluster'] = row_clusters

In [8]:
# Load the manual corrections and rename their key columns to match buyer_data.
corrections_file = pd.read_csv('/Users/sergiocamelo/Dropbox/Sergio-Joann/Names/Decisions/Changes/180520_cluster_changes_comb.csv')
corrections_file = corrections_file.rename(index=str, columns={'Farmer_ID':"id", 'Farmer_plot':"plot"})

# Left join: keep every buyer row and attach a correction where one matches.
# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): presumably that is just 'id' and 'plot' -- verify against the
# corrections file's columns.
corrected_file = pd.merge(buyer_data, corrections_file, how='left')

# Largest cluster id before any correction; 'new_cluster' values are added to
# it when they are written into 'cluster'.
max_cluster = corrected_file.cluster.max()

# Apply the corrections column-wise instead of looping over rows.
# 'change_cluster' takes precedence; 'new_cluster' is only used for rows
# without a 'change_cluster' (same priority as the former if/elif).
has_change = corrected_file['change_cluster'].notnull()
has_new = corrected_file['new_cluster'].notnull() & ~has_change
corrected_file.loc[has_change, 'cluster'] = corrected_file.loc[has_change, 'change_cluster']
corrected_file.loc[has_new, 'cluster'] = corrected_file.loc[has_new, 'new_cluster'] + max_cluster

# Keep only the output columns.
corrected_file = corrected_file[['id','type','buyer_name','plot','cluster']]

IOError: File /Users/sergiocamelo/Dropbox/Sergio-Joann/Names/Decisions/180520_cluster_changes_comb.csv does not exist

In [79]:
# Write the corrected mapping to CSV, leaving the DataFrame index out of the file.
farmer_cluster_path = '/Users/sergiocamelo/Dropbox/Sergio-Joann/Names/Decisions/farmer_cluster_mappings_04222018.csv'
corrected_file.to_csv(farmer_cluster_path, index=False)

In [80]:
# Display the corrected mapping (the same frame the previous cell wrote to CSV).
corrected_file

Unnamed: 0,id,type,buyer_name,plot,cluster
0,F14040190001,3 - Middleman/Agent,Ijul,1,373.0
1,F14040190002,3 - Middleman/Agent,Hasri,1,181.0
2,F14040190003,3 - Middleman/Agent,Sam,1,20.0
3,F14040190004,3 - Middleman/Agent,Asri,1,181.0
4,F14040190005,3 - Middleman/Agent,Amin,1,175.0
5,F14040190006,3 - Middleman/Agent,Ujang,1,150.0
6,F14040190007,3 - Middleman/Agent,Giran,1,350.0
7,F14040190009,3 - Middleman/Agent,Kamin,1,175.0
8,F14040190010,3 - Middleman/Agent,Masri,1,181.0
9,F14040190012,3 - Middleman/Agent,Agus,1,108.0
