In [1]:
#%matplotlib inline
import datetime
from dateutil import parser
import math
from dateutil.parser import parse
from geopandas import GeoDataFrame
import pandas as pd
import numpy as np
from shapely.geometry import Point
import json
import ast
import sys
import pickle
from __future__ import print_function
import os
import time
import collections
import copy

os.chdir("/Users/sergiocamelo/Dropbox/Sergio-Joann/")

sys.path.insert(0, 'Code/VRPEngine/pyCode')
sys.path.insert(0, 'Code/VRPEngine/C++Engine')
sys.path.insert(0, 'Code/VRPEngine/pyCode/tsp')

import solver as solver
import distances as distances
import VRPClass

In [2]:
results_folder = 'Results'
data_folder = results_folder+st+"/"

# The folder which will be used to write the bash scripts
folder_project = ''

In [29]:
# Parameters for the run
season = "high"
period = 3*28 # Use three month period
days_analysis = 14
st = 'Jan132018'
farmers_data = 'dim_farmers_sample.csv'

In [31]:
# Create random timestamped folder
os.mkdir(results_folder+"_"+st+"/")


OSError: [Errno 17] File exists: 'Results/Jan132018/'

In [4]:
folder_data_cleaning_results = 'data_cleaning_results' + st
try:
    os.mkdir(results_folder+"_"+st+"/"+folder_data_cleaning_results+'/')
except:
    print("Folder already created")
f = open('StandardizedData/%s/report.txt'%(folder_data_cleaning_results), 'w+')


Folder already created


In [5]:
open(data_folder + '/errors.txt', 'a').close()

In [6]:
season_volume = season + '_volume'

In [7]:
# Load datasets
dim_farmers = pd.read_csv(data_folder+farmers_data)
dim_middlemen = pd.read_csv(data_folder+'dim_middlemen.csv')
dim_mills = pd.read_csv(data_folder+'dim_mills.csv')
harvest_frequency_mapping = pd.read_csv(data_folder+'harvest_frequency_mapping.csv')
print("Number of plantations: %d" % (len(dim_farmers)),file=f)
print("Number of middlemen: %d" % (len(dim_middlemen)),file=f)

In [8]:
# Create a unique identifier for the farmer plot
dim_farmers['plot_id'] = [dim_farmers['farmer_id'][i] + '-'+str(dim_farmers['plot_number'][i]) for i in range(len(dim_farmers))]

In [9]:
# Delete repetitions
old_len = len(dim_farmers)
dim_farmers = copy.deepcopy(dim_farmers[-dim_farmers['plot_id'].duplicated()]).reset_index(drop=True)
print("Found %d repetitions"%(old_len-len(dim_farmers)))

Found 0 repetitions


In [10]:
# Create a dictionary of the farmers to cluster
farmer_to_cluster = dict(zip(dim_farmers['plot_id'],dim_farmers['cluster_id']))

In [11]:
# Calculate middleman capacity
dim_middlemen['trucks_dict']  = dim_middlemen['trucks'].map(lambda d:ast.literal_eval(d))
dim_middlemen['capacity'] = dim_middlemen['trucks_dict'].map(lambda d:np.sum([int(t)*d[t] for t in d.keys()]))

In [12]:
# Join farmers and middlemen data
# Change names of columns
dim_farmers = dim_farmers.rename(index=str, columns={"latitude": "latitude_farmer", "longitude": "longitude_farmer"})
dim_middlemen = dim_middlemen.rename(index=str, columns={"latitude": "latitude_middleman", "longitude": "longitude_middleman"})
dim_mills = dim_mills.rename(index=str, columns={"latitude": "latitude_mill", "longitude": "longitude_mill"})

result = pd.merge(dim_farmers, dim_middlemen, on=['cluster_id'], how='inner')
print("Number of plantations with middleman: %d" % (len(result)),file=f)

In [13]:
# Check if there are any duplicates
result[result.duplicated(subset=['farmer_id','plot_number'], keep=False)].to_csv(data_folder+folder_data_cleaning_results+'/duplicates.csv')
print("Total of duplicates: %d" % (len(result[result.duplicated(subset=['farmer_id','plot_number'], keep=False)])),file=f)
print ("Duplicates saved in data_cleaning_results/duplicates.csv",file=f)

In [14]:
#Use only data with lat_lon and with productions
df_full = result[np.logical_and(pd.notnull(result['longitude_farmer']),pd.notnull(result['latitude_farmer']))].copy()
print("Number of plantations with latlon: %d" % (len(df_full)),file=f)
# Use data with productions
df_full = df_full[df_full[season+'_rate']!=0].copy()
df_full = df_full[pd.notnull(df_full[season+'_rate'])].copy()
df_full = df_full[df_full[season+'_volume']!=0].copy()
print("Number of plantations that produce during the season : %d" % (len(df_full)),file=f)

In [15]:
# Map number of days
harvest_frequency_mapping_dict = {row[0]:row[1] for i,row in harvest_frequency_mapping.iterrows()}
df_full['rate'] = df_full[season+'_rate'].map(harvest_frequency_mapping_dict)

In [16]:
# Has pickup date 
df_full = df_full[pd.notnull(df_full['date_last_sold'])].copy()
print("Number of plantations with last date: %d" % (len(df_full)),file=f)

In [17]:
# Generate days of pickup
ref_day = datetime.datetime.strptime('1/3/2000', "%m/%d/%Y")
days = np.array([(parse(v)-ref_day).days for v in df_full['date_last_sold'].values])
df_full['day_mod'] = days%period
def calculate_pickup_days(row):
    d = row['day_mod']
    freq = row['rate']
    l = []
    for i in range(int(period/freq)):
        l.append((d + i * freq)%period)
    return l
df_full['pickup_days'] = df_full.apply(calculate_pickup_days, axis=1)

In [18]:
# Explote data
clusters = np.unique(df_full['cluster_id'])
df_exploted = pd.merge(df_full,df_full.pickup_days.apply(pd.Series).stack().reset_index(level=1, drop=True).to_frame('pickup'),left_index=True, right_index=True)
for c in clusters:
    print("Number of plantations of cluster %d: %d" % (c,len(df_full[df_full['cluster_id'] == c].copy())),file=f)
df_clusters = df_exploted[(np.array([c in clusters for c in df_exploted['cluster_id']])) & (df_exploted['pickup'] < days_analysis)].copy()

In [19]:
# Print middlemen data in the results folder
dim_middlemen[[dim_middlemen['cluster_id'][i] in clusters for i in range(len(dim_middlemen))]][['cluster_id','trucks','mills','capacity']].to_csv(data_folder+folder_data_cleaning_results+"/middlemen_data.csv", index=False)

In [20]:
# Calculate total capacity
dict_comparisons={}
for c in clusters:
    dict_comparisons[c] = {}
    dict_comparisons[c]['capacity'] = dim_middlemen[dim_middlemen.cluster_id == c]['capacity'].iloc[0]

In [21]:
# Round producing quantities to the decimal up
df_clusters[season + '_volume'] = np.ceil(df_clusters[season + '_volume'] * 10)/10

In [22]:
# Number of plantations picked up each day and quantities picked up each day
# Calculate the number of days
agg_quant = df_clusters.groupby(['cluster_id','pickup']).agg({'farmer_id':'count', season+'_volume': 'sum'})
agg_quant['overload']=agg_quant[season+'_volume']-agg_quant.apply(lambda r:dict_comparisons[r.name[0]]['capacity'],1)
outliers = (agg_quant[agg_quant['overload']>0])
print(outliers)

                   farmer_id  high_volume  overload
cluster_id pickup                                  
13.0       0.0            13         23.6       3.6
           1.0             5         20.4       0.4
           3.0            10         21.6       1.6
           13.0            7         21.4       1.4
41.0       6.0             4         14.7       4.7
46.0       2.0             6         23.4      12.4
           6.0             8         26.1      15.1
           9.0             8         32.7      21.7
51.0       9.0             5         10.3       1.3
           13.0            5         13.1       4.1
60.0       2.0             8         18.6       0.6
           3.0             6         23.7       5.7
           4.0            11         20.4       2.4
           7.0             9         25.8       7.8
           8.0             8         41.5      23.5
108.0      2.0            13         21.2       3.2
199.0      5.0             3         14.0       3.0
241.0      1

In [23]:
# Create a report of inconsistent data
report = []
for index,row in outliers.iterrows():
    report.append({'cluster':index[0],
                   'farmers':row['farmer_id'],
                         season+'_volume':row[season+'_volume'],
                         'capacity':row[season+'_volume']-row['overload'],
                        'farmer_id-plot':[r['farmer_id']+'-'+str(r['plot_number']) for j,r in df_clusters.iterrows() if (r['cluster_id']==index[0] and r['pickup']==index[1])]})
print("Found %d trucks carrying more than their capacity" % len(outliers),file=f)
pd.DataFrame(report).to_csv(data_folder+folder_data_cleaning_results+'/overcapacity.csv')
print("Report saved in data_cleaning_results/overcapacity.csv",file=f)

In [24]:
# Create a CSV that stores the plot id, the quantity, the rate and the pickup-days
summarized_dict = collections.defaultdict(dict)
for i,row in df_clusters.iterrows():
    if 'farmer_id' in summarized_dict[row['plot_id']]:
        assert(summarized_dict[row['plot_id']]['farmer_id'] == row['farmer_id'])
    else:
        summarized_dict[row['plot_id']]['farmer_id'] = row['farmer_id']

    if 'rate' in summarized_dict[row['plot_id']]:
        assert(summarized_dict[row['plot_id']]['rate'] == row['rate'])
    else:
        summarized_dict[row['plot_id']]['rate'] = row['rate']
        
    if 'volume' in summarized_dict[row['plot_id']]:
        assert(summarized_dict[row['plot_id']]['volume'] == row[season+"_volume"])
    else:
        summarized_dict[row['plot_id']]['volume'] = row[season+"_volume"]
        
    if 'cluster_id' in summarized_dict[row['plot_id']]:
        if (row['farmer_id'] not in ['F14020080487','F14020080660','F14020080219','F14020080253']):
            assert(summarized_dict[row['plot_id']]['cluster_id'] == row['cluster_id'])
    else:
        summarized_dict[row['plot_id']]['cluster_id'] = row['cluster_id']
        
    summarized_dict[row['plot_id']]['latitude'] = row['latitude_farmer']
    summarized_dict[row['plot_id']]['longitude'] = row['longitude_farmer']
    
    if 'pickups' not in summarized_dict[row['plot_id']]:
        summarized_dict[row['plot_id']]['pickups'] = [row['pickup']]
    else:
        summarized_dict[row['plot_id']]['pickups'] = summarized_dict[row['plot_id']]['pickups'] + [row['pickup']]
        
pd.DataFrame(summarized_dict).transpose().to_csv(data_folder+folder_data_cleaning_results+'/dim_pickups.csv',index_label = "plot_id")

In [25]:
# Create trucks dataset (Only run once)
truck_dicts = []
j = 0
for i,r in dim_middlemen.iterrows():
    truck_dic = ast.literal_eval(r['trucks'])
    for capacity in truck_dic.keys():
        for i in range(truck_dic[capacity]):
            truck_dicts.append({
                    "cluster_id":r['cluster_id'],
                    "truck_id":"t_"+str(j),
                    "capacity":int(capacity)
                })
            j += 1
pd.DataFrame(truck_dicts).to_csv(data_folder+folder_data_cleaning_results+"/dim_trucks.csv",index=False)

In [26]:
# Load the trucks dataset
dim_trucks = pd.read_csv(data_folder+folder_data_cleaning_results+"/dim_trucks.csv")

In [27]:
# Create a dataset with code to position
positions_dict = {}
for i, row in dim_mills.iterrows():
    positions_dict[row["code"]] = (row['latitude_mill'], row['longitude_mill'])
for i, row in dim_farmers.iterrows():
    positions_dict[row["farmer_id"]+'-'+str(row["plot_number"])] = (row['latitude_farmer'], row['longitude_farmer'])
for i, row in dim_middlemen.iterrows():
    positions_dict[row["cluster_id"]] = (row['latitude_middleman'], row['longitude_middleman'])
for i, row in dim_trucks.iterrows():
    positions_dict[row["truck_id"]] = positions_dict[float(row["cluster_id"])]
unique_id_to_latlon = pd.DataFrame(positions_dict).transpose()
unique_id_to_latlon.columns = ['latitude', 'longitude']
unique_id_to_latlon.to_csv(data_folder+folder_data_cleaning_results+"/unique_id_to_latlon.csv")

In [28]:
f.close()