In [14]:
# -------------------------- LIBRARIES NECESSARY IN THIS PROJECT  -------------------------- #

import pandas as pd
import numpy as np
import seaborn as sns
from datetime import *
import zipfile
import pylab as plt

# -------------------------- FUNCTIONS -------------------------- #

def getDataSet(data_path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    data_path : str, path object or file-like object
        Anything accepted by ``pandas.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed content, read with the default ``read_csv`` options.
    """
    data_frame = pd.read_csv(data_path)
    return data_frame

def _plotColumnPercentages(percentages):
    """Bar-plot and print a per-column percentage Series.

    Columns whose percentage is not strictly positive are dropped and the
    remaining ones are ordered from the smallest to the largest value.

    Parameters
    ----------
    percentages : pandas.Series
        Percentage values indexed by column name.
    """
    percentages = percentages[percentages > 0].sort_values()

    summary = percentages.to_frame()
    summary.columns = ['Percentual']
    summary.index.names = ['Name']
    summary['Column'] = summary.index

    # Nothing to draw when no column qualifies (e.g. a dataset without nulls
    # or an empty dataset); the (empty) table is still printed below.
    if not summary.empty:
        sns.set(style="whitegrid", color_codes=True)
        sns.barplot(x='Column', y='Percentual', data=summary)
        plt.xticks(rotation=90)
        plt.show()
    print(summary)


def plotNaNValuesColums(dataSet):
    """Plot and print the percentage of null values per column.

    Only the columns that contain at least one null value are shown.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        The frame to inspect.
    """
    null_percentages = (dataSet.isnull().sum() / len(dataSet)) * 100
    _plotColumnPercentages(null_percentages)


def plotNotNaNValuesColums(dataSet):
    """Plot and print the percentage of non-null values per column.

    Only the columns that contain at least one non-null value are shown.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        The frame to inspect.
    """
    filled_percentages = (dataSet.notnull().sum() / len(dataSet)) * 100
    _plotColumnPercentages(filled_percentages)

## Load datasets

### Members dataset cleansing

In [15]:
# -------------------------- PATH OF THE DATASETS USED IN THE PROJECT  -------------------------- #
# Read main.csv directly from the zip archive. Both the archive and the member
# stream are opened in a `with` statement so their file handles are closed as
# soon as the frame has been loaded.
with zipfile.ZipFile('../dataSets/main.zip') as zf, zf.open('main.csv') as main_csv:
    main_data_set_all_fields = pd.read_csv(main_csv)

## Data manipulation and cleansing

In [16]:
# Check the dataSet fields: prints each column's name, non-null count and dtype
main_data_set_all_fields.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              32950 non-null  int64  
 1   member_key              32950 non-null  object 
 2   updated_at              32950 non-null  object 
 3   first_name              32950 non-null  object 
 4   last_name               32950 non-null  object 
 5   disabilities            8383 non-null   object 
 6   SA1                     3751 non-null   float64
 7   price_zone_code         32950 non-null  object 
 8   plan_key                32950 non-null  object 
 9   plan_status             32950 non-null  object 
 10  plan_start_date         32950 non-null  object 
 11  plan_end_date           32950 non-null  object 
 12  budget_level3_name      840 non-null    object 
 13  budget_level3_key       840 non-null    object 
 14  budget_level2_key       32950 non-null

In [17]:
# Preview the first 4 rows of the raw dataset
main_data_set_all_fields.head(4)

Unnamed: 0.1,Unnamed: 0,member_key,updated_at,first_name,last_name,disabilities,SA1,price_zone_code,plan_key,plan_status,...,budget_level1_key,budget_level1_name,invoice_state,claim_state,allocated_amount,requested_amount,funded_amount,allocated_amount_total,requested_amount_total,funded_amount_total
0,0,00109970-7029-11eb-81d6-9d4df94b6224,2021-04,Wynny,Pressman,,,ACT_NSW_QLD_VIC,588427f1-e45d-4a6e-b307-9c68a13849ef,PLAN_DELIVERY_ACTIVE,...,58745466-f597-11e9-bfb4-022d4762bb3c,Core,ALL_PAID,PAID,50000.0,98.8,98.8,272168.34,11316.7,11316.7
1,1,00109970-7029-11eb-81d6-9d4df94b6224,2021-04,Wynny,Pressman,,,ACT_NSW_QLD_VIC,588427f1-e45d-4a6e-b307-9c68a13849ef,PLAN_DELIVERY_ACTIVE,...,58745466-f597-11e9-bfb4-022d4762bb3c,Core,ALL_PAID,PAID,222168.34,11217.9,11217.9,272168.34,11316.7,11316.7
2,2,0010f240-3b5d-11eb-aa73-7be7bbd3c758,2021-04,Beryl,McDuff,Asperger syndrome,3138648.0,ACT_NSW_QLD_VIC,ef286fe8-8b36-4d77-97a0-59f9ef3df9b8,PLAN_DELIVERY_ACTIVE,...,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,ALL_PAID,PAID,9801.6,2559.6384,2559.67,9801.6,2559.6384,2559.67
3,3,00117a50-cbc6-11ea-805a-75fc51d39a77,2021-04,Jethro,Perchard,,,ACT_NSW_QLD_VIC,8aa26238-d922-4963-bcaf-427d24a610bc,COMPLETED,...,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,ALL_PAID,PAID,64524.0,8503.65,8503.65,64524.0,8503.65,8503.65


In [18]:
# Replace every NaN with the string 'NA' and normalise `updated_at` to a
# 'YYYY-MM' string.
# NOTE(review): the replacement is applied to every column, so a numeric column
# that contains NaN ends up holding a mix of numbers and 'NA' strings - confirm
# that none of the numeric columns used further down has missing values.
main_data_No_NaN = (
    main_data_set_all_fields
    .replace(np.nan, 'NA')
    .assign(updated_at=lambda frame: pd.to_datetime(frame['updated_at']).dt.strftime('%Y-%m'))
)

In [19]:
# Select just the fields to be used.
# The columns are given as a list: indexing a DataFrame with a set is deprecated
# since pandas 1.5 and raises a TypeError in pandas 2.x, and a set also leaves
# the resulting column order undefined.
main_data_set = main_data_No_NaN[[
    "member_key",
    "first_name",
    "last_name",
    "updated_at",
    "budget_level1_key",
    "budget_level1_name",
    "plan_start_date",
    "plan_end_date",
    "plan_status",
    "funded_amount",
    "allocated_amount",
    "allocated_amount_total",
]]

In [20]:
# Keep only the rows of plans currently being delivered that have a positive
# funded amount.
is_active_plan = main_data_set["plan_status"] == 'PLAN_DELIVERY_ACTIVE'
has_funded_amount = main_data_set["funded_amount"] > 0
main_data_set = main_data_set[is_active_plan & has_funded_amount]

In [21]:
# Distinct (member, plan start, plan end) combinations, sorted by those keys.
# Columns are selected with a list: set indexers are deprecated since pandas 1.5
# and rejected by pandas 2.x, and a set leaves the column order undefined.
plan_period_columns = ["member_key", "plan_start_date", "plan_end_date"]
months_total = (
    main_data_set[plan_period_columns]
    .drop_duplicates()
    .sort_values(plan_period_columns)
    .reset_index(drop=True)
)

# Parse each date column once, then count the calendar months between the plan
# start and the plan end (the day of the month is ignored).
plan_start = pd.to_datetime(months_total["plan_start_date"])
plan_end = pd.to_datetime(months_total["plan_end_date"])
months_total["plan_months_total"] = (
    (plan_end.dt.year - plan_start.dt.year) * 12
    + (plan_end.dt.month - plan_start.dt.month)
)

# One row per distinct (member, plan length) pair.
member_plan_months = (
    months_total[["member_key", "plan_months_total"]]
    .drop_duplicates()
    .sort_values(["member_key", "plan_months_total"])
    .reset_index(drop=True)
)

# Attach the plan length to every row of the working dataset.
# NOTE(review): the join key is member_key only, so a member with plans of
# different lengths gets each of their rows repeated once per length - confirm
# that every member has a single plan length at this point.
months_total_merged = pd.merge(main_data_set, member_plan_months, how="left", on=["member_key"])

In [22]:
# Total allocated amount per member, budget category and plan length.
allocated_amount_month = (
    months_total_merged
    .groupby(["member_key", "budget_level1_key", "plan_months_total"])
    .agg({"allocated_amount": "sum"})
    .reset_index()
)

# Spread the allocation evenly over the months of the plan.
# NOTE(review): a plan that starts and ends in the same calendar month has
# plan_months_total == 0, which makes this division yield inf/NaN.
allocated_amount_month["allocated_amount_month"] = (
    allocated_amount_month["allocated_amount"] / allocated_amount_month["plan_months_total"]
)

# Columns are selected with a list: set indexers are deprecated since pandas 1.5
# and rejected by pandas 2.x.
allocated_amount_month = allocated_amount_month[["member_key", "budget_level1_key", "allocated_amount_month"]]

# Attach the monthly allocation to every row of the working dataset.
months_total_merged_monthly = pd.merge(
    months_total_merged,
    allocated_amount_month,
    how="left",
    on=["member_key", "budget_level1_key"],
)

In [23]:
# Distinct (member, budget category, plan start) combinations, sorted by those
# keys. Columns are selected with a list: set indexers are deprecated since
# pandas 1.5 and rejected by pandas 2.x.
plan_start_columns = ["member_key", "budget_level1_key", "plan_start_date"]
month_today = (
    main_data_set[plan_start_columns]
    .drop_duplicates()
    .sort_values(plan_start_columns)
    .reset_index(drop=True)
)

# Calendar months elapsed between the plan start and the day the notebook runs.
# "today" is evaluated once so the year and month parts come from the same
# instant; note that the result depends on the execution date.
today = pd.to_datetime("today")
plan_start = pd.to_datetime(month_today["plan_start_date"])
month_today["month_actual"] = (
    (today.year - plan_start.dt.year) * 12
    + (today.month - plan_start.dt.month)
)

month_today = month_today[["member_key", "budget_level1_key", "month_actual"]]

# NOTE(review): a (member, budget category) pair with several plan start dates
# has several rows in month_today and therefore repeats rows in this join.
month_actual = pd.merge(months_total_merged_monthly, month_today, how="left", on=["member_key", "budget_level1_key"])

# Spending expected so far if the allocation were consumed evenly every month.
month_actual["spent_amount_predicted"] = month_actual["allocated_amount_month"] * month_actual["month_actual"]

# Keep rows with a positive allocation. The explicit copy makes the result an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
month_actual = month_actual.loc[month_actual["allocated_amount"] > 0].copy()

In [24]:
# Express the predicted spending and the funded amount as a percentage of the
# row's allocated amount. `assign` returns a new frame, which avoids the
# SettingWithCopyWarning raised when columns are added to a filtered slice.
month_actual = month_actual.assign(
    spent_amount_predicted_percentage=lambda frame: (frame['spent_amount_predicted'] / frame['allocated_amount']) * 100,
    funded_amount_percentage=lambda frame: (frame['funded_amount'] / frame['allocated_amount']) * 100,
)

In [25]:
# Keep only the plans that span at least this many months.
MIN_PLAN_MONTHS = 12
covers_min_months = month_actual["plan_months_total"] >= MIN_PLAN_MONTHS
month_actual = month_actual[covers_min_months]

In [26]:
# Aggregate to one row per member and budget category.
# NOTE(review): the two percentage columns are summed across rows, so a group
# made of several rows can exceed 100% - confirm this is the intended metric.
summary_keys = [
    "member_key",
    "first_name",
    "last_name",
    "budget_level1_key",
    "budget_level1_name",
]
summed_columns = {
    "funded_amount": "sum",
    "spent_amount_predicted_percentage": "sum",
    "funded_amount_percentage": "sum",
}
main_data_set = month_actual.groupby(summary_keys).agg(summed_columns).reset_index()

main_data_set

Unnamed: 0,member_key,first_name,last_name,budget_level1_key,budget_level1_name,funded_amount,spent_amount_predicted_percentage,funded_amount_percentage
0,00109970-7029-11eb-81d6-9d4df94b6224,Wynny,Pressman,58745466-f597-11e9-bfb4-022d4762bb3c,Core,11316.70,55.570178,5.246879
1,0010f240-3b5d-11eb-aa73-7be7bbd3c758,Beryl,McDuff,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2559.67,66.666667,26.114818
2,0011ebe0-7722-11eb-9ade-8f3368e47951,Eamon,Gammage,58745466-f597-11e9-bfb4-022d4762bb3c,Core,3629.66,33.333333,3.703735
3,0011ebe0-7722-11eb-9ade-8f3368e47951,Eamon,Gammage,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,691.27,8.333333,3.835020
4,007c1090-d6ca-11ea-bb8a-a73bf14c75f9,Janean,Potzold,58745466-f597-11e9-bfb4-022d4762bb3c,Core,4213.97,678.800562,41.358973
...,...,...,...,...,...,...,...,...
8720,ff630c80-d094-11ea-847f-196f66acbe45,Mureil,Labb,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2069.29,91.666667,10.889372
8721,ff656e00-7fce-11eb-8e57-d94003868067,Jeramie,Burgot,58745466-f597-11e9-bfb4-022d4762bb3c,Core,204.36,16.666667,6.812000
8722,ff656e00-7fce-11eb-8e57-d94003868067,Jeramie,Burgot,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,750.25,16.666667,3.291461
8723,ffddfbc0-0861-11eb-92a0-2d24c44fa781,Allard,Postle,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,5508.97,66.666667,29.276512


In [27]:
# Classify each row by comparing the funded percentage with the predicted one:
#   2 -> both percentages are exactly equal
#   0 -> funded is below predicted
#   1 -> anything else (funded above predicted, or either value missing)
# NOTE(review): the equality test is an exact float comparison, so status 2 is
# only assigned when both values match bit for bit.
funded_pct = main_data_set['funded_amount_percentage']
predicted_pct = main_data_set['spent_amount_predicted_percentage']
main_data_set['spending_status'] = np.select(
    [funded_pct == predicted_pct, funded_pct < predicted_pct],
    [2, 0],
    default=1,
)

In [28]:
# Write the result as 'prediction.csv' inside a zip archive.
# NOTE(review): the frame's index is written too (no index=False), which shows
# up as an unnamed first column when the file is read back.
compression_opts = {
    'method': 'zip',
    'archive_name': 'prediction.csv',
}

main_data_set.to_csv(
    '../dataSets/prediction.zip',
    compression=compression_opts,
)