In [None]:
# -------------------------- LIBRARIES NECESSARY IN THIS PROJECT  -------------------------- #

import pandas as pd
import numpy as np
import seaborn as sns
from datetime import *
import zipfile
import pylab as plt

# -------------------------- FUNCTIONS -------------------------- #

def getDataSet(data_path):
    """Read the CSV located at ``data_path`` and return it as a DataFrame.

    ``data_path`` is handed to ``pandas.read_csv`` unchanged.
    """
    loaded_frame = pd.read_csv(data_path)
    return loaded_frame

def _plotColumnPercentages(dataSet, cell_flags):
    """Plot and print the per-column percentage of flagged cells.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Frame whose row count is the denominator of the percentage.
    cell_flags : pandas.DataFrame
        Boolean frame built from ``dataSet`` (e.g. ``dataSet.isnull()``);
        True cells are counted per column.

    Columns whose percentage is 0 are left out. The remaining columns are
    sorted by ascending percentage, drawn as a bar plot and printed.
    """
    percentages = (cell_flags.sum() / len(dataSet)) * 100
    # sort_values() returns a new Series, so no in-place mutation is needed.
    percentages = percentages[percentages > 0].sort_values()

    summary = percentages.to_frame(name='Percentual')
    summary.index.names = ['Name']
    # seaborn needs the column names as a regular column for the x axis.
    summary['Column'] = summary.index

    # Plots the graph
    sns.set(style="whitegrid", color_codes=True)
    sns.barplot(x='Column', y='Percentual', data=summary)
    plt.xticks(rotation=90)
    plt.show()
    print(summary)


def plotNaNValuesColums(dataSet):
    """Plot and print the percentage of null values for each column of
    ``dataSet`` that has at least one null value."""
    _plotColumnPercentages(dataSet, dataSet.isnull())


def plotNotNaNValuesColums(dataSet):
    """Plot and print the percentage of non-null values for each column of
    ``dataSet`` that has at least one non-null value."""
    _plotColumnPercentages(dataSet, dataSet.notnull())

## Load datasets

### Members dataset cleansing

In [None]:
# -------------------------- PATH OF THE DATASETS USED IN THE PROJECT  -------------------------- #
# Read main.csv straight out of the archive. The context managers close both
# the archive and the member stream once the frame is loaded, so no file
# handle stays open for the rest of the kernel session.
with zipfile.ZipFile('../dataSets/main.zip') as zf:
    with zf.open('main.csv') as main_csv:
        main_data_set_all_fields = pd.read_csv(main_csv)

## Data manipulation and cleansing

In [None]:
# Check the dataset fields: print the summary produced by DataFrame.info()
# for the raw frame loaded from main.csv.
main_data_set_all_fields.info()

In [None]:
# Preview the first 4 rows of the raw frame.
main_data_set_all_fields.head(4)

In [None]:
# Replace every missing value in the raw frame with the literal string 'NA'.
# NOTE(review): this applies to all columns, so a numeric column that contains
# NaN will end up holding 'NA' strings as well -- confirm that is intended.
main_data_No_NaN = main_data_set_all_fields.replace(to_replace=np.nan, value='NA')

# Reduce 'updated_at' to a year-month string (format '%Y-%m').
main_data_No_NaN = main_data_No_NaN.assign(
    updated_at=lambda frame: pd.to_datetime(frame['updated_at']).dt.strftime('%Y-%m')
)

In [None]:
# Select just the fields to be used.
# The selection is a list, not a set: pandas deprecated set indexers in 1.5
# and raises TypeError for them from 2.0, and a set has no defined order, so
# the resulting column order was not reproducible.
# "first_name" and "last_name" are currently excluded from the selection.
main_data_set = main_data_No_NaN[[
    "member_key",
    "updated_at",
    "budget_level1_key",
    "budget_level1_name",
    "plan_start_date",
    "plan_end_date",
    "plan_status",
    "funded_amount",
    "allocated_amount",
    "allocated_amount_total",
]]

In [None]:
# Keep only rows whose plan_status is PLAN_DELIVERY_ACTIVE and whose
# funded_amount is positive.
# funded_amount is coerced to numeric for the comparison: the NaN -> 'NA'
# replacement done when main_data_No_NaN is built can leave strings in the
# column, and comparing a string with 0 raises TypeError. Entries that cannot
# be parsed become NaN, fail the "> 0" test and are dropped.
main_data_set = main_data_set[
    (main_data_set["plan_status"] == 'PLAN_DELIVERY_ACTIVE')
    & (pd.to_numeric(main_data_set["funded_amount"], errors="coerce") > 0)
]

In [None]:
# One row per distinct (member_key, plan_start_date, plan_end_date).
# Column selections use lists: set indexers are deprecated in pandas 1.5 and
# raise TypeError from 2.0.
months_total = main_data_set[[
    "member_key",
    "plan_start_date",
    "plan_end_date",
]].groupby([
    "member_key",
    "plan_start_date",
    "plan_end_date",
]).count().reset_index()

# Parse each date column once instead of once per use.
plan_start_parsed = pd.to_datetime(months_total['plan_start_date'])
plan_end_parsed = pd.to_datetime(months_total['plan_end_date'])

# Plan length in calendar months: year difference * 12 plus month difference
# (the day of month is not taken into account).
months_total['plan_months_total'] = (
    (plan_end_parsed.dt.year - plan_start_parsed.dt.year) * 12
    + (plan_end_parsed.dt.month - plan_start_parsed.dt.month)
)

# One row per distinct (member_key, plan_months_total).
months_total_merged = months_total[[
    "member_key",
    "plan_months_total",
]].groupby([
    "member_key",
    "plan_months_total",
]).count().reset_index()

# Attach the plan length to every row of main_data_set.
# NOTE(review): the join key is member_key only, so a member with more than
# one distinct plan_months_total gets each of its rows repeated once per
# value -- confirm this is intended.
months_total_merged = pd.merge(main_data_set, months_total_merged, how="left", on=["member_key"])

In [None]:
# Total allocated amount per (member_key, budget_level1_key, plan_months_total).
allocated_amount_month = months_total_merged.groupby([
    "member_key",
    "budget_level1_key",
    "plan_months_total",
]).agg({"allocated_amount": "sum"}).reset_index()

# Average allocation per plan month.
allocated_amount_month["allocated_amount_month"] = (
    allocated_amount_month["allocated_amount"] / allocated_amount_month["plan_months_total"]
)

# Column selection uses a list: set indexers are deprecated in pandas 1.5 and
# raise TypeError from 2.0.
allocated_amount_month = allocated_amount_month[[
    'member_key',
    'budget_level1_key',
    'allocated_amount_month',
]]

# NOTE(review): the grouping above also includes plan_months_total but the
# join below does not, so a (member_key, budget_level1_key) pair with several
# plan lengths matches several rows -- confirm this is intended.
months_total_merged_monthly = pd.merge(
    months_total_merged,
    allocated_amount_month,
    how="left",
    on=["member_key", "budget_level1_key"],
)

In [None]:
# One row per distinct (member_key, budget_level1_key, plan_start_date).
# Column selections use lists: set indexers are deprecated in pandas 1.5 and
# raise TypeError from 2.0.
month_today = main_data_set[[
    "member_key",
    "budget_level1_key",
    "plan_start_date",
]].groupby([
    "member_key",
    "budget_level1_key",
    "plan_start_date",
]).count().reset_index()

# Evaluate "today" once so the year and month come from the same instant,
# and parse the start dates once instead of once per use.
today_ts = pd.to_datetime("today")
month_today_start = pd.to_datetime(month_today['plan_start_date'])

# Calendar months elapsed between the plan start and today.
month_today['month_actual'] = (
    (today_ts.year - month_today_start.dt.year) * 12
    + (today_ts.month - month_today_start.dt.month)
)

month_today = month_today[['member_key', 'budget_level1_key', 'month_actual']]

# NOTE(review): month_today keeps one row per distinct plan_start_date, so a
# (member_key, budget_level1_key) pair with several start dates matches
# several rows in this join -- confirm this is intended.
month_actual = pd.merge(months_total_merged_monthly, month_today, how="left", on=["member_key", "budget_level1_key"])

# Allocation expected to date: monthly allocation times elapsed months.
month_actual["allocated_amount_predicted"] = month_actual["allocated_amount_month"] * month_actual["month_actual"]

In [None]:
# Derive the ratio columns (each expressed as a percentage):
#   allocated_amount_percentage       = allocated_amount / allocated_amount_total
#   funded_amount_percentage          = funded_amount / allocated_amount
#   funded_total_percentage           = funded_amount / allocated_amount_total
#   funded_amount_ON_amount_predicted = funded_amount / allocated_amount_predicted
month_actual = month_actual.assign(
    allocated_amount_percentage=lambda frame: (frame['allocated_amount'] / frame['allocated_amount_total']) * 100,
    funded_amount_percentage=lambda frame: (frame['funded_amount'] / frame['allocated_amount']) * 100,
    funded_total_percentage=lambda frame: (frame['funded_amount'] / frame['allocated_amount_total']) * 100,
    funded_amount_ON_amount_predicted=lambda frame: (frame['funded_amount'] / frame['allocated_amount_predicted']) * 100,
)

In [None]:
# Keep only rows whose plan spans at least 12 months.
month_actual = month_actual.loc[month_actual["plan_months_total"].ge(12)]

In [None]:
# Collapse month_actual to one row per grouping key and sum the amount and
# percentage columns within each group.
# "first_name"/"last_name" are not grouping keys, and
# "funded_amount_percentage"/"funded_total_percentage" are not aggregated.
main_data_set = (
    month_actual
    .groupby([
        "member_key",
        "budget_level1_key",
        "budget_level1_name",
        "plan_months_total",
        "month_actual",
        "allocated_amount_total",
        "allocated_amount_predicted",
    ])
    .agg(dict.fromkeys(
        [
            "funded_amount",
            "allocated_amount",
            "funded_amount_ON_amount_predicted",
            "allocated_amount_percentage",
        ],
        "sum",
    ))
    .reset_index()
)

In [None]:
# spending_status is 0 where funded_amount is below allocated_amount_predicted
# and 1 otherwise.
main_data_set['spending_status'] = np.where(
    main_data_set['funded_amount'].lt(main_data_set['allocated_amount_predicted']),
    0,
    1,
)

In [None]:
# Write the final frame as prediction.csv inside ../dataSets/prediction.zip.
compression_opts = {'method': 'zip', 'archive_name': 'prediction.csv'}

main_data_set.to_csv(
    path_or_buf='../dataSets/prediction.zip',
    compression=compression_opts,
)