In [1]:
# -------------------------- LIBRARIES NECESSARY IN THIS PROJECT  -------------------------- #
import pandas as pd
import numpy as np

# -------------------------- FUNCTIONS -------------------------- #

def getDataSet(data_path):
    return pd.read_csv(data_path)


# CHECK COLUMNS WITH NaN VALUES
def getNanDataSetColumns(dataSet):
    nan_values = dataSet.isna()
    nan_columns = nan_values.any()
    columns_with_nan = df_invoices_claims.columns[nan_columns].tolist()
    
    return columns_with_nan

# -------------------------- PATH OF THE DATASETS USED IN THE PROJECT  -------------------------- #

invoices_claims_path="dataSets/Invoices_Claims.csv"
members_supported="dataSets/Members_Supported.csv"
plans_budgets="dataSets/Plans_Budgets.csv"
providers="dataSets/Providers.csv"

## 1. Data manipulation and cleansing<a name="preparation"></a>

In [2]:
# upload the dataSet with invoices and claims data
df_invoices_claims_all_fields = getDataSet(invoices_claims_path)

# Check the invoices and claims data records
#df_invoices_claims.head(10)

# Select just the fields to be used in the project
df_invoices_claims = df_invoices_claims_all_fields[{"member_id"
                                                  , "invoice_date"
                                                  , "invoice_state"
                                                  , "invoice_id"
                                                  , "invoice_total"                                                  
                                                  , "item_category_level3_id"
                                                  , "item_level3_name" 
                                                  , "item_category_level2_id"
                                                  , "name"
                                                  , "item_category_level1_id"
                                                  , "name.1"
                                                  , "claim_id"
                                                  , "claim_funded_amount"
                                                  , "claimed_date"
                                                  , "claim_state"
                                                  }]

# Rename some dataSet colums to create a name's pattern
df_invoices_claims.rename(columns={'item_category_level2_id':'level2_id'
                                 , 'name':'level2_description'
                                 , 'item_category_level1_id':'level1_id'
                                 , 'name.1':'level1_description'
                                 , 'item_category_level3_id':'level3_id'
                                 , 'item_level3_name':'stated_item_description'
                                 , 'invoice_total':'invoice_amount'
                                 , 'claim_funded_amount':'funded_amount'                                                                  
                                  } , inplace = True)

# Consider just paided/refunded invoices or paid/refunded claims
df_invoices_claims_original = df_invoices_claims.loc[df_invoices_claims['invoice_state'].isin(['ALL_PAID'
                                                                                             , 'SHORT_PAID'
                                                                                             , 'PART_PAID'
                                                                                             , 'FULLY_REFUNDED'])

                                                     |
                              
                                                     df_invoices_claims['claim_state'].isin(['PAID'
                                                                                           , 'REFUNDED'])
                                                    ]                                                  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [21]:
df_invoices_claims = df_invoices_claims_original

# Get the name of columns with Nan values
#  and figured out that some fields are Nan values
print('Name of colums with Nan values: ')
print(getNanDataSetColumns(df_invoices_claims))
print('')

# There are 'funded_amount' filds == NaN
#   Checking their types it is seing that they are float64 typed 
df_invoices_claims.info()

# Select state of invoice and claim of all null data 
df_invoices_claims_filtered = df_invoices_claims[df_invoices_claims.isna().any(axis=1)]
df_filtered = df_invoices_claims_filtered[{"invoice_state"
                                         , "claim_state"
                                         , "invoice_id"
                                         , "claim_id"
                                         }]


# Group the state of invoice and claim of all null data
df_filtered_grouped = df_filtered.groupby(["invoice_state"
                                         , "claim_state"
                                         , "invoice_id"
                                         , "claim_id"
                                         ]).count()

# analyzing the data it was concluded that if funded_amount is null means
#  means that this claimed was not paid for some reason.
#  Ps. the invoice 488343 is an example
df_status = df_invoices_claims[df_invoices_claims.isna().any(axis=1)]
df_status[{"invoice_state", "funded_amount", "claim_state"}]

# So it is considered from now on just 'funded_amount' not null
df_invoices_claims = df_invoices_claims.dropna()

df_total_amounts = df_invoices_claims.loc[df_invoices_claims['invoice_state'].isin(['ALL_PAID'])

                                          &
                              
                                          df_invoices_claims['claim_state'].isin(['PAID'
                                                                                , 'CANCELLED'
                                                                                , 'NOT_CLAIMABLE'])
                                         ] 

df_total_amounts_grouped = df_total_amounts.groupby("invoice_amount", as_index=False).agg({"funded_amount": "sum"})

# Verify whether an ALL_PAIDED invoice can pay CANCELLED or NOT_CLAIMABLE claims
 # as it returns true, the CANCELLED, NOT_CLAIMABLE will be keeped in the dataSet
df_total_amounts_grouped[df_total_amounts_grouped["invoice_amount"] == df_total_amounts_grouped["funded_amount"]].any().any()

# Group the invoices from their claims
df_invoices_claims_normalised = df_invoices_claims.groupby(["member_id"
                                                          , "claim_id"
                                                          , "claim_state"
                                                          , "invoice_id"
                                                          , "invoice_state"
                                                          , "level3_id"
                                                          , "stated_item_description"
                                                          ,	"level2_id" 
                                                          , "level2_description"
                                                          , "level1_id"
                                                          , "level1_description"
                                                          , "invoice_amount"]).agg({"funded_amount": "sum"}).reset_index()

Name of colums with Nan values: 
['funded_amount']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 251254 entries, 0 to 270151
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   member_id                251254 non-null  int64  
 1   claim_id                 251254 non-null  int64  
 2   funded_amount            251149 non-null  float64
 3   invoice_amount           251254 non-null  float64
 4   level3_id                251254 non-null  int64  
 5   invoice_id               251254 non-null  int64  
 6   stated_item_description  251254 non-null  object 
 7   level2_id                251254 non-null  int64  
 8   claim_state              251254 non-null  object 
 9   invoice_state            251254 non-null  object 
 10  invoice_date             251254 non-null  object 
 11  level2_description       251254 non-null  object 
 12  claimed_date             251254 non-null  object 
 13  level1_

Unnamed: 0,member_id,claim_id,claim_state,invoice_id,invoice_state,level3_id,stated_item_description,level2_id,level2_description,level1_id,level1_description,invoice_amount,funded_amount
0,2,828216,PAID,451938,ALL_PAID,6111,"Assessment, Recommendation, Therapy And/Or Tra...",15,Improved daily living skills,3,Capacity Building,135.00,135.00
1,2,866817,PAID,471149,ALL_PAID,6111,"Assessment, Recommendation, Therapy And/Or Tra...",15,Improved daily living skills,3,Capacity Building,141.00,141.00
2,2,874748,PAID,474710,ALL_PAID,6111,"Assessment, Recommendation, Therapy And/Or Tra...",15,Improved daily living skills,3,Capacity Building,135.00,135.00
3,2,902819,PAID,488447,ALL_PAID,6111,"Assessment, Recommendation, Therapy And/Or Tra...",15,Improved daily living skills,3,Capacity Building,142.50,142.50
4,2,904241,PAID,489078,ALL_PAID,6111,"Assessment, Recommendation, Therapy And/Or Tra...",15,Improved daily living skills,3,Capacity Building,141.00,141.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
250740,6632,989323,PAID,531311,ALL_PAID,6568,Community Participation Activities,9,Increased social and community participation,3,Capacity Building,100.00,100.00
250741,6635,988438,PAID,530736,ALL_PAID,4766,House Cleaning And Other Household Activities,1,Assistance with daily living,1,Core,321.20,321.20
250742,6635,988445,PAID,530743,ALL_PAID,6111,"Assessment, Recommendation, Therapy And/Or Tra...",15,Improved daily living skills,3,Capacity Building,387.98,387.98
250743,6679,994330,PAID,533712,ALL_PAID,4766,House Cleaning And Other Household Activities,1,Assistance with daily living,1,Core,125.50,100.40
