In [1]:
# -------------------------- LIBRARIES NECESSARY IN THIS PROJECT  -------------------------- #

import zipfile
from datetime import *

import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# -------------------------- FUNCTIONS -------------------------- #

def getDataSet(data_path):
    """Read the CSV found at ``data_path`` and return it as a pandas DataFrame.

    ``data_path`` is handed unchanged to ``pd.read_csv``.
    """
    loaded_frame = pd.read_csv(data_path)
    return loaded_frame

def _plotColumnPercentages(percentages):
    """Bar-plot and print a per-column percentage Series.

    Columns whose percentage is not strictly positive are dropped, the rest
    are sorted in ascending order, plotted as a seaborn bar chart (one bar
    per column, x labels rotated 90 degrees) and finally printed as a table
    with the columns ``Percentual`` and ``Column``.

    Parameters
    ----------
    percentages : pandas.Series
        Percentage value per dataSet column, indexed by column name.
    """
    percentages = percentages[percentages > 0].sort_values()

    summary = percentages.to_frame()
    summary.columns = ['Percentual']
    summary.index.names = ['Name']
    summary['Column'] = summary.index

    # Plots the graph
    sns.set(style="whitegrid", color_codes=True)
    sns.barplot(x='Column', y='Percentual', data=summary)
    plt.xticks(rotation=90)
    plt.show()
    print(summary)


def plotNaNValuesColums(dataSet):
    """Plot and print the percentage of null values per column.

    Only columns that contain at least one null value are shown.
    """
    _plotColumnPercentages((dataSet.isnull().sum() / len(dataSet)) * 100)


def plotNotNaNValuesColums(dataSet):
    """Plot and print the percentage of non-null values per column.

    Only columns that contain at least one non-null value are shown.
    """
    _plotColumnPercentages((dataSet.notnull().sum() / len(dataSet)) * 100)

## Load datasets

### Members dataset cleansing

In [2]:
# -------------------------- PATH OF THE DATASETS USED IN THE PROJECT  -------------------------- #
# Read main.csv straight out of the zip archive. The context managers close
# both the archive and the member stream once the frame has been loaded.
with zipfile.ZipFile('../dataSets/main.zip') as zf, zf.open('main.csv') as main_csv:
    main_data_set_all_fields = pd.read_csv(main_csv)

## Data manipulation and cleansing

In [3]:
# Check the dataSet fields: prints every column with its non-null count and
# dtype, which shows which fields are only sparsely populated.
main_data_set_all_fields.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              32950 non-null  int64  
 1   member_key              32950 non-null  object 
 2   updated_at              32950 non-null  object 
 3   first_name              32950 non-null  object 
 4   last_name               32950 non-null  object 
 5   disabilities            8383 non-null   object 
 6   SA1                     3751 non-null   float64
 7   price_zone_code         32950 non-null  object 
 8   plan_key                32950 non-null  object 
 9   plan_status             32950 non-null  object 
 10  plan_start_date         32950 non-null  object 
 11  plan_end_date           32950 non-null  object 
 12  budget_level3_name      840 non-null    object 
 13  budget_level3_key       840 non-null    object 
 14  budget_level2_key       32950 non-null

In [4]:
# Preview the first four rows of the raw dataSet.
main_data_set_all_fields.head(4)

Unnamed: 0.1,Unnamed: 0,member_key,updated_at,first_name,last_name,disabilities,SA1,price_zone_code,plan_key,plan_status,...,budget_level1_key,budget_level1_name,invoice_state,claim_state,allocated_amount,requested_amount,funded_amount,allocated_amount_total,requested_amount_total,funded_amount_total
0,0,00109970-7029-11eb-81d6-9d4df94b6224,2021-04,Wynny,Pressman,,,ACT_NSW_QLD_VIC,588427f1-e45d-4a6e-b307-9c68a13849ef,PLAN_DELIVERY_ACTIVE,...,58745466-f597-11e9-bfb4-022d4762bb3c,Core,ALL_PAID,PAID,50000.0,98.8,98.8,272168.34,11316.7,11316.7
1,1,00109970-7029-11eb-81d6-9d4df94b6224,2021-04,Wynny,Pressman,,,ACT_NSW_QLD_VIC,588427f1-e45d-4a6e-b307-9c68a13849ef,PLAN_DELIVERY_ACTIVE,...,58745466-f597-11e9-bfb4-022d4762bb3c,Core,ALL_PAID,PAID,222168.34,11217.9,11217.9,272168.34,11316.7,11316.7
2,2,0010f240-3b5d-11eb-aa73-7be7bbd3c758,2021-04,Beryl,McDuff,Asperger syndrome,3138648.0,ACT_NSW_QLD_VIC,ef286fe8-8b36-4d77-97a0-59f9ef3df9b8,PLAN_DELIVERY_ACTIVE,...,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,ALL_PAID,PAID,9801.6,2559.6384,2559.67,9801.6,2559.6384,2559.67
3,3,00117a50-cbc6-11ea-805a-75fc51d39a77,2021-04,Jethro,Perchard,,,ACT_NSW_QLD_VIC,8aa26238-d922-4963-bcaf-427d24a610bc,COMPLETED,...,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,ALL_PAID,PAID,64524.0,8503.65,8503.65,64524.0,8503.65,8503.65


In [5]:
# Swap every NaN for the placeholder string 'NA', then keep only the rows
# whose plan is currently being delivered.
main_data_No_NaN = (
    main_data_set_all_fields
    .replace(np.nan, 'NA')
    .loc[lambda frame: frame["plan_status"] == 'PLAN_DELIVERY_ACTIVE']
)

# main_data_No_NaN['updated_at'] = pd.to_datetime(main_data_No_NaN['updated_at']).dt.strftime('%Y-%m')

In [6]:
# Collapse the rows to one per combination of the keys below, summing the
# funded and allocated amounts within each combination.
group_keys = [
    "member_key",
    "first_name",
    "last_name",
    "budget_level1_key",
    "budget_level1_name",
    "plan_start_date",
    "plan_end_date",
    "allocated_amount_total",
]
amount_totals = {"funded_amount": "sum", "allocated_amount": "sum"}

main_data_set = main_data_No_NaN.groupby(group_keys).agg(amount_totals).reset_index()

main_data_set

Unnamed: 0,member_key,first_name,last_name,budget_level1_key,budget_level1_name,plan_start_date,plan_end_date,allocated_amount_total,funded_amount,allocated_amount
0,00109970-7029-11eb-81d6-9d4df94b6224,Wynny,Pressman,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2021-02,2024-02,272168.34,11316.70,272168.34
1,0010f240-3b5d-11eb-aa73-7be7bbd3c758,Beryl,McDuff,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2020-09,2021-09,9801.60,2559.67,9801.60
2,0011ebe0-7722-11eb-9ade-8f3368e47951,Eamon,Gammage,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2021-02,2024-02,312025.20,3629.66,294000.00
3,0011ebe0-7722-11eb-9ade-8f3368e47951,Eamon,Gammage,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2021-02,2024-02,312025.20,691.27,18025.20
4,007c1090-d6ca-11ea-bb8a-a73bf14c75f9,Janean,Potzold,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2020-08,2021-08,155148.97,4213.97,73368.83
...,...,...,...,...,...,...,...,...,...,...
9266,ff630c80-d094-11ea-847f-196f66acbe45,Mureil,Labb,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2020-06,2021-06,19623.84,2069.29,19002.84
9267,ff656e00-7fce-11eb-8e57-d94003868067,Jeramie,Burgot,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2021-03,2022-03,48587.66,204.36,3000.00
9268,ff656e00-7fce-11eb-8e57-d94003868067,Jeramie,Burgot,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2021-03,2022-03,48587.66,750.25,45587.66
9269,ffddfbc0-0861-11eb-92a0-2d24c44fa781,Allard,Postle,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2020-09,2021-09,18817.03,5508.97,18817.03


In [7]:
# Plan duration in whole months, derived row by row from the "YYYY-MM"
# plan_start_date / plan_end_date values.
plan_start = pd.to_datetime(main_data_set["plan_start_date"])
plan_end = pd.to_datetime(main_data_set["plan_end_date"])

# Columns are selected with a list: a set literal yields an arbitrary column
# order and is rejected as an indexer by recent pandas releases.
months_total = main_data_set[["member_key", "plan_start_date", "plan_end_date"]].copy()
months_total["plan_months_total"] = (
    (plan_end.dt.year - plan_start.dt.year) * 12
    + (plan_end.dt.month - plan_start.dt.month)
)

# Attach the duration to each row directly. months_total shares its index with
# main_data_set, so the assignment is row-aligned; merging back on member_key
# alone would multiply rows for any member holding plans of different lengths.
months_total_merged = main_data_set.assign(plan_months_total=months_total["plan_months_total"])

# Keep only plans that run for at least twelve months.
months_total_merged = months_total_merged.loc[months_total_merged["plan_months_total"] >= 12]

months_total_merged

Unnamed: 0,member_key,first_name,last_name,budget_level1_key,budget_level1_name,plan_start_date,plan_end_date,allocated_amount_total,funded_amount,allocated_amount,plan_months_total
0,00109970-7029-11eb-81d6-9d4df94b6224,Wynny,Pressman,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2021-02,2024-02,272168.34,11316.70,272168.34,36
1,0010f240-3b5d-11eb-aa73-7be7bbd3c758,Beryl,McDuff,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2020-09,2021-09,9801.60,2559.67,9801.60,12
2,0011ebe0-7722-11eb-9ade-8f3368e47951,Eamon,Gammage,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2021-02,2024-02,312025.20,3629.66,294000.00,36
3,0011ebe0-7722-11eb-9ade-8f3368e47951,Eamon,Gammage,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2021-02,2024-02,312025.20,691.27,18025.20,36
4,007c1090-d6ca-11ea-bb8a-a73bf14c75f9,Janean,Potzold,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2020-08,2021-08,155148.97,4213.97,73368.83,12
...,...,...,...,...,...,...,...,...,...,...,...
9266,ff630c80-d094-11ea-847f-196f66acbe45,Mureil,Labb,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2020-06,2021-06,19623.84,2069.29,19002.84,12
9267,ff656e00-7fce-11eb-8e57-d94003868067,Jeramie,Burgot,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2021-03,2022-03,48587.66,204.36,3000.00,12
9268,ff656e00-7fce-11eb-8e57-d94003868067,Jeramie,Burgot,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2021-03,2022-03,48587.66,750.25,45587.66,12
9269,ffddfbc0-0861-11eb-92a0-2d24c44fa781,Allard,Postle,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2020-09,2021-09,18817.03,5508.97,18817.03,12


In [8]:
# Average monthly allocation = allocated amount spread evenly over the plan length.
# List indexers keep the column order deterministic (set indexers are not
# accepted by recent pandas releases).
allocated_amount_month = months_total_merged[
    ["member_key", "budget_level1_key", "plan_months_total", "allocated_amount"]
].copy()

allocated_amount_month["allocated_amount_month"] = (
    allocated_amount_month["allocated_amount"] / allocated_amount_month["plan_months_total"]
)

allocated_amount_month = allocated_amount_month[
    ["member_key", "budget_level1_key", "allocated_amount_month"]
]

# allocated_amount_month shares its index with months_total_merged, so the new
# column is attached by row alignment. A key-based merge would duplicate rows
# whenever (member_key, budget_level1_key) is not unique. The index is reset to
# 0..n-1, matching what the previous merge-based version produced.
months_total_merged_monthly = months_total_merged.assign(
    allocated_amount_month=allocated_amount_month["allocated_amount_month"]
).reset_index(drop=True)

months_total_merged_monthly

Unnamed: 0,member_key,first_name,last_name,budget_level1_key,budget_level1_name,plan_start_date,plan_end_date,allocated_amount_total,funded_amount,allocated_amount,plan_months_total,allocated_amount_month
0,00109970-7029-11eb-81d6-9d4df94b6224,Wynny,Pressman,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2021-02,2024-02,272168.34,11316.70,272168.34,36,7560.231667
1,0010f240-3b5d-11eb-aa73-7be7bbd3c758,Beryl,McDuff,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2020-09,2021-09,9801.60,2559.67,9801.60,12,816.800000
2,0011ebe0-7722-11eb-9ade-8f3368e47951,Eamon,Gammage,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2021-02,2024-02,312025.20,3629.66,294000.00,36,8166.666667
3,0011ebe0-7722-11eb-9ade-8f3368e47951,Eamon,Gammage,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2021-02,2024-02,312025.20,691.27,18025.20,36,500.700000
4,007c1090-d6ca-11ea-bb8a-a73bf14c75f9,Janean,Potzold,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2020-08,2021-08,155148.97,4213.97,73368.83,12,6114.069167
...,...,...,...,...,...,...,...,...,...,...,...,...
8870,ff630c80-d094-11ea-847f-196f66acbe45,Mureil,Labb,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2020-06,2021-06,19623.84,2069.29,19002.84,12,1583.570000
8871,ff656e00-7fce-11eb-8e57-d94003868067,Jeramie,Burgot,58745466-f597-11e9-bfb4-022d4762bb3c,Core,2021-03,2022-03,48587.66,204.36,3000.00,12,250.000000
8872,ff656e00-7fce-11eb-8e57-d94003868067,Jeramie,Burgot,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2021-03,2022-03,48587.66,750.25,45587.66,12,3798.971667
8873,ffddfbc0-0861-11eb-92a0-2d24c44fa781,Allard,Postle,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,2020-09,2021-09,18817.03,5508.97,18817.03,12,1568.085833


In [9]:
#allocated_amount_month = months_total_merged.groupby([
#                                                        "member_key"
#                                                      , "budget_level1_key"
#                                                      , "plan_months_total"
#                                                        ]).agg({"allocated_amount": "sum"}).reset_index()

#allocated_amount_month["allocated_amount_month"] = allocated_amount_month["allocated_amount"] / allocated_amount_month["plan_months_total"]                                                    
#allocated_amount_month = allocated_amount_month[{'member_key', 'budget_level1_key', 'allocated_amount_month'}]

#months_total_merged_monthly = pd.merge(months_total_merged, allocated_amount_month, how="left", on=["member_key", "budget_level1_key"])

#months_total_merged_monthly

In [10]:
# Months elapsed between each plan's start month and the current month.
# "today" is evaluated once so that year and month come from the same instant.
# NOTE(review): results depend on the date the notebook is run.
today = pd.to_datetime("today")
plan_start = pd.to_datetime(months_total_merged_monthly["plan_start_date"])

# The elapsed months are computed on the rows of months_total_merged_monthly
# themselves (which already carry plan_start_date) instead of being merged in
# from main_data_set, where a non-unique (member_key, budget_level1_key) pair
# would multiply rows.
month_actual = months_total_merged_monthly.assign(
    month_actual=(today.year - plan_start.dt.year) * 12 + (today.month - plan_start.dt.month)
).reset_index(drop=True)

# Kept under its previous name and columns; it now covers exactly the rows of month_actual.
month_today = month_actual[["member_key", "budget_level1_key", "month_actual"]]

# Spend expected so far if the allocation were used evenly month by month.
month_actual["spent_amount_predicted"] = month_actual["allocated_amount_month"] * month_actual["month_actual"]

In [11]:
#month_actual['spent_amount_predicted_percentage'] = (month_actual['spent_amount_predicted'] / month_actual['allocated_amount']) * 100
#month_actual['funded_amount_percentage'] = (month_actual['funded_amount'] / month_actual['allocated_amount']) * 100

In [12]:
# Round every monetary column to two decimal places (cents).
for amount_column in (
    'funded_amount',
    'allocated_amount',
    'allocated_amount_month',
    'spent_amount_predicted',
):
    month_actual[amount_column] = month_actual[amount_column].round(2)

In [13]:
# Keep only the columns needed for labelling and modelling, in a fixed order.
# A list is used instead of a set: a set yields an arbitrary column order,
# silently swallowed the duplicated "plan_months_total" entry, and is rejected
# as an indexer by recent pandas releases.
main_data_set = month_actual[[
    "member_key",
    "first_name",
    "last_name",
    "budget_level1_key",
    "budget_level1_name",
    "plan_months_total",
    "month_actual",
    "allocated_amount_total",
    "funded_amount",
    "spent_amount_predicted",
]].sort_values(by=['member_key', 'budget_level1_key'])

main_data_set

Unnamed: 0,allocated_amount_total,member_key,budget_level1_name,month_actual,funded_amount,first_name,spent_amount_predicted,budget_level1_key,last_name,plan_months_total
0,272168.34,00109970-7029-11eb-81d6-9d4df94b6224,Core,3,11316.70,Wynny,22680.69,58745466-f597-11e9-bfb4-022d4762bb3c,Pressman,36
1,9801.60,0010f240-3b5d-11eb-aa73-7be7bbd3c758,Capacity Building,8,2559.67,Beryl,6534.40,5874551b-f597-11e9-bfb4-022d4762bb3c,McDuff,12
2,312025.20,0011ebe0-7722-11eb-9ade-8f3368e47951,Core,3,3629.66,Eamon,24500.00,58745466-f597-11e9-bfb4-022d4762bb3c,Gammage,36
3,312025.20,0011ebe0-7722-11eb-9ade-8f3368e47951,Capacity Building,3,691.27,Eamon,1502.10,5874551b-f597-11e9-bfb4-022d4762bb3c,Gammage,36
4,155148.97,007c1090-d6ca-11ea-bb8a-a73bf14c75f9,Core,9,4213.97,Janean,55026.62,58745466-f597-11e9-bfb4-022d4762bb3c,Potzold,12
...,...,...,...,...,...,...,...,...,...,...
8870,19623.84,ff630c80-d094-11ea-847f-196f66acbe45,Capacity Building,11,2069.29,Mureil,17419.27,5874551b-f597-11e9-bfb4-022d4762bb3c,Labb,12
8871,48587.66,ff656e00-7fce-11eb-8e57-d94003868067,Core,2,204.36,Jeramie,500.00,58745466-f597-11e9-bfb4-022d4762bb3c,Burgot,12
8872,48587.66,ff656e00-7fce-11eb-8e57-d94003868067,Capacity Building,2,750.25,Jeramie,7597.94,5874551b-f597-11e9-bfb4-022d4762bb3c,Burgot,12
8873,18817.03,ffddfbc0-0861-11eb-92a0-2d24c44fa781,Capacity Building,8,5508.97,Allard,12544.69,5874551b-f597-11e9-bfb4-022d4762bb3c,Postle,12


In [14]:
# Label each row by comparing what has actually been funded with the spend
# expected by now. Rows where the two amounts are equal fall into 'Overspending'
# because the comparison is strict.
is_underspending = main_data_set['funded_amount'] < main_data_set['spent_amount_predicted']
main_data_set['spending_status'] = np.where(is_underspending, 'Underspending', 'Overspending')

In [15]:
# Modelling frame: the four numeric features followed by the target label.
# Selected with a list so the column order is deterministic (a set indexer
# gives an arbitrary order and is rejected by recent pandas releases).
df_model = main_data_set[[
    'funded_amount',
    'spent_amount_predicted',
    'month_actual',
    'plan_months_total',
    'spending_status',
]].copy()

In [16]:
#df_model = main_data_set[{'spent_amount_predicted_percentage', 'funded_amount_percentage', 'spending_status'}].loc[
#                                                                                                                  (main_data_set['spent_amount_predicted_percentage'] >0)
#                                                                                                                & (main_data_set['funded_amount_percentage'] >0)

#                                                                                                                & (main_data_set['spending_status'] != 2)

#                                                                                                                  ]

In [17]:
# Preview the first rows of the modelling frame.
df_model.head()

Unnamed: 0,spending_status,funded_amount,month_actual,spent_amount_predicted,plan_months_total
0,Underspending,11316.7,3,22680.69,36
1,Underspending,2559.67,8,6534.4,12
2,Underspending,3629.66,3,24500.0,36
3,Underspending,691.27,3,1502.1,36
4,Underspending,4213.97,9,55026.62,12


In [18]:
#df_model.plot(x='spent_amount_predicted_percentage', y='funded_amount_percentage', style='o')
#plt.title('Hours vs Percentage')
#plt.xlabel('Hours Studied')
#plt.ylabel('Percentage Score')
#plt.show()

In [19]:
# Feature matrix and target. The feature order is fixed by a list so that the
# columns of the scaled array (and therefore the fitted coefficients) always
# refer to the same features; a set indexer gives an arbitrary order and is
# rejected by recent pandas releases.
feature_columns = ['funded_amount', 'spent_amount_predicted', 'month_actual', 'plan_months_total']

x = df_model[feature_columns]
y = df_model['spending_status']

# Rescale every feature to the range [-1, 1].
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the scaling ranges.
normalized_range = MinMaxScaler(feature_range=(-1, 1))
x = normalized_range.fit_transform(x)

In [20]:
# Hold out 25% of the rows for evaluation. The split is seeded so the reported
# score and confusion matrix are reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [21]:
# Unpenalised multinomial logistic regression solved with newton-cg.
# NOTE(review): newer scikit-learn releases spell the "no penalty" option as
# penalty=None and deprecate multi_class; confirm against the installed version.
regression = LogisticRegression(
    random_state=0,
    multi_class='multinomial',
    penalty='none',
    solver='newton-cg',
    max_iter=200,
    warm_start=False,
)

# Fit once (the model used to be fitted twice on the same data); fit() returns
# the estimator, so the cell still displays it.
regression.fit(x_train, y_train)

In [None]:
# Score of the fitted model on the held-out split, printed as a percentage.
prediction_score = regression.score(x_test, y_test)
score_as_percent = prediction_score * 100
print('Prediction score: ', score_as_percent)

In [None]:
# Intercept term(s) learned by the fitted model.
print(regression.intercept_)

In [None]:
# Coefficients (weights) learned by the fitted model for the scaled features.
print(regression.coef_)

In [None]:
# Predicted labels for the held-out test features.
y_pred = regression.predict(x_test)

In [None]:
# Confusion matrix comparing the true test labels with the predicted ones.
confusion_matrix(y_test, y_pred)
#confmtrx = np.array(confusion_matrix)

#confmtrx

In [None]:
#df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
#df.head(20)

#df.loc[ (df['Actual'] == 1) & (df['Predicted'] == 1)]

In [None]:
# Predict the spending status of every prepared row and print it per member.
# The prepared frame (main_data_set) is used because it holds the feature
# columns the model was trained on; the raw frame does not.
# NOTE(review): this scores the same rows the model was trained/tested on.
model_to_be_predicted = main_data_set.copy()

# Use the exact feature order the scaler was fitted with when it is recorded on
# the scaler; otherwise fall back to the explicit feature list.
feature_columns = list(getattr(
    normalized_range,
    'feature_names_in_',
    ['funded_amount', 'spent_amount_predicted', 'month_actual', 'plan_months_total'],
))

# Apply the same scaling as at training time, then predict all rows in one call.
scaled_features = normalized_range.transform(model_to_be_predicted[feature_columns])
model_to_be_predicted['predicted_status'] = regression.predict(scaled_features)

for row in model_to_be_predicted.itertuples(index=False):
    member_key = (' member_key: ' + row.member_key
                  + ' First name: ' + row.first_name
                  + ' in Level 1 == ' + row.budget_level1_name)

    # The model predicts the string labels it was trained on.
    if row.predicted_status == 'Underspending':
        print(member_key + ' -------------- UNDERSPENDING')
    else:
        print(member_key + ' -------------- OVERSPENDING')

In [None]:
#compression_opts = dict(method='zip',
#                        archive_name='prediction.csv')

#main_data_set.to_csv('../dataSets/prediction.zip', compression=compression_opts)