# Prepare Data to be Analyzed with Modulos AutoML

Note: All operations below rely on every table being sorted by cow ID (the rows of a cow are located with a binary search on the ID column) and, within each cow, by date. This sorting is done in the notebook DataCleaning.ipynb.

## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os
from IPython.display import display
import tqdm
from collections import Counter
import matplotlib
pd.options.display.max_columns = None
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import numpy as np

## Configure path variables and number of samples

In [2]:
# Directory holding the cleaned CSV tables (one file per table name).
# The trailing slash is required: paths are built by plain string concatenation.
fpath_clean_data_dir = 'clean_data/'

# Parent directory for the ML-ready output, and the sub-folder (dataset name) created
# inside it. Both need the trailing slash for the same reason as above.
fpath_prepared_data_dir = 'ready_data/'
foldername_prepared_data = 'ai_basic_all/'

# Number of unique cow IDs to consider. The computation is very slow, so including
# all samples is only advised once one is happy with the result on a smaller sample.
nsamples = None  # None -> use every cow ID (the ID list is sliced as [:nsamples])
# nsamples = 10000

In [3]:
# Create the output folder (including missing parents); no error if it already exists.
# Done in Python rather than via `!mkdir -p` so it also works without a POSIX shell
# and is not affected by spaces or shell metacharacters in the path.
os.makedirs(fpath_prepared_data_dir + foldername_prepared_data, exist_ok=True)

In [4]:
def select_cow_by_id(df, cow_id, id_label='idani_anon'):
    """
    Pick out every row of a dataframe that belongs to one cow.

    :param df: Pandas dataframe
    :param cow_id: Cow ID whose rows are wanted
    :param id_label: Name of the column holding the cow IDs
    :return: Rows of df whose ID column equals cow_id (original index is kept)
    """
    belongs_to_cow = df[id_label] == cow_id
    return df[belongs_to_cow]

# Data Loading

Load all relevant tables into one dictionary. Note that we are not considering hm_BCS and hm_pregnancy in this first implementation.

In [5]:
# Columns with datetime entries & file names.
# Each key is a table name (= CSV file name without the '.csv' extension); each value
# is passed unchanged to pd.read_csv(parse_dates=...) when the table is loaded, so a
# list names the columns to parse as datetimes and False means "parse no dates".
# The commented-out entries (hm_BCS, hm_pregnancy) are not loaded.
datetime_cols = {#'hm_BCS': ['BCS_date'],
                 'hm_lactation': ['calving_date'],
                 'hm_NSAIET': ['nsaiet_date'],
                 'hm_animal': ['birth_date'], 
                 'hm_milkrecording': ['mlksmpl_date', 'lab_date'],
                 'hm_ebv': False,
#                  'hm_pregnancy': ['pregnancy_detection_date'],
                 'hm_health': ['healthevent_date']
                 }

# Table names, in the insertion order of the dict above
fnames = list(datetime_cols.keys())
print('File names all:', fnames)

File names all: ['hm_lactation', 'hm_NSAIET', 'hm_animal', 'hm_milkrecording', 'hm_ebv', 'hm_health']


In [6]:
# Load every table listed in fnames into a dict keyed by table name
data_frames = {}
for fname in fnames:
    print('----- Reading in {:}.csv -----'.format(fname))
    # File path = clean-data directory + table name + '.csv' (plain concatenation)
    fpath = fpath_clean_data_dir+fname+'.csv'
    # parse_dates receives the datetime columns configured for this table (or False)
    data_frames[fname] = pd.read_csv(fpath, parse_dates=datetime_cols[fname])
    # Show the first 10 rows as a quick check of what was read
    print(data_frames[fname].head(10))
    print()

----- Reading in hm_lactation.csv -----
   parity calving_date calving_ease       idani_anon
0       1   2018-09-06            2  CHE000000000561
1       2   2019-09-15            2  CHE000000000561
2       1   2016-09-07            2  CHE000000000781
3       2   2017-08-05            1  CHE000000000781
4       3   2018-10-18          2.5  CHE000000000781
5       4   2019-09-29            1  CHE000000000781
6       1   2017-11-01            1  CHE000000001494
7       2   2018-11-01            2  CHE000000001494
8       3   2020-01-01            2  CHE000000001494
9       1   2013-09-28            1  CHE000000002000

----- Reading in hm_NSAIET.csv -----
   parity nsaiet_date nsaiet_type  AI_technician       idani_anon  \
0       0  2017-11-05    Besamung             10  CHE000000000561   
1       0  2017-11-28    Besamung             10  CHE000000000561   
2       1  2018-11-16    Besamung             10  CHE000000000561   
3       1  2018-12-05    Besamung             10  CHE0000000005

## Data Manipulation & Enhancement

### Remove all parity = 0 entries (i.e. inseminations before the cow has given birth for the first time and started producing milk)

In [7]:
# Drop every insemination with parity == 0 and renumber the index.
# A boolean mask is used instead of handing row *positions* to .drop(): .drop() expects
# index labels, so the positional variant is only correct while the index is 0..n-1.
orig_rows = data_frames['hm_NSAIET'].shape[0]
mask = data_frames['hm_NSAIET']['parity'].values == 0
data_frames['hm_NSAIET'] = data_frames['hm_NSAIET'][~mask].reset_index(drop=True)

remaining_rows = data_frames['hm_NSAIET'].shape[0]
# Guard the percentage against an empty input table (division by zero)
removed_percent = (1-remaining_rows/orig_rows)*100 if orig_rows else 0.0
print('Removed {:} entries ({:.2f}%)'.format(orig_rows-remaining_rows, removed_percent))

Removed 329889 entries (24.15%)


### List of unique cow IDs by considering intersection of all the tables with necessary inputs for prediction

In [8]:
# Tables every cow must appear in to be usable for the prediction.
# 'hm_health' is left out on purpose: it doesn't contain many cows, so requiring it
# would throw away much of the data.
fnames_necessary = [fname for fname in fnames if fname != 'hm_health']

# Cow IDs present in *all* necessary tables.
# The intersection is sorted because the iteration order of a set of strings changes
# between kernel sessions (string hashing is randomised per process); without sorting,
# unique_cow_ids[:nsamples] would pick a different subset of cows on every run.
unique_cow_ids = [set(data_frames[fname]['idani_anon'].values) for fname in fnames_necessary]
unique_cow_ids = sorted(set.intersection(*unique_cow_ids))

print('Number of individual cows in sample: {:}'.format(len(unique_cow_ids)))

Number of individual cows in sample: 180005


### Convert parity to labels (= column used for prediction)
If the same parity number occurs multiple times, only the entry with the most recent time stamp is considered a success; the others are considered failures. Parities that appear only once are considered a success by default.

In [9]:
def parity_to_label_for_single_cow(df):
    """
    Derive the insemination outcome labels for one cow from its 'parity' column.

    The last row (in dataframe order) carrying a given parity number is labelled as a
    success (1); every earlier row with that same parity number is labelled 0.

    :param df: Subset of a Pandas dataframe containing all the relevant entries for a single cow
    :return: List with one label (1 = successful, 0 = unsuccessful) per row of df, in row order
    """

    already_labelled = []
    labels_backwards = []

    # Walk from the last row to the first, so the final occurrence of each parity
    # number is the one that is encountered first
    for parity in reversed(df['parity'].values):
        is_last_occurrence = parity not in already_labelled
        if is_last_occurrence:
            already_labelled.append(parity)
        labels_backwards.append(1 if is_last_occurrence else 0)

    # Restore the original row order
    labels_backwards.reverse()
    return labels_backwards

#### Convert labels for all cows (using unique_cow_ids from above)

In [10]:
ids_to_remove = 0
consistent_cow_ids = []

# One label per hm_NSAIET row. Rows of cows that are not processed keep the label 0.
# (builtin int replaces np.int, which was removed in NumPy 1.24 and was only an alias)
parity_labels_all = np.zeros(data_frames['hm_NSAIET'].shape[0], dtype=int)
nsaiet_ids = data_frames['hm_NSAIET']["idani_anon"]

for cow_id in tqdm.tqdm(unique_cow_ids):
    # Binary search for the contiguous row range of this cow (requires sorted IDs)
    left = nsaiet_ids.searchsorted(cow_id, 'left')
    right = nsaiet_ids.searchsorted(cow_id, 'right')

    single_cow = data_frames['hm_NSAIET'][left:right]

    parity_values = single_cow['parity'].values
    # A cow is inconsistent as soon as ANY parity is out of ascending order
    # (assumes the rows of a cow are in date order -- TODO confirm against DataCleaning).
    # The previous `.all()` only fired when every single entry was out of place.
    if (parity_values != np.sort(parity_values)).any():
        ids_to_remove += 1
    else:
        consistent_cow_ids.append(cow_id)
        parity_labels_all[left:right] = parity_to_label_for_single_cow(single_cow)

# Replace the ID list only after the loop: calling .remove() on the list while
# iterating over it makes the iteration skip the element following each removal.
unique_cow_ids = consistent_cow_ids

data_frames['hm_NSAIET']['parity_labels'] = parity_labels_all

print('Samples removed due to inconsistencies between the parities and the NSAIET-date: {:}'.format(ids_to_remove))

100%|██████████| 180005/180005 [00:33<00:00, 5410.62it/s]

Samples removed due to inconsistencies between the parities and the NSAIET-date: 0





## Display all dataframes individually (sanity check)

In [11]:
data_frames['hm_lactation']

Unnamed: 0,parity,calving_date,calving_ease,idani_anon
0,1,2018-09-06,2,CHE000000000561
1,2,2019-09-15,2,CHE000000000561
2,1,2016-09-07,2,CHE000000000781
3,2,2017-08-05,1,CHE000000000781
4,3,2018-10-18,2.5,CHE000000000781
...,...,...,...,...
637269,1,2013-02-26,2,CHE000099999926
637270,2,2014-03-24,1,CHE000099999926
637271,3,2015-03-05,2,CHE000099999926
637272,4,2016-06-18,3,CHE000099999926


In [12]:
data_frames['hm_NSAIET']

Unnamed: 0,parity,nsaiet_date,nsaiet_type,AI_technician,idani_anon,idani_anon_aisire,parity_labels
0,1,2018-11-16,Besamung,10,CHE000000000561,CHE000002123235,0
1,1,2018-12-05,Besamung,10,CHE000000000561,CHE000002123235,1
2,2,2019-12-09,Besamung,10,CHE000000000561,CHE000001110094,0
3,2,2019-12-10,Besamung,10,CHE000000000561,CHE000001110094,0
4,2,2020-01-31,Besamung,10,CHE000000000561,CHE000090893216,1
...,...,...,...,...,...,...,...
1036244,1,2013-06-13,Besamung,10,CHE000099999926,CHE000025255973,1
1036245,2,2014-05-19,Besamung,10,CHE000099999926,CHE000038915903,1
1036246,3,2015-09-02,Belegung,5,CHE000099999926,CHE000099239508,1
1036247,4,2016-10-14,Besamung,10,CHE000099999926,CHE000045797183,0


In [13]:
data_frames['hm_animal']

Unnamed: 0,birth_date,brd_abbr_icar,idani_anon
0,2016-03-08,HOL,CHE000000000559
1,2016-02-27,HOL,CHE000000000561
2,2011-05-09,HOL,CHE000000000620
3,2014-06-23,HOL,CHE000000000781
4,2015-11-25,HOL,CHE000000001494
...,...,...,...
275766,2014-01-06,HOL,CHE000099998134
275767,2016-11-28,HOL,CHE000099998152
275768,2013-11-12,HOL,CHE000099998376
275769,2015-09-07,HOL,CHE000099999361


In [14]:
data_frames['hm_milkrecording']

Unnamed: 0,mlksmpl_date,milking_time_morning,milking_time_evening,lab_date,DIM,pruefmethode,melkmethode,milk_yield_24h,fat_24h,protein_24h,lactose_24h,scc_24h,urea_24h,AR_PESEE_PESCODEALPAGE,idani_anon,idhrd_anon,milk_yield_msrmt_type,fat_protein_24h_ratio
0,2018-10-10,50000.0,163000.0,2018-10-11,34.0,AT4,Normal,34.2,36.5,24.9,49.8,21.0,2.4,0.0,CHE000000000561,CHE000000095710,2,1.465863
1,2018-11-13,50000.0,163000.0,2018-11-15,68.0,AT4,Normal,34.1,34.9,29.2,50.3,80.0,2.2,0.0,CHE000000000561,CHE000000095710,3,1.195205
2,2018-12-18,50000.0,163000.0,2018-12-19,103.0,AT4,Normal,30.9,37.2,26.7,48.3,132.0,1.6,0.0,CHE000000000561,CHE000000095710,2,1.393258
3,2019-01-21,50000.0,163000.0,2019-01-23,137.0,AT4,Normal,36.3,36.8,31.1,50.0,106.0,2.1,0.0,CHE000000000561,CHE000000095710,3,1.183280
4,2019-02-23,50000.0,163000.0,2019-02-26,170.0,AT4,Normal,35.3,34.2,29.6,48.6,133.0,1.5,0.0,CHE000000000561,CHE000000095710,2,1.155405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5116724,2018-08-06,55000.0,172000.0,2018-08-08,265.0,AT4,Normal,27.9,43.5,32.7,44.1,1004.0,3.3,0.0,CHE000099999926,CHE000000031281,2,1.330275
5116725,2018-09-07,55000.0,171000.0,2018-09-10,297.0,AT4,Normal,30.7,58.8,38.4,44.6,517.0,3.2,0.0,CHE000099999926,CHE000000031281,3,1.531250
5116726,2018-10-10,53000.0,171000.0,2018-10-11,330.0,AT4,Normal,25.3,52.2,40.6,43.5,235.0,2.9,0.0,CHE000099999926,CHE000000031281,2,1.285714
5116727,2018-11-12,53000.0,165000.0,2018-11-14,363.0,AT4,Normal,19.8,52.8,43.3,43.6,166.0,3.2,0.0,CHE000099999926,CHE000000031281,3,1.219400


In [15]:
data_frames['hm_ebv']

Unnamed: 0,base,label,idani_anon,ekg,epr,fkg,fpr,mkg,per,scs
0,HO20,A,CHE000000000559,-23.0,-0.02,-31.0,-0.07,-636.0,93.0,102.0
1,HO20,CH,CHE000000000561,18.0,-0.23,26.0,-0.24,1173.0,93.0,89.0
2,HO20,A,CHE000000000620,5.0,0.11,-1.0,0.05,-147.0,99.0,94.0
3,HO20,CH,CHE000000000781,0.0,-0.07,-2.0,-0.11,179.0,95.0,106.0
4,HO20,CH,CHE000000001494,3.0,0.11,-4.0,0.04,-179.0,93.0,111.0
...,...,...,...,...,...,...,...,...,...,...
271769,HO20,CH,CHE000099998134,6.0,0.11,18.0,0.26,-82.0,89.0,90.0
271770,HO20,CH,CHE000099998152,8.0,-0.03,3.0,-0.11,311.0,89.0,110.0
271771,HO20,CH,CHE000099998376,8.0,0.16,46.0,0.64,-167.0,104.0,99.0
271772,HO20,CH,CHE000099999361,10.0,-0.02,12.0,-0.03,365.0,103.0,111.0


In [16]:
data_frames['hm_health']

Unnamed: 0,hecode_ASR,healthevent_date,idani_anon,idhrd_anon
0,10.7.1.,2018-11-19,CHE000000005877,CHE000000079291
1,2.1.1.,2018-12-20,CHE000000005877,CHE000000079291
2,10.4.,2019-09-06,CHE000000005877,CHE000000079291
3,3.5.,2020-03-14,CHE000000005877,CHE000000079291
4,6.1.,2014-06-02,CHE000000006772,CHE000000055108
...,...,...,...,...
53730,2.1.2.,2019-06-15,CHE000099990201,CHE000000088759
53731,2.5.,2018-01-30,CHE000099990433,CHE000000059284
53732,2.1.1.,2020-03-18,CHE000099992995,CHE000000014814
53733,9.3.99.,2013-11-25,CHE000099998376,CHE000000086408


## Functions to combine hm_NSAIET with other datasets

In [17]:
def combine_nsaeit_with_milkrecording_single_cow(df_nsaiet, df_milkrec, columns_both='idani_anon',
                                                 max_days_since_milkrec=60):
    """
    Function combining the dataframes hm_NSAIET and hm_milkrecording for a single cow ID.
    Every insemination is paired with the previous milk recording, i.e. the last row of
    df_milkrec (in row order) whose sampling date lies strictly before the insemination date.
    Inseminations without such a recording, or whose recording is more than
    max_days_since_milkrec days old, are dropped.

    :param df_nsaiet: Subset of the NSAIET Pandas dataframe containing the relevant entries for a single cow
    :param df_milkrec: Subset of the milkrecording Pandas dataframe containing the relevant entries for a single cow
    :param columns_both: Identical columns in both dataframes (used as merge key)
    :param max_days_since_milkrec: Maximum age of the milk recording in whole days (default 60)
    :return: Merged dataframe with a fresh 0..n-1 index, or None if no insemination could be paired
    """

    # Loop-invariant: pull the sampling dates out of the dataframe only once
    milkrec_dates = df_milkrec['mlksmpl_date'].values
    one_day = np.timedelta64(1, 'D')

    combined_df = []
    for idx_parity, parity_date in enumerate(df_nsaiet['nsaiet_date'].values):
        # Positions of the milk recordings taken before the insemination date
        earlier = np.flatnonzero(milkrec_dates < parity_date)

        # Throw away values, where there is no milk recording date before the insemination date
        if earlier.size == 0:
            continue

        idx_milkrec = earlier[-1]

        # Throw away the value, if the last milk recording is too old (whole days elapsed)
        delta = (parity_date - milkrec_dates[idx_milkrec]) // one_day
        if delta > max_days_since_milkrec:
            continue

        df = pd.merge(df_nsaiet.iloc[[idx_parity]],
                      df_milkrec.iloc[[idx_milkrec]],
                      how="inner", on=columns_both)
        combined_df.append(df)

    # Return None for an empty result
    if len(combined_df) == 0:
        return None

    return pd.concat(combined_df).reset_index(drop=True)


def combine_nsaeit_with_lactation_single_cow(df_nsaiet, df_lactation, columns_both='idani_anon'):
    """
    Attach lactation information to every insemination of a single cow.
    Each row of df_nsaiet is merged with the first row of df_lactation that has the same parity.

    :param df_nsaiet: Subset of the NSAIET Pandas dataframe containing the relevant entries for a single cow
    :param df_lactation: Subset of the lactation Pandas dataframe containing the relevant entries for a single cow
    :param columns_both: Identical columns in both dataframes (used as merge key)
    :return: Merged dataframe with a fresh 0..n-1 index
    """

    lactation_parities = df_lactation['parity'].values

    def merge_row(row_pos, parity):
        # First lactation entry carrying this parity (IndexError if there is none)
        first_match = np.flatnonzero(lactation_parities == parity)[0]
        return pd.merge(df_nsaiet.iloc[[row_pos]],
                        df_lactation.iloc[[first_match]],
                        how="inner", on=columns_both)

    merged_rows = [merge_row(row_pos, parity)
                   for row_pos, parity in enumerate(df_nsaiet['parity'].values)]

    return pd.concat(merged_rows).reset_index(drop=True)


def combine_with_health_single_cow(df_nsaiet, df_health, threshold_health_date=45):
    """
    Count, for every insemination of a single cow, the health events recorded shortly before it.
    A health event is counted if it happened on the insemination day or at most
    threshold_health_date whole days before it. Missing (NaT) dates are never counted.

    :param df_nsaiet: Subset of the NSAIET Pandas dataframe containing all the relevant entries for a single cow
    :param df_health: Subset of the health Pandas dataframe containing all the relevant entries for a
                      single cow; may be empty or None, in which case all counts are zero
    :param threshold_health_date: Number of days before the insemination that a health event is considered to be relevant
    :return: Float array with the number of relevant health events, one entry per row of df_nsaiet
    """

    # builtin float replaces np.float, which was removed in NumPy 1.24 (same float64 dtype)
    healthevents = np.zeros(df_nsaiet.shape[0], dtype=float)

    if df_health is None or df_health.shape[0] == 0:
        return healthevents

    health_dates = df_health['healthevent_date'].values
    one_day = np.timedelta64(1, 'D')

    for idx_parity, parity_date in enumerate(df_nsaiet['nsaiet_date'].values):
        # Whole days from each health event to the insemination (negative = event came later).
        # True division yields NaN for NaT dates, which fails both comparisons below;
        # integer floor division would turn NaT into 0 and miscount it as a same-day event.
        deltas = np.floor((parity_date - health_dates) / one_day)
        healthevents[idx_parity] = np.sum((deltas <= threshold_health_date) & (deltas >= 0))

    return healthevents


def combine_with_other_datasets_single_cow(df_nsaiet, df_other, columns_both='idani_anon'):
    """
    Attach per-cow information from another table to every insemination of a single cow.
    Each row of df_nsaiet is merged with the first row of df_other.

    :param df_nsaiet: Subset of the (already combined) NSAIET Pandas dataframe containing all the
                      relevant entries for a single cow
    :param df_other: Subset of the other Pandas dataframe containing all the relevant entries for a single cow
    :param columns_both: Identical columns in both dataframes (used as merge key)
    :return: Merged dataframe with a fresh 0..n-1 index
    """

    # One merge per insemination row; only the number of rows is needed here
    n_inseminations = len(df_nsaiet['nsaiet_date'].values)

    merged_rows = [
        pd.merge(df_nsaiet.iloc[[row_pos]], df_other.iloc[[0]], how="inner", on=columns_both)
        for row_pos in range(n_inseminations)
    ]

    return pd.concat(merged_rows).reset_index(drop=True)


def return_single_cow_subset(df, cow_id, identifier_col='idani_anon'):
    """
    Slice out the rows of one cow from a dataframe using a binary search on the ID column.
    The result is only meaningful if the ID column is sorted.

    :param df: Pandas dataframe
    :param cow_id: ID of the cow, whose data is to be selected
    :param identifier_col: Name of the column containing the ID
    :return: Subset of Pandas dataframe (empty if the ID does not occur)
    """

    id_column = df[identifier_col]
    # Insertion points before and after the run of rows equal to cow_id
    first, past_last = (id_column.searchsorted(cow_id, side) for side in ('left', 'right'))
    return df[first:past_last]

## Merge all dataframes

In [18]:
# Table names come from datetime_cols, which is defined once in the data loading
# section. The dict is no longer re-declared here: a second copy could silently drift
# away from the one that was actually used to read the files.
fnames = list(datetime_cols.keys())

# hm_NSAIET and hm_milkrecording are combined first and on their own, so they are
# excluded from the tables that get merged in afterwards
fnames_wo_nsaiet_milkrec = [fname for fname in fnames if fname not in ('hm_NSAIET', 'hm_milkrecording')]

In [19]:
df_merged = []        # per-cow frames of the bunch currently being filled
merged_bunches = []   # one concatenated frame per completed bunch

counter = 0
bunchsize = 500

for cow_id in tqdm.tqdm(unique_cow_ids[:nsamples]):
    # Merge NSAIET & milkrecording
    single_cow_nsaiet = return_single_cow_subset(data_frames['hm_NSAIET'], cow_id)
    single_cow_milkrecording = return_single_cow_subset(data_frames['hm_milkrecording'], cow_id)

    dfcomb = combine_nsaeit_with_milkrecording_single_cow(single_cow_nsaiet, single_cow_milkrecording)

    # Skip Cow ID in case of an empty dataframe
    if dfcomb is None:
        continue

    for fname in fnames_wo_nsaiet_milkrec:
        single_cow = return_single_cow_subset(data_frames[fname], cow_id)
        # Merge key = columns present in both frames (sorted for a deterministic order)
        col_both = sorted(set(dfcomb.columns) & set(data_frames[fname].columns))

        if fname == 'hm_lactation':
            # Combine with hm_lactation (matched on parity)
            dfcomb = combine_nsaeit_with_lactation_single_cow(dfcomb, single_cow, col_both)
        elif fname in ('hm_ebv', 'hm_animal'):
            # Combine with hm_ebv / hm_animal (first row of the other table for this cow)
            dfcomb = combine_with_other_datasets_single_cow(dfcomb, single_cow, col_both)
        elif fname == 'hm_health':
            # Combine with hm_health (count of recent health events)
            dfcomb['healthevents'] = combine_with_health_single_cow(dfcomb, single_cow, threshold_health_date=45)

    # Append dataframes
    df_merged.append(dfcomb)

    counter += 1

    # Collapse the per-cow frames into one frame after every N=bunchsize cows, so the
    # list of small frames does not grow without bound
    if counter % bunchsize == 0:
        merged_bunches.append(pd.concat(df_merged))
        df_merged = []

# Flush the last, partially filled bunch
if len(df_merged) != 0:
    merged_bunches.append(pd.concat(df_merged))
    df_merged = []

if len(merged_bunches) == 0:
    raise ValueError('No cow produced any merged rows; there is nothing to concatenate.')

# A single concat over all bunches replaces the repeated DataFrame.append calls
# (DataFrame.append was removed in pandas 2.0); the index is renumbered 0..n-1
df_merged_all = pd.concat(merged_bunches, ignore_index=True)

  0%|          | 341/180005 [00:52<7:37:04,  6.55it/s] 


KeyboardInterrupt: 

## Add columns with age, days since calving and days since the last milk sample (dropping the datetime columns is currently disabled)

In [20]:
# Add day-count columns: whole days elapsed between a reference date and the insemination date
reference_date_cols = {
    'age': 'birth_date',
    'days_since_calving': 'calving_date',
    'days_since_mlksample': 'mlksmpl_date',
}
one_day = np.timedelta64(1, 'D')
for new_col, reference_col in reference_date_cols.items():
    elapsed = df_merged_all['nsaiet_date'] - df_merged_all[reference_col]
    df_merged_all[new_col] = elapsed.values // one_day

# Disabled: dropping the datetime columns once the deltas have been computed
# # Drop columns with datetimes, since only the deltas are relevant
# columns_to_drop = ['nsaiet_date', 'birth_date', 'calving_date', 'mlksmpl_date', 'lab_date', 'birth_date']
# df_merged_all = df_merged_all.drop(labels=columns_to_drop, axis=1)

NameError: name 'df_merged_all' is not defined

In [21]:
df_merged_all

NameError: name 'df_merged_all' is not defined

## Save data, create a dataset structure file for the AutoML platform, and tar the dataset

Save dataset

In [22]:
# Output folder = prepared-data directory + dataset folder name (plain concatenation)
folderpath = fpath_prepared_data_dir + foldername_prepared_data
# Write the merged table as data.csv, without the dataframe index column
df_merged_all.to_csv(folderpath+'data.csv', index=False)

NameError: name 'df_merged_all' is not defined

Save dataset structure file (DSSF), which is needed for the AutoML analysis

In [23]:
# Content of DSSF: a JSON list holding one table component (named after the dataset
# folder without its trailing slash, pointing at data.csv) and a version entry
dssf_string = ['[',
               '    {',
               '        "name": "{}",'.format(foldername_prepared_data[:-1]),
               '        "path": "data.csv",',
               '        "type": "table"',
               '    },',
               '    {',
               '        "_version": "0.1"',
               '    }',
               ']'
              ]

print('\n'.join(dssf_string))

# Write DSSF into the dataset folder. The path is rebuilt here so that this cell does
# not depend on a `folderpath` left behind by another cell, and the `with` block closes
# the file even if writing fails.
folderpath = fpath_prepared_data_dir + foldername_prepared_data
with open(folderpath+'dataset_structure.json', 'w') as text_file:
    text_file.write('\n'.join(dssf_string))

[
    {
        "name": "ai_basic_all",
        "path": "data.csv",
        "type": "table"
    },
    {
        "_version": "0.1"
    }
]


Create a tarball of all the contents

In [24]:
!tar -cf {fpath_prepared_data_dir}{foldername_prepared_data[:-1]}.tar -C {fpath_prepared_data_dir} {foldername_prepared_data[:-1]}

## Prepare a file for a regression task (predict optimal date for insemination)

In [25]:
foldername_prepared_data = 'ai_basic_all_predict_date/'

In [26]:
# Create the output folder (including missing parents); no error if it already exists.
# Done in Python rather than via `!mkdir -p` so it also works without a POSIX shell
# and is not affected by spaces or shell metacharacters in the path.
os.makedirs(fpath_prepared_data_dir + foldername_prepared_data, exist_ok=True)

In [27]:
# Keep only the successful inseminations (rows whose parity label is not 0)
mask = df_merged_all['parity_labels'].values == 0
df_merged_subset = df_merged_all[~mask].reset_index(drop=True)

# Store the subset as data.csv in the folder of the regression dataset
folderpath = fpath_prepared_data_dir + foldername_prepared_data
csv_path = folderpath+'data.csv'
df_merged_subset.to_csv(csv_path, index=False)

NameError: name 'df_merged_all' is not defined

In [28]:
# Content of DSSF: a JSON list holding one table component (named after the dataset
# folder without its trailing slash, pointing at data.csv) and a version entry
dssf_string = ['[',
               '    {',
               '        "name": "{}",'.format(foldername_prepared_data[:-1]),
               '        "path": "data.csv",',
               '        "type": "table"',
               '    },',
               '    {',
               '        "_version": "0.1"',
               '    }',
               ']'
              ]

print('\n'.join(dssf_string))

# Write DSSF into the dataset folder. The path is rebuilt from the current folder name:
# relying on a `folderpath` set in an earlier cell would, if that cell failed before
# reassigning it, write this file into the folder of the previous dataset and overwrite
# its dataset_structure.json. The `with` block closes the file even if writing fails.
folderpath = fpath_prepared_data_dir + foldername_prepared_data
with open(folderpath+'dataset_structure.json', 'w') as text_file:
    text_file.write('\n'.join(dssf_string))

[
    {
        "name": "ai_basic_all_predict_date",
        "path": "data.csv",
        "type": "table"
    },
    {
        "_version": "0.1"
    }
]


In [29]:
!tar -cf {fpath_prepared_data_dir}{foldername_prepared_data[:-1]}.tar -C {fpath_prepared_data_dir} {foldername_prepared_data[:-1]}