In [None]:
import logging
from io import BytesIO

import boto3 
from botocore import UNSIGNED
from botocore.client import Config
import pandas as pd

from copulas.multivariate.GaussianCopula import GaussianCopula

# Logger named after the current module; none of the cells shown here emit through it.
LOGGER = logging.getLogger(__name__)

In [None]:
# Configuration for the demo dataset.
# Name of the S3 bucket that get_dataset() opens.
BUCKET = 'copulas-data-store'
# AWS region identifier (the same value get_resources() hands to boto3).
REGION_NAME = 'us-east-1'
# Object key, inside BUCKET, of the CSV file that get_dataset() downloads.
FILE_NAME = 'glass_1_train.csv'

In [None]:
def get_resources():
    """Create the boto3 S3 resource used to download the demo data.

    The client is configured with ``signature_version=UNSIGNED``, so requests
    are sent without AWS credentials. The region comes from the notebook-level
    ``REGION_NAME`` constant, so editing the configuration cell is enough to
    point the notebook at another region.

    Returns:
        The object returned by ``boto3.resource('s3', ...)``.
    """
    return boto3.resource(
        's3',
        # Read the shared constant instead of repeating the literal here.
        region_name=REGION_NAME,
        config=Config(signature_version=UNSIGNED)
    )

In [None]:
def clean_dataset(data):
    """Return a copy of ``data`` without its integer-valued columns.

    A column is considered integer-valued when every entry is equal to itself
    after being cast with ``astype(int)`` (for example ``1.0`` or ``3``, but
    not ``0.5``). Those columns are removed and the remaining columns are
    relabelled ``0 .. k-1`` in their original order.

    The input frame is left untouched; a new frame is returned. Columns are
    dropped by label, so the function works for named columns as well as for
    the default integer labels produced by ``pd.read_csv(..., header=None)``.

    Args:
        data (pandas.DataFrame): table whose columns can all be cast with
            ``astype(int)``.

    Returns:
        pandas.DataFrame: new frame holding only the columns that contain at
        least one non-integer value, with columns renamed to a 0-based range.

    Raises:
        Whatever ``Series.astype(int)`` raises for a column that cannot be
        cast (e.g. missing values or non-numeric text) is propagated as-is.
    """
    integer_valued = [
        column
        for column in data.columns
        if (data[column].astype(int) == data[column]).all()
    ]

    # Drop by label (not by position) and build a new frame so the caller's
    # object is not modified and re-running the cell stays idempotent.
    cleaned = data.drop(integer_valued, axis=1)
    cleaned.columns = range(cleaned.shape[1])

    return cleaned

In [None]:
def get_dataset():
    """Download the demo CSV from S3 and return it after cleaning.

    Looks up the object ``FILE_NAME`` in bucket ``BUCKET`` through the
    resource built by ``get_resources()``, reads its whole body into memory,
    parses it as a header-less CSV and passes the result through
    ``clean_dataset``.

    Returns:
        The value returned by ``clean_dataset`` for the parsed table.
    """
    s3_object = get_resources().Bucket(BUCKET).Object(key=FILE_NAME)

    # The full payload is buffered in memory before pandas parses it.
    payload = s3_object.get()['Body'].read()
    raw_table = pd.read_csv(BytesIO(payload), header=None)

    return clean_dataset(raw_table)

In [None]:
# Fetch the cleaned training table, then fit a fresh Gaussian copula to it.
data = get_dataset()
copula = GaussianCopula()
copula.fit(data)

# Print the fitted model's text representation.
print(copula)

In [None]:
# Ask the fitted copula for 10 samples and display them as the cell output
# (presumably 10 synthetic rows — confirm against the copulas API).
copula.sample(10)