In [None]:
# default_exp data

# Data

> Class representing an entire table (a whole ARFF or CSV file).


A `Data` object consists of a table name, attributes (column names and their possible values), and instances (single records).

**Example:**

"@relation weather.symbolic
<br> @attribute outlook {sunny, overcast, rainy}
 
 @data
<br> sunny[0.5];rainy[0.3];overcast[0.2]"
 
Data (table) named "weather.symbolic" with one attribute (column) "outlook" (with three possible values) and one instance (record) "sunny[0.5];rainy[0.3];overcast[0.2]".

In [None]:
# export
from io import TextIOWrapper, StringIO
import traceback
import re
import pandas as pd
from pandas import DataFrame
from typing import List, Set

from pyuid3.reading import Reading
from pyuid3.instance import Instance
from pyuid3.att_stats import AttStats
from pyuid3.attribute import Attribute

In [None]:
# export
class ParseException(ValueError):
    """Raised when an attribute or instance definition cannot be parsed.

    Defined in this module because the name is raised by ``Data`` but is not
    imported from anywhere here.  It derives from ``ValueError`` so callers
    that catch ``ValueError`` keep working.
    """


class Data:
    """In-memory representation of a whole (u)ARFF or (u)CSV table.

    A table has a relation name, an ordered list of attributes (columns) and
    a list of instances (rows).  The last attribute is treated as the class
    attribute (see ``get_class_attribute``).
    """

    # Domain marker that declares an attribute as numerical.
    REAL_DOMAIN = '@REAL'
    # A CSV column with more distinct values than this is treated as numerical.
    MAX_NOMINAL_VALUES = 10

    def __init__(self, name: str = None, attributes: List['Attribute'] = None, instances: List['Instance'] = None):
        """Create a table.

        Args:
            name: relation (table) name.
            attributes: column definitions; an omitted value becomes an empty list.
            instances: rows; an omitted value becomes an empty list.
        """
        self.name = name
        # Fall back to fresh lists so len(), get_instances() and friends work on Data().
        self.attributes = attributes if attributes is not None else []
        self.instances = instances if instances is not None else []

    def __len__(self):
        """Return the number of instances in the table."""
        return len(self.instances)

    def filter_nominal_attribute_value(self, at: 'Attribute', value: str) -> 'Data':
        """Return a new table with the instances whose most probable value of ``at`` equals ``value``.

        The attribute list and the readings list of every kept instance are
        shallow-copied; this table is not modified.
        """
        att_name = at.get_name()
        kept = [
            Instance(inst.get_readings().copy())
            for inst in self.instances
            if inst.get_reading_for_attribute(att_name).get_most_probable().get_name() == value
        ]
        return Data(self.name, self.attributes.copy(), kept)

    def filter_numeric_attribute_value(self, at: 'Attribute', value: str, less_than: bool) -> 'Data':
        """Return a new table split on a numeric threshold.

        Args:
            at: attribute whose most probable value is compared.
            value: threshold; converted with ``float``.
            less_than: when true keep instances strictly below the threshold,
                otherwise keep instances greater than or equal to it.
        """
        att_name = at.get_name()
        threshold = float(value)  # loop-invariant, convert once
        kept = []
        for inst in self.instances:
            most_probable = inst.get_reading_for_attribute(att_name).get_most_probable()
            current = float(most_probable.get_name())
            keep = current < threshold if less_than else current >= threshold
            if keep:
                kept.append(Instance(inst.get_readings().copy()))
        return Data(self.name, self.attributes.copy(), kept)

    def get_attribute_of_name(self, att_name: str) -> 'Attribute':
        """Return the first attribute named ``att_name``, or ``None`` when there is none."""
        return next((at for at in self.attributes if at.get_name() == att_name), None)

    def _arff_header(self) -> str:
        """Build the '@relation' / '@attribute' / '@data' header shared by all exporters."""
        lines = ['@relation ' + self.name]
        lines.extend(at.to_arff() for at in self.attributes)
        lines.append('@data')
        return '\n'.join(lines) + '\n'

    def to_arff_most_probable(self) -> str:
        """Export the table as ARFF text, writing only the most probable value of every reading."""
        rows = [
            ','.join(r.get_most_probable().get_name() for r in inst.get_readings())
            for inst in self.instances
        ]
        return self._arff_header() + ''.join(row + '\n' for row in rows)

    def to_arff_skip_instance(self, epsilon: float) -> str:
        """Export as ARFF text, leaving out instances that contain an uncertain reading.

        An instance is written only when the confidence of the most probable
        value of every one of its readings is strictly greater than ``epsilon``.
        """
        rows = []
        for inst in self.instances:
            best_values = [r.get_most_probable() for r in inst.get_readings()]
            # One low-confidence reading disqualifies the whole row; never emit a partial row.
            if all(v.get_confidence() > epsilon for v in best_values):
                rows.append(','.join(v.get_name() for v in best_values))
        return self._arff_header() + ''.join(row + '\n' for row in rows)

    def to_arff_skip_value(self, epsilon: float) -> str:
        """Export as ARFF text, replacing uncertain values with '?'.

        A value is kept when the confidence of the most probable value is
        strictly greater than ``epsilon``; otherwise '?' is written instead.
        """
        rows = []
        for inst in self.instances:
            cells = []
            for r in inst.get_readings():
                best = r.get_most_probable()
                cells.append(best.get_name() if best.get_confidence() > epsilon else '?')
            rows.append(','.join(cells))
        return self._arff_header() + ''.join(row + '\n' for row in rows)

    def to_uarff(self) -> str:
        """Export the table as uARFF text using each instance's own ``to_arff`` rendering."""
        return self._arff_header() + ''.join(inst.to_arff() + '\n' for inst in self.instances)

    def calculate_statistics(self, att: 'Attribute') -> 'AttStats':
        """Return ``AttStats.get_statistics`` computed for ``att`` over this table."""
        return AttStats.get_statistics(att, self)

    @staticmethod
    def __read_uarff_from_buffer(br: (TextIOWrapper, StringIO)) -> 'Data':
        """Read a (u)ARFF document from an open text buffer.

        The first line must contain '@relation'.  '@attribute' lines are
        collected until the '@data' line; every following line is parsed as
        one instance.  Blank lines and '%' comment lines are ignored.
        """
        atts = []
        insts = []
        name = br.readline().split('@relation')[1].strip()

        # Header section: attribute declarations up to the '@data' marker.
        for line in br:
            stripped = line.strip()
            if not stripped or stripped.startswith('%'):
                continue
            att_split = stripped.split('@attribute')
            if len(att_split) > 1:
                atts.append(Data.parse_attribute(att_split[1].strip()))
            elif stripped == '@data':
                break

        # Data section: one instance per line.
        for line in br:
            stripped = line.strip()
            # A trailing blank line must not be parsed as an (invalid) instance.
            if not stripped or stripped.startswith('%'):
                continue
            insts.append(Data.parse_instances(atts, stripped))

        tmp_data = Data(name, atts, insts)
        tmp_data.update_attribute_domains()
        return tmp_data

    @staticmethod
    def __read_ucsv_from_dataframe(df: DataFrame, name: str) -> 'Data':
        """Build a table from a DataFrame whose cells hold plain or uncertain values.

        For every column the '[probability]' suffixes are removed and the
        ';'-separated alternatives are collected into the domain.  A column
        with more than ``MAX_NOMINAL_VALUES`` distinct values becomes a
        numerical attribute, any other column a nominal one.
        """
        probability = re.compile(r'\[[0-9.]*]')
        atts = []
        for col in df.columns:
            values = set()
            for rec in set(df[col]):
                cleaned = probability.sub('', str(rec))
                # Every record may list several alternatives, not only a lone one.
                values.update(v.replace("'", '').strip() for v in cleaned.split(';'))
            # Build the attribute directly: a textual round-trip through
            # parse_attribute would cut a column name at its first space.
            col_name = str(col).strip()
            if len(values) > Data.MAX_NOMINAL_VALUES:
                atts.append(Attribute(col_name, set(), Attribute.TYPE_NUMERICAL))
            else:
                atts.append(Attribute(col_name, values, Attribute.TYPE_NOMINAL))

        insts = []
        # Read the cells straight from the frame.  Going through df.to_string()
        # and collapsing spaces breaks values that contain spaces and formats
        # floats differently from the str() form used for the domain above.
        for row in df.itertuples(index=False, name=None):
            line = ','.join(str(cell).strip() for cell in row)
            insts.append(Data.parse_instances(atts, line))

        tmp_data = Data(name, atts, insts)
        tmp_data.update_attribute_domains()
        return tmp_data

    def update_attribute_domains(self):
        """Set the domain of every numerical attribute to the values observed in the instances."""
        for att in self.get_attributes():
            if att.get_type() == Attribute.TYPE_NUMERICAL:
                att.set_domain(self.__get_domain_from_data(att, self.instances))

    def __get_domain_from_data(self, a: 'Attribute', instances: List['Instance']) -> Set[str]:
        """Collect the most probable value of attribute ``a`` from every instance."""
        att_name = a.get_name()
        return {
            inst.get_reading_for_attribute(att_name).get_most_probable().get_name()
            for inst in instances
        }

    @staticmethod
    def parse_ucsv(filename: str) -> 'Data':
        """Read a (u)CSV file; the table is named after the file name up to '.csv'."""
        import os  # local import: only needed for the portable file-name handling below

        df = pd.read_csv(filename)
        # basename understands the platform's path separators, not only '/'.
        base_name = os.path.basename(os.fspath(filename))
        name = base_name.split('.csv')[0]
        return Data.__read_ucsv_from_dataframe(df, name)

    @staticmethod
    def _class_id_given(class_id) -> bool:
        """Tell whether a class id was supplied; index 0 counts as supplied."""
        return class_id is not None and class_id != ''

    @staticmethod
    def __parse(temp_data: 'Data', class_id: (int, str)) -> 'Data':
        """Move the class attribute, and its reading in every instance, to the last position.

        Args:
            temp_data: table to reorder in place.
            class_id: name (str) or index (int) of the class attribute.

        Raises:
            ValueError: when no attribute has the given name.
            TypeError: when ``class_id`` is neither a str nor an int.
        """
        if isinstance(class_id, str):
            class_att = temp_data.get_attribute_of_name(class_id)
            if class_att is None:
                raise ValueError('Unknown class attribute: ' + class_id)
            class_index = temp_data.attributes.index(class_att)
        elif isinstance(class_id, int):
            class_index = class_id
            class_att = temp_data.attributes[class_index]
        else:
            raise TypeError('class_id must be an attribute name or index, got ' + type(class_id).__name__)

        del temp_data.attributes[class_index]
        temp_data.attributes.append(class_att)
        # Apply the same move to the readings of every instance.
        for inst in temp_data.instances:
            class_label = inst.get_reading_for_attribute(class_att.get_name())
            readings = inst.get_readings()
            del readings[class_index]
            readings.append(class_label)
            inst.set_readings(readings)
        return temp_data

    @staticmethod
    def parse_uarff_from_string(string: str, class_id: (int, str) = None) -> 'Data':
        """Parse (u)ARFF text.

        Args:
            string: document content.
            class_id: optional name or index of the class attribute, which is
                then moved to the last position.

        Returns:
            The parsed table, or ``None`` when the buffer cannot be created
            (the traceback is printed).
        """
        try:
            br = StringIO(string)
        except Exception:
            traceback.print_exc()
            return None
        with br:  # closed even when parsing raises
            temp_data = Data.__read_uarff_from_buffer(br)
        if not Data._class_id_given(class_id):
            return temp_data
        return Data.__parse(temp_data, class_id)

    @staticmethod
    def parse_uarff(filename: str, class_id: (int, str) = None) -> 'Data':
        """Parse a (u)ARFF file.

        Args:
            filename: path of the file to read.
            class_id: optional name or index of the class attribute, which is
                then moved to the last position.

        Returns:
            The parsed table, or ``None`` when the file cannot be opened
            (the traceback is printed).
        """
        try:
            br = open(filename)
        except Exception:
            traceback.print_exc()
            return None
        with br:  # closed even when parsing raises
            temp_data = Data.__read_uarff_from_buffer(br)
        if not Data._class_id_given(class_id):
            return temp_data
        return Data.__parse(temp_data, class_id)

    @staticmethod
    def parse_instances(base_atts: List['Attribute'], inst_def: str) -> 'Instance':
        """Parse one comma-separated data line into an instance.

        Raises:
            ParseException: when the number of values differs from the number
                of attributes.
        """
        readings_defs = inst_def.split(',')
        if len(readings_defs) != len(base_atts):
            raise ParseException('Missing attribute definition, or value in line ' + inst_def)
        inst = Instance()
        for reading_def, att in zip(readings_defs, base_atts):
            inst.add_reading(Reading.parse_reading(att, reading_def))
        return inst

    @staticmethod
    def parse_attribute(att_def: str) -> 'Attribute':
        """Parse the text that follows '@attribute', e.g. 'outlook {sunny, overcast, rainy}'.

        The name is the first whitespace-delimited token; the rest is a
        comma-separated domain with optional braces and single quotes.  When
        the ``REAL_DOMAIN`` marker is met the attribute becomes numerical and
        the remaining values are ignored.

        Raises:
            ParseException: when the definition has no domain part.
        """
        parts = att_def.split(None, 1)  # any whitespace (space or tab) separates name and domain
        if len(parts) < 2:
            raise ParseException('Missing domain in attribute definition ' + att_def)
        name, domain_def = parts

        att_type = Attribute.TYPE_NOMINAL
        domain = set()
        for value in re.sub(r'[{}]', '', domain_def).split(','):
            if value.strip() == Data.REAL_DOMAIN:
                att_type = Attribute.TYPE_NUMERICAL
                break
            domain.add(value.replace("'", '').strip())
        return Attribute(name, domain, att_type)

    def get_instances(self) -> List['Instance']:
        """Return a shallow copy of the instance list."""
        return self.instances.copy()

    def get_attributes(self) -> List['Attribute']:
        """Return a shallow copy of the attribute list."""
        return self.attributes.copy()

    def get_name(self) -> str:
        """Return the relation (table) name."""
        return self.name

    def get_class_attribute(self) -> 'Attribute':
        """Return the class attribute, i.e. the last attribute of the table."""
        return self.attributes[-1]


## Examples

### CSV

In [None]:
# Read the BMI CSV resource and print the resulting table as uARFF text.
data = Data.parse_ucsv("../resources/bmi.csv")
print(data.to_uarff())

@relation bmi
@attribute gender {female,male}
@attribute height {short,medium,tall}
@attribute weight {heavy,light,normal}
@attribute index {2,5,3,4,1,0}
@data
male[1];female[0.0],medium[1];short[0.0];tall[0.0],normal[1];heavy[0.0];light[0.0],4[1];2[0.0];5[0.0];0[0.0];1[0.0];3[0.0]

male[1];female[0.0],tall[1];short[0.0];medium[0.0],light[1];heavy[0.0];normal[0.0],2[1];5[0.0];0[0.0];1[0.0];3[0.0];4[0.0]

female[1];male[0.0],tall[1];short[0.0];medium[0.0],heavy[1];light[0.0];normal[0.0],4[1];2[0.0];5[0.0];0[0.0];1[0.0];3[0.0]

female[1];male[0.0],tall[1];short[0.0];medium[0.0],normal[1];heavy[0.0];light[0.0],3[1];2[0.0];5[0.0];0[0.0];1[0.0];4[0.0]

male[1];female[0.0],short[1];medium[0.0];tall[0.0],light[1];heavy[0.0];normal[0.0],3[1];2[0.0];5[0.0];0[0.0];1[0.0];4[0.0]

male[1];female[0.0],tall[1];short[0.0];medium[0.0],normal[1];heavy[0.0];light[0.0],3[1];2[0.0];5[0.0];0[0.0];1[0.0];4[0.0]

male[1];female[0.0],short[1];medium[0.0];tall[0.0],light[1];heavy[0.0];normal[0.0],5[1];2[0.0];0

### uCSV

In [None]:
# Read the uncertain BMI CSV resource and print the resulting table as uARFF text.
data = Data.parse_ucsv("../resources/bmi_uncertain.csv")
print(data.to_uarff())

@relation bmi_uncertain
@attribute gender {female,male}
@attribute height {short,medium,tall}
@attribute weight {heavy,light,normal}
@attribute index {2,5,3,4,1,0}
@data
male[1];female[0.0],short[0.2880635477276248];medium[0.6323525785080119];tall[0.07958387376436318],light[0.06075346632519163];normal[0.6950714126773211];heavy[0.24417512099748714],4[1];2[0.0];5[0.0];0[0.0];1[0.0];3[0.0]

male[1];female[0.0],short[0.5427324030065072];medium[0.29870344547393346];tall[0.1585641515195594],light[0.3049133205059182];normal[0.13444820788031822];heavy[0.5606384716137637],2[1];5[0.0];0[0.0];1[0.0];3[0.0];4[0.0]

female[1];male[0.0],short[0.46319097170492274];medium[0.37880074867207736];tall[0.1580082796229999],light[0.2867593922579523];normal[0.42122911103381444];heavy[0.29201149670823323],4[1];2[0.0];5[0.0];0[0.0];1[0.0];3[0.0]

female[1];male[0.0],short[0.033018199445915136];medium[0.8575228555533172];tall[0.10945894500076765],light[0.3779681943089925];normal[0.3582109701899303];heavy[0.26382

### ARFF

In [None]:
# Read the numeric weather ARFF resource and print the resulting table as uARFF text.
data = Data.parse_uarff("../resources/weather.numeric.arff")
print(data.to_uarff())

@relation weather
@attribute outlook {sunny,overcast,rainy}
@attribute temperature {@REAL}
@attribute humidity {@REAL}
@attribute windy {FALSE,TRUE}
@attribute play {no,yes}
@data
sunny[1];overcast[0.0];rainy[0.0],150[1];@REAL[0.0],86[1];@REAL[0.0],TRUE[1];FALSE[0.0],no[1];yes[0.0]

sunny[1];overcast[0.0];rainy[0.0],100[1];@REAL[0.0],90[1];@REAL[0.0],TRUE[1];FALSE[0.0],no[1];yes[0.0]

overcast[1];sunny[0.0];rainy[0.0],83[1];@REAL[0.0],86[1];@REAL[0.0],FALSE[1];TRUE[0.0],yes[1];no[0.0]

rainy[1];sunny[0.0];overcast[0.0],70[1];@REAL[0.0],96[1];@REAL[0.0],FALSE[1];TRUE[0.0],yes[1];no[0.0]

rainy[1];sunny[0.0];overcast[0.0],68[1];@REAL[0.0],80[1];@REAL[0.0],FALSE[1];TRUE[0.0],yes[1];no[0.0]

rainy[1];sunny[0.0];overcast[0.0],15[1];@REAL[0.0],96[1];@REAL[0.0],TRUE[1];FALSE[0.0],no[1];yes[0.0]

sunny[1];overcast[0.0];rainy[0.0],64[1];@REAL[0.0],65[1];@REAL[0.0],TRUE[1];FALSE[0.0],yes[1];no[0.0]

sunny[1];overcast[0.0];rainy[0.0],12[1];@REAL[0.0],95[1];@REAL[0.0],TRUE[1];FALSE[0.0],no[1];yes[

### uARFF

In [None]:
# Read the uncertain machine uARFF resource and print the resulting table as uARFF text.
data = Data.parse_uarff("../resources/machine.nominal.uncertain.arff")
print(data.to_uarff())

@relation machine.symbolic
@attribute speed {high,normal,low}
@attribute temperature {high,normal,low}
@attribute load {high,normal}
@attribute oil {normal,low}
@attribute crash_risk {no,yes}
@data
high[0.5];low[0.3];normal[0.2],high[1];normal[0.0];low[0.0],high[1];normal[0.0],low[1];normal[0.0],yes[1];no[0.0]

high[1];normal[0.0];low[0.0],high[1];normal[0.0];low[0.0],high[1];normal[0.0],low[1];normal[0.0],yes[1];no[0.0]

normal[0.2];high[0.7];low[0.1],high[1];normal[0.0];low[0.0],high[1];normal[0.0],normal[1];low[0.0],no[1];yes[0.0]

low[1];high[0.0];normal[0.0],normal[1];high[0.0];low[0.0],high[1];normal[0.0],normal[1];low[0.0],no[1];yes[0.0]

low[1];high[0.0];normal[0.0],low[1];high[0.0];normal[0.0],normal[1];high[0.0],normal[0.3];low[0.7],no[1];yes[0.0]

low[1];high[0.0];normal[0.0],low[1];high[0.0];normal[0.0],normal[1];high[0.0],low[1];normal[0.0],yes[1];no[0.0]

normal[1];high[0.0];low[0.0],low[1];high[0.0];normal[0.0],normal[1];high[0.0],low[1];normal[0.0],no[1];yes[0.0]

high[