# Data Frame system
### Stefan-Cristian Bordei

# Code

In [1]:
import numbers


class MySeries:
    """
    One-dimensional labelled collection backed by a plain dict.

    The keys of ``s_dict`` are the index labels and the values are the
    stored items; ``name`` is an optional label for the whole series.
    """

    def __init__(self, data, index=None, name=None):
        """
        Build a series from a list or a dict.

        :param data: list, dict
                    Contains data stored in series. A dict is taken as a
                    label -> value mapping (shallow-copied); a list is
                    paired with ``index`` or with the default labels
                    0..len(data)-1.
        :param index: array-like, optional
                    Labels for list data; must be as long as ``data``.
                    Not used for labelling when ``data`` is a dict.
        :param name: optional
                    Name of the series.
        :raises AssertionError: if ``data`` is None, is neither a list
                    nor a dict, or ``index`` has a different length.
        """
        # Raised explicitly (instead of ``assert``) so the checks are not
        # stripped under ``python -O``; the exception type is unchanged.
        if data is None:
            raise AssertionError("please provide data")
        if not isinstance(data, (list, dict)):
            raise AssertionError("data must be list or dict")
        if index is not None and len(data) != len(index):
            raise AssertionError("index len must be equal to data len")

        self.name = name

        if isinstance(data, dict):
            # Copy so later changes to the caller's dict do not leak in.
            self.s_dict = dict(data)
        else:
            labels = range(len(data)) if index is None else index
            self.s_dict = dict(zip(labels, data))

    def _require_numeric(self, operation):
        """Raise AssertionError unless every stored value is a number."""
        if not all(isinstance(val, numbers.Number)
                   for val in self.s_dict.values()):
            raise AssertionError(
                f"{operation} must be performed on numeric values")

    def min(self):
        """
        Return the smallest stored value.

        :raises AssertionError: if any value is not numeric.
        """
        self._require_numeric("min")
        return min(self.s_dict.values())

    def max(self):
        """
        Return the largest stored value.

        :raises AssertionError: if any value is not numeric.
        """
        self._require_numeric("max")
        return max(self.s_dict.values())

    def mean(self):
        """
        Return the arithmetic mean of the stored values.

        :raises AssertionError: if any value is not numeric.
        :raises ZeroDivisionError: if the series is empty.
        """
        self._require_numeric("mean")
        return sum(self.s_dict.values()) / len(self.s_dict)

    def print(self):
        """
        Print one line per entry: the label, then the value (or each item
        of a list value), every field followed by a tab.
        """
        for label, value in self.s_dict.items():
            fields = value if isinstance(value, list) else [value]
            print("".join(f"{field}\t" for field in [label, *fields]))

    def item_at_ind(self, i):
        """
        Return the value stored under label ``i``.

        :raises KeyError: if ``i`` is not a label of the series.
        """
        return self.s_dict[i]

    def size(self):
        """Return the number of entries in the series."""
        return len(self.s_dict)

    def sort_by_index(self, index_list):
        """
        Reorder the values by position, keeping the labels where they are.

        The new value at position ``p`` is the old value at position
        ``index_list[p]``; the label at position ``p`` is unchanged.

        :param index_list: list of int
                    Positions (0-based) into the current values.
        """
        old_values = list(self.s_dict.values())
        reordered = [old_values[pos] for pos in index_list]
        # zip stops at the shorter sequence, exactly like the label/value
        # pairing did before.
        self.s_dict = dict(zip(self.s_dict.keys(), reordered))

In [2]:
class MyDataFrame:
    """
    Two-dimensional tabular data stored column-wise.

    ``columns`` is a list of MySeries (one per column, named after the
    column) and ``index`` holds the row labels.
    """

    def __init__(self, data, index=None):
        """
            Dataframe two-dimensional tabular data.
        :param data: dict
                    Contains data stored in DataFrame
                    (column name -> column values).
        :param index: array-like, optional
                    Contains row label list; defaults to 0..rows-1.
        :raises AssertionError: if ``data`` is not a dict, ``index`` does
                    not match the column lengths, or the columns differ
                    in length.
        """
        # Raised explicitly (instead of ``assert``) so the checks are not
        # stripped under ``python -O``; the exception type is unchanged.
        if not isinstance(data, dict):
            raise AssertionError("data must be a dict")
        if index is not None and any(len(index) != len(val)
                                     for val in data.values()):
            raise AssertionError(
                "the number of rows (index) must match the number items in data values")

        # DataFrame columns with name/label (MySeries list)
        self.columns = [MySeries(values, name=label)
                        for label, values in data.items()]

        # An empty frame has zero rows instead of failing on columns[0].
        row_count = self.columns[0].size() if self.columns else 0
        if any(col.size() != row_count for col in self.columns):
            raise AssertionError("data values must have same len")

        # DataFrame row labels
        self.index = index if index is not None else range(row_count)

    def _print_statistic(self, method_name):
        """Print ``name<TAB>value`` for every column supporting the statistic."""
        for series in self.columns:
            try:
                value = getattr(series, method_name)()
            except AssertionError:
                # skip invalid column types
                continue
            print(f"{series.name:<12}\t{value:>12.2f}")

    def min(self):
        """Print the minimum of every numeric column (others are skipped)."""
        self._print_statistic("min")

    def max(self):
        """Print the maximum of every numeric column (others are skipped)."""
        self._print_statistic("max")

    def mean(self):
        """Print the mean of every numeric column (others are skipped)."""
        self._print_statistic("mean")

    def print(self):
        """
        Print the table: a header line with the column names, then one
        line per row with the row label followed by the row's cells.
        """
        header = "".join("{:^17} ".format(series.name)
                         for series in self.columns)
        print('\t' + header)
        for position, label in enumerate(self.index):
            # Cells are looked up by row position in each column series.
            cells = "".join(f"{series.item_at_ind(position):<19}"
                            for series in self.columns)
            print(f"{label:<13}{cells}")

    def sort_values(self, column_name):
        """
        Sort the rows in place by the values of one column, ascending.

        Prints a message and leaves the frame untouched when no column
        is called ``column_name``.

        :param column_name: name of the column to sort by.
        """
        target = next((col for col in self.columns
                       if col.name == column_name), None)
        # Test for the column itself: an existing column with zero rows
        # also yields an empty order and must not be reported as missing.
        if target is None:
            print(f"Column {column_name} not found. No sorting will be performed.")
            return

        # Row positions in ascending order of the column's values
        # (sorted() is stable, so ties keep their current order).
        values = list(target.s_dict.values())
        order = sorted(range(len(values)), key=values.__getitem__)

        # sort all column series and the df index by the same order
        for col in self.columns:
            col.sort_by_index(order)
        self.index = [self.index[position] for position in order]

# Running the code on the provided samples

## MySeries

### Sample 1

In [3]:
# List data with explicit labels: each label is paired with the value
# at the same position.
ms3 = MySeries([1,2,1], index = ['a','b','c'])
ms3.s_dict

{'a': 1, 'b': 2, 'c': 1}

In [4]:
# No index given: labels default to 0..len(data)-1.
ms4 = MySeries([4,5,6])
ms4.s_dict

{0: 4, 1: 5, 2: 6}

In [5]:
# Dict data: the keys are used as the labels.
d = {'b': 1, 'a': 0, 'c': 2}
s2 = MySeries(d)
s2.s_dict

{'b': 1, 'a': 0, 'c': 2}

### Sample 2 (not provided; checks min, max and mean on ms3)

In [6]:
# Smallest value in ms3 (requires all values to be numeric).
ms3.min()

1

In [7]:
# Largest value in ms3 (requires all values to be numeric).
ms3.max()

2

In [8]:
# Sum of the values divided by their count.
ms3.mean()

1.3333333333333333

### Sample 3

In [9]:
# Re-create ms3 and print one tab-separated "label value" line per entry.
ms3 = MySeries([1,2,1], index = ['a','b','c'])
ms3.print()

a	1	
b	2	
c	1	


In [10]:
# Look up a value by its label.
ms3.item_at_ind('c')

1

## MyDataFrame

### Sample 1

In [11]:
# Weather sample: one list of values per column, one entry per row.
d = {
    'Sun Hours': [4.5, 4.0, 5.1, 5],
    'Max Temp': [19.6, 19.1, 19.6, 20.0],
    'Min Temp': [12.7, 12.5, 13.3, 12.1],
    'Rain (mm)': [82, 109, 65, 76],
    'Rain Days': [13, 20, 10, 9.7],
}
# df1 keeps the default row labels; df2 is given explicit ones.
df1 = MyDataFrame(d)
df2 = MyDataFrame(d, index=['Clare', 'Galway', 'Dublin', 'Wexford'])

In [12]:
# Header line with the column names, then one line per row label.
df2.print()

	    Sun Hours         Max Temp          Min Temp          Rain (mm)         Rain Days     
Clare        4.5                19.6               12.7               82                 13                 
Galway       4.0                19.1               12.5               109                20                 
Dublin       5.1                19.6               13.3               65                 10                 
Wexford      5                  20.0               12.1               76                 9.7                


### Sample 2

In [13]:
# Sort the rows in place by ascending 'Rain (mm)'; the row labels are
# reordered together with the column values.
df2.sort_values('Rain (mm)')
df2.print()

	    Sun Hours         Max Temp          Min Temp          Rain (mm)         Rain Days     
Dublin       5.1                19.6               13.3               65                 10                 
Wexford      5                  20.0               12.1               76                 9.7                
Clare        4.5                19.6               12.7               82                 13                 
Galway       4.0                19.1               12.5               109                20                 


In [14]:
# Per-column mean, printed with two decimals.
df2.mean()

Sun Hours   	        4.65
Max Temp    	       19.58
Min Temp    	       12.65
Rain (mm)   	       83.00
Rain Days   	       13.18


In [15]:
# Per-column maximum, printed with two decimals.
df2.max()

Sun Hours   	        5.10
Max Temp    	       20.00
Min Temp    	       13.30
Rain (mm)   	      109.00
Rain Days   	       20.00


In [16]:
# Per-column minimum, printed with two decimals.
df2.min()

Sun Hours   	        4.00
Max Temp    	       19.10
Min Temp    	       12.10
Rain (mm)   	       65.00
Rain Days   	        9.70


### Sample 3 (films)

In [17]:
# Films sample: four numeric columns and one text column.
films = {
    'Rank': [112, 62, 41, 172, 230, 176],
    'Release Year': [1973, 1980, 1960, 2015, 1976, 1996],
    'IMDB Rating': [8.3, 8.4, 8.5, 8.1, 8.1, 8.1],
    'Time (minutes)': [129, 146, 109, 118, 120, 98],
    'Main Genre': ['Comedy', 'Horror', 'Horror', 'Drama', 'Drama', 'Drama'],
}
# Row labels, in the same order as the column values above.
f_names = ['Sting', 'Shining', 'Psycho', 'Room', 'Rocky', 'Fargo']

films_df = MyDataFrame(films, index=f_names)
films_df.print()

	      Rank          Release Year       IMDB Rating     Time (minutes)      Main Genre     
Sting        112                1973               8.3                129                Comedy             
Shining      62                 1980               8.4                146                Horror             
Psycho       41                 1960               8.5                109                Horror             
Room         172                2015               8.1                118                Drama              
Rocky        230                1976               8.1                120                Drama              
Fargo        176                1996               8.1                98                 Drama              


In [18]:
# Per-column mean; 'Main Genre' holds strings and is skipped.
films_df.mean()

Rank        	      132.17
Release Year	     1983.33
IMDB Rating 	        8.25
Time (minutes)	      120.00


In [19]:
# Sort the rows in place by ascending 'Release Year', then show the result.
films_df.sort_values('Release Year')
films_df.print()

	      Rank          Release Year       IMDB Rating     Time (minutes)      Main Genre     
Psycho       41                 1960               8.5                109                Horror             
Sting        112                1973               8.3                129                Comedy             
Rocky        230                1976               8.1                120                Drama              
Shining      62                 1980               8.4                146                Horror             
Fargo        176                1996               8.1                98                 Drama              
Room         172                2015               8.1                118                Drama              


### min, max (not included in the provided samples)

In [20]:
# Per-column minimum; the non-numeric 'Main Genre' column is skipped.
films_df.min()

Rank        	       41.00
Release Year	     1960.00
IMDB Rating 	        8.10
Time (minutes)	       98.00


In [21]:
# Per-column maximum; the non-numeric 'Main Genre' column is skipped.
films_df.max()

Rank        	      230.00
Release Year	     2015.00
IMDB Rating 	        8.50
Time (minutes)	      146.00
