In [1]:
import json
import csv
import pandas as pd
import numpy as np

In [2]:
# Reading a dataset containing indices and values.
# The file is only read here, so it is opened read-only ('r'); 'r+' would also
# demand write permission and fail on read-only files.

with open('plain_values.csv', 'r') as f:
    values_data = pd.read_csv(f, index_col = False)

In [3]:
# Viewing the dataframe: the bare last expression of the cell is what gets displayed
values_data

Unnamed: 0,Index,Value
0,1,20.4624
1,2,5.6
2,3,6.781
3,4,18.44
4,4,12.44
5,5,2.5
6,6,2.2
7,7,3.0
8,8,2.0
9,9,16.6


In [4]:
# Re-read the same file as raw text lines so every value is kept exactly as it
# was written (digits after the decimal point included).
# Opened read-only ('r'): nothing is written back, and 'r+' would needlessly
# require write permission.
with open('plain_values.csv', 'r') as f:
    values_data = f.readlines()

print('values_data:' + str(values_data))

# Split every non-blank line into its comma-separated fields. Blank lines
# (e.g. an extra newline at the end of the file) are skipped, because a
# one-field row would make the table ragged and break the 2-D slicing below.
data = [line.strip().split(',') for line in values_data if line.strip()]

print()
print('data: ' + str(data))

data = np.asarray(data)

# The first row is the header; the remaining rows are the records.
header = data[0, :]
data = data[1:, :]

print()
print('header: ' + str(header))
print('data[0]: ' + str(data[0]))

values_data:['Index,Value\n', '1,20.4624\n', '2,5.6\n', '3,6.7810\n', '4,18.44\n', '4,12.44\n', '5,2.5\n', '6,2.2\n', '7,3.0\n', '8,2\n', '9,16.6\n', '10,7.8923\n']

data: [['Index', 'Value'], ['1', '20.4624'], ['2', '5.6'], ['3', '6.7810'], ['4', '18.44'], ['4', '12.44'], ['5', '2.5'], ['6', '2.2'], ['7', '3.0'], ['8', '2'], ['9', '16.6'], ['10', '7.8923']]

header: ['Index' 'Value']
data[0]: ['1' '20.4624']


In [5]:
def check_index(index_column, view = True):
    """Report whether every entry of an index column is unique.

    Parameters
    ----------
    index_column : array-like
        One-dimensional collection of hashable index values. NumPy arrays
        are converted with ``tolist``; any other iterable (list, tuple,
        pandas Series, ...) is materialised with ``list``.
    view : bool, optional
        When truthy, print the total and the unique number of indices.

    Returns
    -------
    bool
        True if no index value occurs more than once, False otherwise.
    """
    # np.ndarray.tolist(...) only accepts real ndarrays and raises TypeError
    # for plain lists, so branch on the input type instead.
    if isinstance(index_column, np.ndarray):
        list_index = index_column.tolist()
    else:
        list_index = list(index_column)

    set_index = set(list_index)

    if view:
        print('Number of indices: ' + str(len(list_index)))
        print('Number of unique indices: ' + str(len(set_index)))

    # Duplicates collapse in the set, so equal sizes mean all entries are unique.
    return len(list_index) == len(set_index)

def get_least_count(value_string):
    """Count the characters written after the decimal point of a value string.

    Parameters
    ----------
    value_string : str
        Textual form of a number, e.g. ``'6.7810'``.

    Returns
    -------
    int
        Length of the text between the first and second '.' (or the end of
        the string); 0 when the string contains no '.' at all.
    """
    pieces = value_string.split('.')

    # A single piece means there was no '.' to split on.
    return len(pieces[1]) if len(pieces) > 1 else 0
    
from statistics import mode

def get_column_LC(column):
    """Return the most common least count (decimal places) in a column.

    Parameters
    ----------
    column : numpy.ndarray
        Array of value strings; each one is measured with ``get_least_count``.

    Returns
    -------
    int
        The mode of the per-value decimal-place counts, as computed by
        ``statistics.mode``.
    """
    decimal_places = [
        get_least_count(entry) for entry in np.ndarray.tolist(column)
    ]

    return mode(decimal_places)

In [6]:
# Analyzing: column 0 holds the indices, column 1 the raw value strings

indices, values = data[:, 0], data[:, 1]

is_valid_index = check_index(indices)
least_count = get_column_LC(values)

print('Most Common Least Count: {} places of decimal'.format(least_count))

Number of indices: 11
Number of unique indices: 10
Most Common Least Count: 1 places of decimal


In [7]:
# Rounding to least_count places of decimal

def get_format_code(least_count):
    """Build a ``str.format`` template for a fixed number of decimal places.

    Parameters
    ----------
    least_count : int
        Number of digits to keep after the decimal point.

    Returns
    -------
    str
        A template such as ``'{:.1f}'`` for ``least_count == 1``.
    """
    # Doubled braces are literal braces; the single pair receives least_count.
    return "{{:.{}f}}".format(least_count)

def round_up(column, least_count):
    """Round every value of a column to a fixed number of decimal places.

    Despite the name, values are rounded with ``np.round`` rather than always
    upwards, and then rendered with exactly ``least_count`` decimals.

    Parameters
    ----------
    column : numpy.ndarray
        Array of value strings that can be converted with ``float``.
    least_count : int
        Number of decimal places to keep.

    Returns
    -------
    list of str
        The rounded values as fixed-width decimal strings, in input order.
    """
    # The template depends only on least_count, so build it once.
    template = get_format_code(least_count)

    return [
        template.format(np.round(float(entry), least_count))
        for entry in np.ndarray.tolist(column)
    ]

In [8]:
# Quick check: the format template produced for one decimal place
get_format_code(1)

'{:.1f}'

In [9]:
# Behavior of get_format_code, followed by rounding the whole value column
print('get_format_code(least_count): {}'.format(get_format_code(least_count)))

rounded_up = round_up(values, least_count)
print('rounded_up: {}'.format(rounded_up))

get_format_code(least_count): {:.1f}
rounded_up: ['20.5', '5.6', '6.8', '18.4', '12.4', '2.5', '2.2', '3.0', '2.0', '16.6', '7.9']


In [10]:
# Mapping to a range instead

def give_range(value, jump = 5, start = 0):
    """Map a value to the label of the fixed-width bucket that contains it.

    Buckets are ``[start + k * jump, start + (k + 1) * jump)`` for integer
    ``k``, so a value lying exactly on a boundary belongs to the bucket that
    starts there.

    Parameters
    ----------
    value : float
        The value to place.
    jump : int or float, optional
        Width of each bucket (must be non-zero).
    start : int or float, optional
        Lower edge of the bucket with ``k == 0``.

    Returns
    -------
    str
        Label of the form ``'<lower> - <upper>'``.
    """
    # Floor (not int(), which truncates toward zero) so values below `start`
    # fall into the bucket beneath them instead of being pulled into the
    # first bucket. Dividing before flooring also keeps fractional `jump`
    # widths correct.
    times = int(np.floor((value - start) / jump))

    lower = start + jump * times
    upper = lower + jump

    return str(lower) + ' - ' + str(upper)

def get_ranges(column, jump = 5, start = 0):
    """Map every value of a column to the label of its fixed-width bucket.

    Parameters
    ----------
    column : numpy.ndarray
        Array of value strings that can be converted with ``float``.
    jump : int or float, optional
        Width of each bucket, forwarded to ``give_range``.
    start : int or float, optional
        Lower edge of the first bucket, forwarded to ``give_range``.

    Returns
    -------
    list of str
        One ``'<lower> - <upper>'`` label per input value, in input order.
    """
    column = np.ndarray.tolist(column)
    ranges = []

    for value_string in column:
        value = float(value_string)

        # Forward the caller's bucket settings; calling give_range(value)
        # alone would silently fall back to its defaults.
        ranges.append(give_range(value, jump = jump, start = start))

    return ranges

ranged_up = get_ranges(values)
print('get_ranges(values)[0: 3]: {}'.format(ranged_up[:3]))

get_ranges(values)[0: 3]: ['20 - 25', '5 - 10', '5 - 10']


In [11]:
# Using cleaned data: wrap each cleaned column in a one-column frame and save it

rounded_frame = pd.DataFrame({'Value': rounded_up})
range_frame = pd.DataFrame({'Value': ranged_up})

for frame, destination in ((rounded_frame, 'rounded_values.csv'),
                           (range_frame, 'ranged_values.csv')):
    frame.to_csv(destination)

# Viewing range_frame
range_frame

Unnamed: 0,Value
0,20 - 25
1,5 - 10
2,5 - 10
3,15 - 20
4,10 - 15
5,0 - 5
6,0 - 5
7,0 - 5
8,0 - 5
9,15 - 20


In [12]:
# Inconsistencies
# The file is only read, so it is opened read-only ('r'); 'r+' would also
# demand write permission and fail on read-only files.

with open('lists_data.csv', 'r') as f:
    lists_data = pd.read_csv(f, index_col = False)

# Displaying lists_data
lists_data

Unnamed: 0,Index,Recordings
0,1,"[10.1, 5.2, 9.7]"
1,2,"[8.9, 7.4, 7.5]"
2,3,"[9.2, 6.3, 6.9]"
3,4,"[10.2, 6.7, 6.8, 8.0]"
4,5,"[6.6, 5.4, 9.5]"


In [13]:
indices = lists_data.Index
data = lists_data.Recordings.tolist()

view = True
lists = []

for raw_recording in data:
    if view:
        print(type(raw_recording))

    # Each cell holds the text of a list; decode it into a real Python list
    parsed = json.loads(raw_recording)

    if view:
        print(type(parsed))

    lists.append(parsed)

print('Extracted list of lists: ')
print(lists)

<class 'str'>
<class 'list'>
<class 'str'>
<class 'list'>
<class 'str'>
<class 'list'>
<class 'str'>
<class 'list'>
<class 'str'>
<class 'list'>
Extracted list of lists: 
[[10.1, 5.2, 9.7], [8.9, 7.4, 7.5], [9.2, 6.3, 6.9], [10.2, 6.7, 6.8, 8.0], [6.6, 5.4, 9.5]]


In [14]:
def get_lengths(lists):
    """Group the positions of the given sequences by their length.

    Parameters
    ----------
    lists : iterable of sized objects
        The record lists to inspect.

    Returns
    -------
    dict
        Maps each observed length to the list of positions (indices into
        ``lists``) whose entry has that length, in order of appearance.
    """
    positions_by_length = {}

    for position, entry in enumerate(lists):
        # setdefault creates the bucket the first time a length is seen.
        positions_by_length.setdefault(len(entry), []).append(position)

    return positions_by_length
        
def is_unique(length_map):
    """Tell whether all records share a single length.

    Parameters
    ----------
    length_map : dict
        Mapping keyed by record length, as returned by ``get_lengths``.

    Returns
    -------
    bool
        False when more than one distinct length is present, True otherwise
        (including for an empty mapping).
    """
    return not len(length_map.keys()) > 1

length_information = get_lengths(lists)
print('Length Consistency is Preserved: {}'.format(is_unique(length_information)))
print('View: {}'.format(length_information))

Length Consistency is Preserved: False
View: {3: [0, 1, 2, 4], 4: [3]}


In [15]:
# With JSONs on each line
# The file is only read, so it is opened read-only ('r'); 'r+' would also
# demand write permission and fail on read-only files.

with open('dictionary_data.txt', 'r') as f:
    raw_dict_data = f.readlines()

dict_data = []
view = False

for item in raw_dict_data:
    unit = item.strip()

    # A blank line (e.g. an extra newline at the end of the file) is not a
    # record, and json.loads('') would raise, so skip it.
    if not unit:
        continue

    if view:
        print(type(unit))

    dict_item = json.loads(unit)
    dict_data.append(dict_item)

    if view:
        print(type(dict_item))

# Displaying dict_data
print('dict_data: ')
print(dict_data)

dict_data: 
[{'name': 'Nikhil', 'temp': '97.4'}, {'name': 'Sayar', 'temp': '97.2'}, {'name': 'Himu', 'temperature': '97.6'}, {'name': 'Kripa', 'temp': '97.1'}, {'name': 'Ujwal', 'temp': '97.9'}]


In [16]:
# Given a set of expected keys
expected_keys = ['name', 'temp']

mismatch = []

# Records that deviate from the set.
# Keys are compared as sets: the order of keys in a JSON object carries no
# meaning, so a record listing the same keys in a different order must not be
# reported as a mismatch (list equality would report it).
expected_key_set = set(expected_keys)

for index, record in enumerate(dict_data):
    if set(record.keys()) != expected_key_set:
        mismatch.append(index)

print('Records in dict_data are consistent: ' + str(len(mismatch) == 0))
print('Indices of records that do not contain expected keys: ' + str(mismatch))

Records in dict_data are consistent: False
Indices of records that do not contain expected keys: [2]


In [17]:
# Logical Consistency
# The file is only read, so it is opened read-only ('r'); 'r+' would also
# demand write permission and fail on read-only files.

with open('triangle.csv', 'r') as f:
    raw_triangles = pd.read_csv(f)

raw_triangles

Unnamed: 0,a,b,c,isRightAngled?
0,5.1,6.4,8.2,Yes
1,2.5,2.5,2.5,No
2,2.3,4.5,5.1,No
3,1.3,1.3,0.6,Yes
4,4.6,5.2,6.9,Yes
5,2.5,2.5,5.1,No
6,3.0,4.0,-5.0,Yes
7,1.2,-1.2,3.2,No
8,3.8,1.4,4.0,Yes
9,4.7,6.5,8.0,Yes


In [18]:
def sanity(sides):
    """Check that every side is a strictly positive number.

    Parameters
    ----------
    sides : iterable
        Candidate side lengths; each must be convertible with ``float``.

    Returns
    -------
    bool
        True when every side converts to a float greater than zero, False as
        soon as one side is non-numeric, zero, negative or NaN.
    """
    for side in sides:
        try:
            length = float(side)
        except (TypeError, ValueError, OverflowError):
            # Not something float() can interpret as a number.
            return False

        # Explicit comparison instead of `assert`: assertions are stripped
        # when Python runs with -O, which would make every input pass.
        # `not length > 0` also rejects NaN.
        if not length > 0:
            return False

    return True

def is_triangle(sides):
    """Check the triangle inequality for three side lengths.

    Parameters
    ----------
    sides : sequence of numbers
        The side lengths, in any order. The sequence is not modified.

    Returns
    -------
    bool
        False when the longest side is at least as long as the sum of the two
        shortest sides (degenerate or impossible triangle), True otherwise.
    """
    # sorted() works on a copy, so the caller's list keeps its order (and
    # tuples are accepted too); list.sort() would reorder the argument.
    ordered = sorted(sides)

    return not ordered[-1] >= ordered[0] + ordered[1]

def isRightAngled(sides):
    """Label a triangle as right-angled or not, to one decimal place.

    The two shortest sides are treated as the legs; the hypotenuse they imply
    is compared with the longest side after rounding both to one decimal.

    Parameters
    ----------
    sides : sequence of numbers
        The three side lengths, in any order. The sequence is not modified.

    Returns
    -------
    str
        'Yes' when the rounded values match, 'No' otherwise.
    """
    # sorted() works on a copy, so the caller's list keeps its order;
    # list.sort() would reorder the argument.
    ordered = sorted(sides)

    squares = [np.power(side, 2) for side in ordered]

    expected_hypotenuse = np.round(np.sqrt(squares[0] + squares[1]), 1)

    if expected_hypotenuse == np.round(ordered[-1], 1):
        return 'Yes'

    return 'No'

In [19]:
records = raw_triangles.values.tolist()

sanity_violated = []
not_triangle = []
wrong_labels = []

# Each row is checked in stages; the first failing stage claims the row and
# later stages are skipped for it.
for row_number, row in enumerate(records):
    sides, label = row[0: 3], row[3]

    if not sanity(sides):
        sanity_violated.append(row_number)
    elif not is_triangle(sides):
        not_triangle.append(row_number)
    elif label != isRightAngled(sides):
        wrong_labels.append(row_number)

In [20]:
# Row positions collected by each consistency check
print('Sanity Checks Violated: {}'.format(sanity_violated))
print('Not a proper triangle: {}'.format(not_triangle))
print('Wrong label for isRightAngled?: {}'.format(wrong_labels))

Sanity Checks Violated: [6, 7]
Not a proper triangle: [5]
Wrong label for isRightAngled?: [2, 3]


In [21]:
# That's it