In [1]:
import csv
from pprint import pprint
from math import log, exp

In [2]:
# Path of the labelled CSV used to estimate the model's probabilities.
TRAINSET = "TrainsetTugas1ML.csv"
# Path of the CSV holding the records to classify.
TESTSET = "TestsetTugas1ML.csv"
# Path of the file the predicted labels are written to.
TEBAKAN = "TebakanTugas1ML.csv"
# Name of the column that holds the class label to predict.
OUTPUT_ATTR = "income"

In [3]:
def read_dict_from_csv(csv_file):
    """Load a CSV file into a list of dicts, one per data row.

    The first row of the file supplies the keys. Whitespace that directly
    follows a delimiter is ignored (``skipinitialspace=True``).

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        list[dict]: One plain ``dict`` per data row, mapping column name to
        the value read for that column.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; without it, universal-newline translation
    # rewrites "\r\n" sequences embedded in quoted fields.
    with open(csv_file, newline='') as cf:
        csv_reader = csv.DictReader(cf, skipinitialspace=True)
        return [dict(row) for row in csv_reader]

In [4]:
def write_list_to_csv(csv_file, data):
    """Write every string in ``data`` to ``csv_file``, one per line.

    Args:
        csv_file: Path of the file to create or overwrite.
        data: Iterable of strings; each one is written followed by a
            newline character.
    """
    with open(csv_file, mode='w') as out_file:
        out_file.writelines(entry + '\n' for entry in data)

In [5]:
# Load the training records; each CSV row becomes a dict keyed by column name.
train_data = read_dict_from_csv(TRAINSET)
# Preview the first ten records.
train_data[:10]

[{'id': '4776',
  'age': 'young',
  'workclass': 'Private',
  'education': 'Some-college',
  'marital-status': 'Married-civ-spouse',
  'occupation': 'Prof-specialty',
  'relationship': 'Husband',
  'hours-per-week': 'normal',
  'income': '>50K'},
 {'id': '8173',
  'age': 'adult',
  'workclass': 'Private',
  'education': 'Bachelors',
  'marital-status': 'Never-married',
  'occupation': 'Prof-specialty',
  'relationship': 'Not-in-family',
  'hours-per-week': 'normal',
  'income': '>50K'},
 {'id': '23423',
  'age': 'young',
  'workclass': 'Private',
  'education': 'Some-college',
  'marital-status': 'Married-civ-spouse',
  'occupation': 'Prof-specialty',
  'relationship': 'Husband',
  'hours-per-week': 'normal',
  'income': '>50K'},
 {'id': '1818',
  'age': 'adult',
  'workclass': 'Private',
  'education': 'HS-grad',
  'marital-status': 'Married-civ-spouse',
  'occupation': 'Craft-repair',
  'relationship': 'Husband',
  'hours-per-week': 'normal',
  'income': '<=50K'},
 {'id': '14894',
  

In [6]:
# Map every feature column to the set of distinct values it takes in the
# training data. The "id" column and the label column are not features.
attributes = {
    name: {row[name] for row in train_data}
    for name in train_data[0]
    if name != "id" and name != OUTPUT_ATTR
}
attributes

{'age': {'adult', 'old', 'young'},
 'workclass': {'Local-gov', 'Private', 'Self-emp-not-inc'},
 'education': {'Bachelors', 'HS-grad', 'Some-college'},
 'marital-status': {'Divorced', 'Married-civ-spouse', 'Never-married'},
 'occupation': {'Craft-repair', 'Exec-managerial', 'Prof-specialty'},
 'relationship': {'Husband', 'Not-in-family', 'Own-child'},
 'hours-per-week': {'low', 'many', 'normal'}}

In [7]:
# Distinct labels found in the training data's output column.
output_classes = {row[OUTPUT_ATTR] for row in train_data}
output_classes

{'<=50K', '>50K'}

In [8]:
# Number of training records carrying each label.
output_frequency = {
    label: sum(row[OUTPUT_ATTR] == label for row in train_data)
    for label in output_classes
}
output_frequency

{'<=50K': 40, '>50K': 120}

In [9]:
# Prior of each label: its count divided by the number of training records.
# output_frequency is keyed by exactly the labels in output_classes.
output_probability = {
    label: count / len(train_data)
    for label, count in output_frequency.items()
}
output_probability

{'<=50K': 0.25, '>50K': 0.75}

In [10]:
# Conditional likelihood table: class_probability[attr][cls][out] is the
# fraction of training records labelled `out` whose `attr` value is `cls`.
# No smoothing is applied, so a (cls, out) pairing that never occurs in the
# training data gets probability 0.0.
class_probability = {
    name: {
        value: {
            label: sum(
                row[name] == value and row[OUTPUT_ATTR] == label
                for row in train_data
            ) / output_frequency[label]
            for label in output_classes
        }
        for value in attributes[name]
    }
    for name in attributes
}
pprint(class_probability)

{'age': {'adult': {'<=50K': 0.475, '>50K': 0.44166666666666665},
         'old': {'<=50K': 0.025, '>50K': 0.008333333333333333},
         'young': {'<=50K': 0.5, '>50K': 0.55}},
 'education': {'Bachelors': {'<=50K': 0.175, '>50K': 0.5416666666666666},
               'HS-grad': {'<=50K': 0.425, '>50K': 0.23333333333333334},
               'Some-college': {'<=50K': 0.4, '>50K': 0.225}},
 'hours-per-week': {'low': {'<=50K': 0.125, '>50K': 0.025},
                    'many': {'<=50K': 0.05, '>50K': 0.008333333333333333},
                    'normal': {'<=50K': 0.825, '>50K': 0.9666666666666667}},
 'marital-status': {'Divorced': {'<=50K': 0.175, '>50K': 0.041666666666666664},
                    'Married-civ-spouse': {'<=50K': 0.475, '>50K': 0.9},
                    'Never-married': {'<=50K': 0.35,
                                      '>50K': 0.058333333333333334}},
 'occupation': {'Craft-repair': {'<=50K': 0.525, '>50K': 0.26666666666666666},
                'Exec-managerial': {'<=50K': 

In [11]:
# Load the records to classify; each CSV row becomes a dict keyed by column name.
test_data = read_dict_from_csv(TESTSET)
# Preview the first ten records.
test_data[:10]

[{'id': '26027',
  'age': 'young',
  'workclass': 'Private',
  'education': 'HS-grad',
  'marital-status': 'Never-married',
  'occupation': 'Craft-repair',
  'relationship': 'Not-in-family',
  'hours-per-week': 'normal'},
 {'id': '26314',
  'age': 'young',
  'workclass': 'Private',
  'education': 'Bachelors',
  'marital-status': 'Divorced',
  'occupation': 'Exec-managerial',
  'relationship': 'Not-in-family',
  'hours-per-week': 'normal'},
 {'id': '31405',
  'age': 'young',
  'workclass': 'Private',
  'education': 'Bachelors',
  'marital-status': 'Married-civ-spouse',
  'occupation': 'Prof-specialty',
  'relationship': 'Husband',
  'hours-per-week': 'normal'},
 {'id': '14736',
  'age': 'adult',
  'workclass': 'Private',
  'education': 'Some-college',
  'marital-status': 'Divorced',
  'occupation': 'Prof-specialty',
  'relationship': 'Not-in-family',
  'hours-per-week': 'normal'},
 {'id': '27217',
  'age': 'young',
  'workclass': 'Private',
  'education': 'HS-grad',
  'marital-status': 

In [12]:
# Naive Bayes prediction: for every test record, score each label as
#   P(label) * product over attributes of P(attribute value | label)
# and store the highest-scoring label on the record under OUTPUT_ATTR.
# A test value that never appears in the training data still raises
# KeyError from the class_probability lookup.
for d in test_data:
    # Per-label list of the likelihoods of this record's attribute values.
    likelihoods = {
        out: [class_probability[attr][d[attr]][out] for attr in attributes]
        for out in output_classes
    }
    out_prob = {
        # The product is computed as exp(sum(log p)). math.log raises
        # ValueError on 0, so a zero likelihood short-circuits to a zero score.
        out: (exp(sum(map(log, probs))) if all(p > 0 for p in probs) else 0.0)
        * output_probability[out]
        for out, probs in likelihoods.items()
    }
    # On a tie (e.g. every score is 0.0), max() keeps the first label iterated.
    d[OUTPUT_ATTR] = max(out_prob, key=out_prob.get)
output_list = [row[OUTPUT_ATTR] for row in test_data]
print(output_list)

['<=50K', '<=50K', '>50K', '<=50K', '>50K', '>50K', '<=50K', '<=50K', '>50K', '>50K', '>50K', '>50K', '<=50K', '>50K', '>50K', '>50K', '<=50K', '>50K', '<=50K', '>50K', '>50K', '>50K', '>50K', '>50K', '>50K', '>50K', '>50K', '>50K', '>50K', '<=50K', '<=50K', '<=50K', '>50K', '>50K', '<=50K', '>50K', '<=50K', '>50K', '>50K', '>50K']


In [13]:
# Save the predicted labels to the output file, one label per line.
write_list_to_csv(TEBAKAN, output_list)