# The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2019 Semester 1
-----
## Project 1: Gaining Information about Naive Bayes
-----
###### Student Name(s):
###### Python version:
###### Submission deadline: 1pm, Fri 5 Apr 2019

This iPython notebook is a template which you may use for your Project 1 submission. (You are not required to use it; in particular, there is no need to use iPython if you do not like it.)

Marking will be applied on the five functions that are defined in this notebook, and to your responses to the questions at the end of this notebook.

You may change the prototypes of these functions, and you may write other functions, according to your requirements. We would appreciate it if the required functions were prominent/easy to find. 

### Implement the Naive Bayes classifier

In [381]:
# This function should open a data file in csv, and transform it into a usable format 
import csv
import numpy as np
def preprocess(data_path, split_ratio=0.2, seed=None):
    '''
    Load a csv data set, shuffle it and split it into train / validation parts.

    Every non-empty line of the file is one instance. The last column is used
    as the class label and all the columns before it as the attributes. Values
    are kept as the strings produced by csv.reader.

    @data_path: the path of the csv format file
    @split_ratio: the ratio of the val data (fraction of rows put in the validation set)
    @seed: if not None, np.random.seed(seed) is called before shuffling so the
           shuffle (and therefore the split) is reproducible; note that this
           re-seeds NumPy's global random state
    @return: train_X, train_y, val_X, val_y
    '''
    # read data from csv; csv.reader yields [] for a blank line, which would make
    # the row list ragged and break the 2-D slicing below, so blank lines are skipped
    with open(data_path, newline='') as csvfile:
        matrix = np.array([row for row in csv.reader(csvfile, delimiter=',') if row])
    print("The shape of the inputdata: ",matrix.shape)
    classes = matrix[:,-1]
    print("The calsses of the data: ",set(classes))

    # shuffle the rows in place; only touch the global seed when one was given
    if seed is not None:
        np.random.seed(seed)
    np.random.shuffle(matrix)

    # the first (1 - split_ratio) share of the shuffled rows is the train part
    split_at = int(matrix.shape[0] * (1 - split_ratio))
    train_rows, val_rows = matrix[:split_at, :], matrix[split_at:, :]
    print("train : val = ",str(len(train_rows)) + " : " + str(len(val_rows)))

    # split the attributes and the classes
    train_X, train_y = train_rows[:, :-1], train_rows[:, -1]
    val_X, val_y = val_rows[:, :-1], val_rows[:, -1]
    return train_X, train_y, val_X, val_y

In [382]:
# This function should build a supervised NB model
import collections
def train(train_X, train_y):
    '''
    Train a Naive Bayes classifier on nominal data.

    @train_X: train data, a 2-D array with one row per instance and one column per attribute
    @train_y: the classes of the train instances, a 1-D array with the same number of rows
    @return: cls_preds, cond_dict
             cls_preds maps each class to its relative frequency in train_y.
             cond_dict maps each class to a list with one dict per attribute;
             the dict maps every value the attribute takes anywhere in train_X
             (except the missing marker "?") to
             count(value within the class) / number of instances of the class.

    No smoothing is applied, so a value never seen with a class gets probability 0.

    NOTE(review): the denominator is the number of instances of the class,
    including those whose value for the attribute is "?". For attributes with
    missing values the probabilities of a class therefore sum to less than 1.
    Confirm this is the intended way of ignoring missing values.
    '''
    assert train_X.shape[0] == train_y.shape[0]
    n_attrs = train_X.shape[1]

    # group the instances by class (classes keep their order of first appearance,
    # so cls_preds and cond_dict are built in the same order)
    rows_by_cls = collections.defaultdict(list)
    for row, cls in zip(train_X, train_y):
        rows_by_cls[cls].append(row)

    # get the class prob matrix
    cls_preds = {cls: len(rows) / train_y.shape[0] for cls, rows in rows_by_cls.items()}

    # ignore the missing value: the known values of every attribute, without "?"
    attr_values = [set(train_X[:, i]) - {"?"} for i in range(n_attrs)]

    # get the condition prob matrix
    cond_dict = {}
    for cls, rows in rows_by_cls.items():
        rows = np.array(rows)
        n_rows = len(rows)
        cls_cond = []
        for i in range(n_attrs):
            # one counting pass per column instead of one scan per value
            counts = collections.Counter(rows[:, i])
            cls_cond.append({value: counts[value] / n_rows for value in attr_values[i]})
        cond_dict[cls] = cls_cond

    return cls_preds, cond_dict


In [383]:
# This function should predict the class for an instance or a set of instances, based on a trained model 
def predict(x, cls_preds, cond_dict):
    '''
    Predict the class for an instance.

    For every class in cond_dict the score is
        cls_preds[cls] * product over attributes i of cond_dict[cls][i][x[i]]
    A value that has no entry in the attribute's table (the missing marker "?"
    or a value not seen in training) contributes a factor of 1, i.e. the
    attribute is skipped. The class with the highest score is returned; on a
    tie the class that comes first in cond_dict wins.

    @x : an instance (sequence of attribute values)
    @cls_preds, @cond_dict : a trained model
    @return: the predicted class
    @raise ValueError: if cond_dict contains no class
    '''
    best_cls, best_score = None, None
    for cls, cls_cond in cond_dict.items():
        likelihood = 1
        for i, table in enumerate(cls_cond):
            likelihood = likelihood * table.get(x[i], 1)
        score = cls_preds[cls] * likelihood
        # keep the class together with its score, so the result does not depend
        # on cls_preds and cond_dict listing the classes in the same order
        if best_score is None or score > best_score:
            best_cls, best_score = cls, score
    if best_score is None:
        raise ValueError("cannot predict: the model contains no classes")
    return best_cls


In [384]:
# This function should evaluate a set of predictions, in a supervised context 
def evaluate(val_X, val_y, cls_preds, cond_dict):
    '''
    Compute the accuracy of a trained model on a labelled set of instances.

    @val_X: the instances to classify
    @val_y: the true classes, same length as val_X
    @cls_preds, @cond_dict : a trained model
    @return: the fraction of instances predicted correctly, rounded to 4
             decimals; 0.0 when there are no instances

    An instance whose prediction raises an exception is printed and counted as
    wrong, so that one bad row does not abort the whole evaluation.
    '''
    assert len(val_X) == len(val_y)
    total = len(val_X)
    if total == 0:
        # nothing to evaluate; avoid dividing by zero
        return 0.0
    count = 0
    for x, y in zip(val_X, val_y):
        try:
            hit = predict(x, cls_preds, cond_dict) == y
        except Exception:
            # `except Exception` (not a bare except) so that KeyboardInterrupt /
            # SystemExit still stop a long-running evaluation
            print(x)
            continue
        if hit:
            count += 1
    return round(count / total, 4)


In [396]:
# This function should calculate the Information Gain of an attribute or a set of attribute, with respect to the class
def entropy(probabilitys):
    '''
    Computing information entropy (in bits) using a probabilitys list.

    @probabilitys: the probabilities of a discrete distribution
    @return: sum of -p * log2(p) over the non-zero probabilities

    Zero probabilities are dropped before taking the logarithm
    (convention 0 * log2(0) = 0); otherwise 0 * -inf would turn the result into nan.
    '''
    probs = np.asarray(probabilitys, dtype=float)
    probs = probs[probs != 0]
    return sum(-1 * probs * np.log2(probs))

def cond_entropy(attr, train_X, train_y):
    '''
    Computing conditional entropy H(class | attribute) using a dataset.

    @attr: the column index of the attribute in train_X
    @train_X: 2-D array of attribute values, one row per instance
    @train_y: the classes of the instances
    @return: sum over the attribute's values v of P(v) * H(class | attribute = v)
    '''
    # P(v): relative frequency of every value of the attribute
    value_counts = collections.Counter(train_X[:, attr])
    n_rows = sum(value_counts.values())
    value_probs = {value: count / n_rows for value, count in value_counts.items()}
    prob_sum = sum(value_probs.values())
    # sanity check; the message is the offending sum
    assert abs(prob_sum - 1) < 0.00001, prob_sum

    # the class labels observed together with every value of the attribute
    labels_by_value = collections.defaultdict(list)
    for x, y in zip(train_X, train_y):
        labels_by_value[x[attr]].append(y)

    # weight the class entropy inside each value group by P(v); weight and
    # entropy are paired by the value itself, not by position in two dicts
    weighted = []
    for value, labels in labels_by_value.items():
        # Counter only holds classes that occur, so no zero probabilities arise
        label_counts = np.array(list(collections.Counter(labels).values()))
        weighted.append(value_probs[value] * entropy(label_counts / label_counts.sum()))
    return sum(weighted)

def info_gain(attributes, train_X, train_y):
    '''
    Calculate the Information Gain of a set of attributes with respect to the class.

    @attributes: the column indices (into train_X) of the attributes to score
    @train_X: 2-D array of attribute values, one row per instance
    @train_y: the classes of the instances
    @return: H_cls, gains
             H_cls is the entropy of the class distribution;
             gains is an array with H_cls - H(class | attribute) for every
             requested attribute, rounded to 6 decimals
    '''
    # class distribution (classes counted pairwise with the rows of train_X)
    label_counts = collections.Counter(label for _, label in zip(train_X, train_y))
    n_labels = len(train_y)
    cls_preds = {label: count / n_labels for label, count in label_counts.items()}

    # entropy of the class on its own
    H_cls = entropy(list(cls_preds.values()))

    # entropy of the class once each attribute is known
    H_cond = np.array([cond_entropy(attr, train_X, train_y) for attr in attributes])

    gains = np.around(H_cls - H_cond, 6)
    return H_cls, gains

#info_gain([0,1], cls_preds, cond_dict)

### Test on the 9 datasets

In [469]:
# Hold-out evaluation and per-attribute information gain for each of the nine data sets.
csv_files = ["anneal", "breast-cancer","car","cmc","hepatitis","hypothyroid","mushroom","nursery","primary-tumor"]
root = "2019S1-proj1-data/2019S1-proj1-data"
for file_name in csv_files:
    print("=======================     {}        ====================================".format(file_name))

    # fixed seed so that the split, and with it the reported numbers, is reproducible
    data_path = root + "/{}.csv".format(file_name)
    train_X, train_y, val_X, val_y = preprocess(data_path, seed=24)

    # fit on the train part, then score on the train part and on the held-out part
    cls_preds, cond_dict = train(train_X, train_y)
    train_eval_ratio = evaluate(train_X, train_y, cls_preds, cond_dict)
    print("The evaluation ratio on train set is : ", train_eval_ratio)
    eval_ratio = evaluate(val_X, val_y, cls_preds, cond_dict)
    print("The evaluation ratio on val set is : ", eval_ratio)

    # information gain is computed on the complete data set (train + validation)
    X = np.concatenate((train_X, val_X), axis=0)
    Y = np.concatenate((train_y, val_y))
    attrs = list(range(train_X.shape[1]))
    H_cls, H_attrs = info_gain(attrs, X, Y)
    print("class Entropy, sum(all attrs info_gain) = {} , {}".format(H_cls, sum(H_attrs)))
    print("Info_gain of {} :\n{}".format(attrs,  H_attrs))

The shape of the inputdata:  (898, 36)
The calsses of the data:  {'2', '1', 'U', '3', '5'}
train : val =  718 : 180
The evaluation ratio on train set is :  0.8983
The evaluation ratio on val set is :  0.8889
class Entropy, sum(all attrs info_gain) = 1.1898338562043977 , 3.087583999999999
Info_gain of [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34] :
[0.40909  0.       0.306052 0.051344 0.291082 0.147119 0.213723 0.292235
 0.126166 0.141074 0.032488 0.435178 0.038702 0.000438 0.039356 0.021775
 0.037997 0.036703 0.       0.117225 0.029754 0.027042 0.       0.015605
 0.137181 0.       0.022397 0.018242 0.       0.       0.       0.04324
 0.033038 0.019379 0.003959]
The shape of the inputdata:  (286, 10)
The calsses of the data:  {'recurrence-events', 'no-recurrence-events'}
train : val =  228 : 58
The evaluation ratio on train set is :  0.7719
The evaluation ratio on val set is :  0.7069
class Entropy, sum

Questions (you may respond in a cell or cells below):

1. The Naive Bayes classifiers can be seen to vary, in terms of their effectiveness on the given datasets (e.g. in terms of Accuracy). Consider the Information Gain of each attribute, relative to the class distribution — does this help to explain the classifiers’ behaviour? Identify any results that are particularly surprising, and explain why they occur.
2. The Information Gain can be seen as a kind of correlation coefficient between a pair of attributes: when the gain is low, the attribute values are uncorrelated; when the gain is high, the attribute values are correlated. In supervised ML, we typically calculate the Infomation Gain between a single attribute and the class, but it can be calculated for any pair of attributes. Using the pair-wise IG as a proxy for attribute interdependence, in which cases are our NB assumptions violated? Describe any evidence (or indeed, lack of evidence) that this is has some effect on the effectiveness of the NB classifier.
3. Since we have gone to all of the effort of calculating Infomation Gain, we might as well use that as a criterion for building a “Decision Stump” (1-R classifier). How does the effectiveness of this classifier compare to Naive Bayes? Identify one or more cases where the effectiveness is notably different, and explain why.
4. Evaluating the model on the same data that we use to train the model is considered to be a major mistake in Machine Learning. Implement a hold–out or cross–validation evaluation strategy. How does your estimate of effectiveness change, compared to testing on the training data? Explain why. (The result might surprise you!)
5. Implement one of the advanced smoothing regimes (add-k, Good-Turing). Does changing the smoothing regime (or indeed, not smoothing at all) affect the effectiveness of the Naive Bayes classifier? Explain why, or why not.
6. Naive Bayes is said to elegantly handle missing attribute values. For the datasets with missing values, is there any evidence that the performance is different on the instances with missing values, compared to the instances where all of the values are present? Does it matter which, or how many values are missing? Would a imputation strategy have any effect on this?

Don't forget that groups of 1 student should respond to question (1), and one other question of your choosing. Groups of 2 students should respond to question (1) and question (2), and two other questions of your choosing. Your responses should be about 150-250 words each.

### Responses For Question1
   When calculating the Information Gain of a set of attributes, I found that many values are 0 or very close to 0, while some are relatively large. I began to wonder what the Information Gain really means, so I looked up some information on the Internet and designed some experiments to explore what these 0 values mean and how they relate to the classes.   
   The experiments are as follows. I chose three datasets: mushroom.csv, anneal.csv and nursery.csv. The mushroom and nursery datasets contain a very large number of instances, so they reflect the real situation better, and anneal has many attributes whose info-gain is 0.
For each dataset I removed the column with the minimum info-gain and, separately, the column with the maximum info-gain, and then trained again to see the performance on both the train set and the validation set. Here are the results:    
#####  experiments result
| dataset       |  count   | Original(train)   | Rm min(train) | Rm Max(train) |  Original(val)  | Rm min(val) | Rm Max (val)|
| ---------------- | :--------: |:--------: | ----: |   ----: | :--------: | ----: |   ----: |
| mushroom     | 8124 |0.9548 |    0.9548 |  0.8989 |  0.9495 |   0.9495 |  0.8942 |
| anneal | 898| 0.8955 |   0.8928 |  0.8426 | 0.8167 |   0.8167 |  0.8111 |
| nursery     | 12960  | 0.9043  |    0.902 |   0.5422 | 0.8989  |   0.892 |   0.5278 |

-------------------------------------------------------------------------------------
- When we remove the attributes whose info-gain is 0 (or almost 0), the performance changes only a little or stays the same as in the original version, especially on the train data. So an info-gain of 0 means that the attribute has no relation with the class.
- On the nursery dataset, the info-gain list is [0.070746,  0.197377,  0.005211,  0.01221,   0.019903,  0.004543,  0.021412,  0.95699 ]; you can see that the 8th (index=7) attribute's info-gain is much bigger than the others'. When this column is removed, the correct rate drops from 0.9043 to 0.5422 on the train set and from 0.8989 to 0.5278 on the validation set.  
- Therefore, an attribute's share of the information gain in the information gain list reflects the relationship between the attribute and the class.
------------------------------------------------------------------------------
Below are the three experiments, including code.


   

### mushroom dataset
The info-gain of the 16th (index=15) attribute is 0; the info-gain of the 5th (index=4) attribute is the max.   
remove the two cols to see the change

In [466]:
# Mushroom: compare accuracy with all attributes, without the lowest-gain
# attribute and without the highest-gain attribute.
file_name = "mushroom"
root = "2019S1-proj1-data/2019S1-proj1-data"
train_X, train_y, val_X, val_y = preprocess(root + "/{}.csv".format(file_name), seed=78)
attrs = list(range(train_X.shape[1]))
H_cls, H_attrs = info_gain(attrs, train_X, train_y)
print("Info_gain of {} :\n{}".format(attrs,  H_attrs))
print()

# Column indices are hard-coded: in the info-gain vector printed for seed=78,
# index 15 has gain 0 and index 4 has the largest gain.
train_X_min, val_X_min = np.delete(train_X, 15, axis=1), np.delete(val_X, 15, axis=1)
train_X_max, val_X_max = np.delete(train_X, 4, axis=1), np.delete(val_X, 4, axis=1)

experiments = [
    ("--------------------      Original           -----------------------------",
     train_X, val_X),
    ("----------      remove the min info-Gain attributes  (0 or almost 0)  ----------------------",
     train_X_min, val_X_min),
    ("----------     remove the max info-Gain attributes    -------------------------",
     train_X_max, val_X_max),
]
for banner, exp_train_X, exp_val_X in experiments:
    print(banner)
    # retrain on this attribute subset, then score on both parts
    cls_preds, cond_dict = train(exp_train_X, train_y)
    train_eval_ratio = evaluate(exp_train_X, train_y, cls_preds, cond_dict)
    print("The evaluation ratio on train set is : ", train_eval_ratio)
    eval_ratio = evaluate(exp_val_X, val_y, cls_preds, cond_dict)
    print("The evaluation ratio on val set is : ", eval_ratio)


The shape of the inputdata:  (8124, 23)
The calsses of the data:  {'e', 'p'}
train : val =  6499 : 1625
Info_gain of [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] :
[0.049568 0.029968 0.039343 0.195216 0.907754 0.014447 0.099232 0.231177
 0.417177 0.007587 0.134764 0.286848 0.271637 0.253215 0.241543 0.
 0.024343 0.038278 0.324224 0.486742 0.202735 0.153975]

--------------------      Original           -----------------------------
The evaluation ratio on train set is :  0.9548
The evaluation ratio on val set is :  0.9495
----------      remove the min info-Gain attributes  (0 or almost 0)  ----------------------
The evaluation ratio on train set is :  0.9548
The evaluation ratio on val set is :  0.9495
----------     remove the max info-Gain attributes    -------------------------
The evaluation ratio on train set is :  0.8989
The evaluation ratio on val set is :  0.8942


### anneal dataset
The info-gain of the 31st (index=30) attribute is 0; the info-gain of the 12th (index=11) attribute is the max.   
remove the two cols to see the change

In [467]:
# Anneal: compare accuracy with all attributes, without a zero-gain attribute
# and without the highest-gain attribute.
file_name = "anneal"
root = "2019S1-proj1-data/2019S1-proj1-data"
train_X, train_y, val_X, val_y = preprocess(root + "/{}.csv".format(file_name), seed=78)
attrs = list(range(train_X.shape[1]))
H_cls, H_attrs = info_gain(attrs, train_X, train_y)
print("Info_gain of {} :\n{}".format(attrs,  H_attrs))
print()

# Column indices are hard-coded: in the info-gain vector printed for seed=78,
# index 30 is one of several attributes with gain 0 and index 11 has the largest gain.
train_X_min, val_X_min = np.delete(train_X, 30, axis=1), np.delete(val_X, 30, axis=1)
train_X_max, val_X_max = np.delete(train_X, 11, axis=1), np.delete(val_X, 11, axis=1)

experiments = [
    ("--------------------------    Original        ---------------------------------------------------",
     train_X, val_X),
    ("----------      remove the min info-Gain attributes (0 or almost 0)   ----------------------",
     train_X_min, val_X_min),
    ("----------     remove the max info-Gain attributes    ----------------------------------",
     train_X_max, val_X_max),
]
for banner, exp_train_X, exp_val_X in experiments:
    print(banner)
    # retrain on this attribute subset, then score on both parts
    cls_preds, cond_dict = train(exp_train_X, train_y)
    train_eval_ratio = evaluate(exp_train_X, train_y, cls_preds, cond_dict)
    print("The evaluation ratio on train set is : ", train_eval_ratio)
    eval_ratio = evaluate(exp_val_X, val_y, cls_preds, cond_dict)
    print("The evaluation ratio on val set is : ", eval_ratio)

The shape of the inputdata:  (898, 36)
The calsses of the data:  {'2', '1', 'U', '3', '5'}
train : val =  718 : 180
Info_gain of [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34] :
[0.410287 0.       0.300462 0.052846 0.28127  0.144065 0.237197 0.299193
 0.111863 0.132557 0.027349 0.456482 0.039277 0.000543 0.040342 0.022473
 0.040596 0.03946  0.       0.118099 0.03184  0.028071 0.       0.019241
 0.136755 0.       0.02301  0.019264 0.       0.       0.       0.041256
 0.02696  0.022722 0.003273]

--------------------------    Original        ---------------------------------------------------
The evaluation ratio on train set is :  0.8955
The evaluation ratio on val set is :  0.8167
----------      remove the min info-Gain attributes (0 or almost 0)   ----------------------
The evaluation ratio on train set is :  0.8928
The evaluation ratio on val set is :  0.8167
----------     remove the max info-Gain a

### nursery dataset
The info-gain of the 3rd (index=2) attribute is almost 0 (0.005211); the info-gain of the 8th (index=7, the last) attribute is the max.   
remove the two cols to see the change

In [468]:
# Nursery: compare accuracy with all attributes, without a low-gain attribute
# and without the highest-gain attribute.
file_name = "nursery"
root = "2019S1-proj1-data/2019S1-proj1-data"
train_X, train_y, val_X, val_y = preprocess(root + "/{}.csv".format(file_name), seed=78)
attrs = list(range(train_X.shape[1]))
H_cls, H_attrs = info_gain(attrs, train_X, train_y)
print("Info_gain of {} :\n{}".format(attrs,  H_attrs))
print()

# Column indices are hard-coded from the info-gain vector printed for seed=78:
# the last attribute (index -1, i.e. 7) has the largest gain.
# NOTE(review): index 2 (0.005211) is close to 0 but not the strict minimum of
# that vector; index 5 (0.004543) is lower. Confirm which column was intended.
train_X_min, val_X_min = np.delete(train_X, 2, axis=1), np.delete(val_X, 2, axis=1)
train_X_max, val_X_max = np.delete(train_X, -1, axis=1), np.delete(val_X, -1, axis=1)

experiments = [
    ("------------------------      Original         -------------------------------------------",
     train_X, val_X),
    ("----------      remove the min info-Gain attributes  (0 or almost 0)  -----------------------",
     train_X_min, val_X_min),
    ("----------     remove the max info-Gain attributes    --------------------------------------------",
     train_X_max, val_X_max),
]
for banner, exp_train_X, exp_val_X in experiments:
    print(banner)
    # retrain on this attribute subset, then score on both parts
    cls_preds, cond_dict = train(exp_train_X, train_y)
    train_eval_ratio = evaluate(exp_train_X, train_y, cls_preds, cond_dict)
    print("The evaluation ratio on train set is : ", train_eval_ratio)
    eval_ratio = evaluate(exp_val_X, val_y, cls_preds, cond_dict)
    print("The evaluation ratio on val set is : ", eval_ratio)

The shape of the inputdata:  (12960, 9)
The calsses of the data:  {'very_recom', 'spec_prior', 'not_recom', 'recommend', 'priority'}
train : val =  10368 : 2592
Info_gain of [0, 1, 2, 3, 4, 5, 6, 7] :
[0.070746 0.197377 0.005211 0.01221  0.019903 0.004543 0.021412 0.95699 ]

------------------------      Original         -------------------------------------------
The evaluation ratio on train set is :  0.9043
The evaluation ratio on val set is :  0.8989
----------      remove the min info-Gain attributes  (0 or almost 0)  -----------------------
The evaluation ratio on train set is :  0.902
The evaluation ratio on val set is :  0.892
----------     remove the max info-Gain attributes    --------------------------------------------
The evaluation ratio on train set is :  0.5422
The evaluation ratio on val set is :  0.5278


### Responses For Question4
I used the hold-out method when implementing the Naive Bayes algorithm, in the function preprocess(data_path, split_ratio=0.2, seed=None). By default I divide the data set 4:1, where the 4 parts are the train set and the 1 part is the validation set. The results are below.

| dataset       |  anneal  | breast-cancer  | car    | cmc |  hepatitis  | hypothyroid | mushroom|nursery|primary-tumor|
| ---------------- | :--------: |:--------:   | ----:  |   ----: | :--------: | ----: |   ----: |----: |----: |
| count         | 898    | 286     |  1728 |  1473 |  155 |   3163 |  8124 |  12960|  339 |
| train         | 0.8983    | 0.7719     |  0.8777 |  0.5102 |  0.8226 |   0.953 |  0.9566 |  0.9045 |  0.5978 |
| validation     | 0.8889    | 0.7069    |  0.8699 |  0.4746 | 0.871 |   0.9479 |  0.9625 | 0.9055 | 0.5147 |

#### Conclusion
- We can see that the more data we have, the smaller the gap between the training set and the validation set. Maybe the larger the data set is, the better it reflects the underlying probability distribution.
- I used a random seed equal to 24 to get the above table. When I use a different random seed, the results on some datasets change a lot, especially on the datasets with little data. The larger a dataset is, the more stable the correct rate will be.
- Normally, the correct rate on the training set is higher than on the validation set. But sometimes the correct rate on the training set is a little lower than on the validation set. That is interesting.
The experiments are above, in the block "test on the 9 dataset".
-------------------------------------------------------------
 I found information about K-fold Cross Validation online and implemented it below. The K-fold CV method makes full use of the data! I evaluated it on car.csv. Its results are more credible and accurate.     
- CORRECT RATE ON TRAIN 0.87978     
- CORRECT RATE ON VAL 0.8570800000000001

In [479]:
# This function should open a data file in csv, and transform it into a usable format 
import csv
import numpy as np
def new_preprocess(data_path, split_ratio=0.2, seed=None, start=0):
    '''
    Load a csv data set, shuffle it and cut out one train / validation fold.

    The shuffled rows are treated as circular: the train part begins at the
    fraction `start` of the data and is followed directly by the validation
    part, wrapping around to the beginning of the data when needed. Together
    the two parts cover every row once.

    @data_path: the path of the csv format file
    @split_ratio: the ratio of the val data (fraction of rows put in the validation part)
    @seed: if not None, np.random.seed(seed) is called before shuffling. Calls
           that should produce folds of the SAME permutation (k-fold cross
           validation) must pass the same seed; with seed=None every call
           shuffles differently
    @start: where the train part begins, as a fraction of the data in [0, 1)
    @return: train_X, train_y, val_X, val_y
    '''
    # read data from csv; blank lines (read as empty rows) are skipped so the
    # row list stays rectangular
    with open(data_path, newline='') as csvfile:
        matrix = np.array([row for row in csv.reader(csvfile, delimiter=',') if row])

    length = matrix.shape[0]
    # shuffle the rows in place; only touch the global seed when one was given
    if seed is not None:
        np.random.seed(seed)
    np.random.shuffle(matrix)

    # append a second copy of the rows so that a window starting near the end
    # can simply run past the last row instead of wrapping indices by hand
    matrix = np.vstack((matrix, matrix))

    # split the data into train and validation
    train_fraction = 1 - split_ratio
    train_begin = int(length * start)
    train_end = int(length * (start + train_fraction))
    val_end = int(length * (start + 1))
    train_rows, val_rows = matrix[train_begin:train_end, :], matrix[train_end:val_end, :]
    print("train : val = ",str(len(train_rows)) + " : " + str(len(val_rows)))

    # split the attributes and the classes
    train_X, train_y = train_rows[:, :-1], train_rows[:, -1]
    val_X, val_y = val_rows[:, :-1], val_rows[:, -1]
    return train_X, train_y, val_X, val_y

In [480]:
# 5-fold cross validation on one data set: five train/validation folds are cut
# from the shuffled data at the start offsets 0, 0.2, ..., 0.8 and the
# accuracies are averaged.
file_name = "car"
root = "2019S1-proj1-data/2019S1-proj1-data"   
print("=======================     {}        ====================================".format(file_name))
train_eval_ratio_list, eval_ratio_list = [], [] 
# Every fold must be cut from the SAME permutation of the rows. Without a seed
# each call reshuffles the data, so the five validation parts would overlap at
# random instead of covering every instance exactly once.
cv_seed = 24
for start in [0,0.2,0.4,0.6,0.8]:
    train_X, train_y, val_X, val_y = new_preprocess(root + "/{}.csv".format(file_name), seed=cv_seed, start=start)
    cls_preds, cond_dict = train(train_X, train_y)
    train_eval_ratio = evaluate(train_X, train_y, cls_preds, cond_dict)
    print("The evaluation ratio on train set is : ", train_eval_ratio)
    train_eval_ratio_list.append(train_eval_ratio)
    eval_ratio = evaluate(val_X, val_y, cls_preds, cond_dict)
    print("The evaluation ratio on val set is : ", eval_ratio)
    eval_ratio_list.append(eval_ratio)
print("======================        SUMMARY    =========================")
print("CORRECT RATE ON TRAIN", sum(train_eval_ratio_list) / len(train_eval_ratio_list))
print("CORRECT RATE ON VAL", sum(eval_ratio_list) / len(eval_ratio_list))   


train : val =  1382 : 346
The evaluation ratio on train set is :  0.8661
The evaluation ratio on val set is :  0.8439
train : val =  1383 : 345
The evaluation ratio on train set is :  0.8648
The evaluation ratio on val set is :  0.8464
train : val =  1382 : 346
The evaluation ratio on train set is :  0.8763
The evaluation ratio on val set is :  0.8497
train : val =  1383 : 345
The evaluation ratio on train set is :  0.867
The evaluation ratio on val set is :  0.8754
train : val =  1382 : 346
The evaluation ratio on train set is :  0.8726
The evaluation ratio on val set is :  0.8699
CORRECT RATE ON TRAIN 0.86936
CORRECT RATE ON VAL 0.85706
