In [None]:
import pyforest

def import_data():
    """Load the iris CSV and return it as an array with numeric class labels.

    The file is read without a header row. The 5th column (index 4) holds the
    species name and is overwritten in place with 1 for 'Iris-setosa', 2 for
    'Iris-versicolor' and 3 for every other value.
    """
    frame = pd.read_csv("datasets/iris/iris.data", sep=",", header=None)
    # .values drops the index and column names and leaves the underlying
    # numpy array of the DataFrame.
    table = frame.values
    # [:, 4] selects all rows of the 5th column.
    species = table[:, 4]
    is_setosa = species == 'Iris-setosa'
    is_versicolor = species == 'Iris-versicolor'
    # Codes for the rows that are not setosa: versicolor -> 2, the rest -> 3.
    other_codes = np.where(is_versicolor, 2, 3)
    table[:, 4] = np.where(is_setosa, 1, other_codes)
    return table

def mean_std_col(data):
    """Return ``(mean, sd)`` of ``data``, each computed column-wise."""
    # axis=0 collapses the rows, leaving one statistic per column.
    return data.mean(axis=0), data.std(axis=0)

def class_dist(data, num_classes):
    """Count how many entries of ``data`` belong to each class label.

    Class labels are the integers 1..``num_classes``; entry ``i`` of the
    result is the number of elements of ``data`` equal to ``i + 1``.

    The counts are returned and are also stored in the module-level name
    ``placeholder``. The function used to index ``placeholder`` without that
    name being defined at module scope, which raised ``NameError``; it is now
    bound here so the function works on its own.

    Parameters:
        data: numpy array of class labels.
        num_classes: number of distinct labels to count.

    Returns:
        A list of ``num_classes`` counts.
    """
    global placeholder
    counts = [np.count_nonzero(data == label)
              for label in range(1, num_classes + 1)]
    placeholder = counts
    return counts
        
def hist_plot(name, data):
    """Show a histogram of ``data`` in a new figure titled ``name``.

    Parameters:
        name: label for the plot; used as the axes title.
        data: values to bin, passed straight to ``plt.hist``.
    """
    plt.figure()
    plt.hist(data)
    # ``name`` was accepted but never used, so every plot was unlabeled.
    plt.title(name)
    plt.show()
    
def data_split_random(data_X, data_y, portion):
    """Shuffle the rows and split features and labels into train and test parts.

    The first ``int(portion * len(data_X))`` shuffled rows form the train
    part and the remaining rows form the test part; the same row order is
    applied to ``data_X`` and ``data_y``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    n_rows = len(data_X)
    shuffled = np.random.permutation(n_rows)   # 1: row indices in random order
    cut = int(portion * n_rows)                # 2: number of rows in the train part
    train_rows, test_rows = shuffled[:cut], shuffled[cut:]
    return (data_X[train_rows], data_X[test_rows],
            data_y[train_rows], data_y[test_rows])

def data_split_nonrandom(data_X, data_y, portion):
    """Split features and labels into train and test parts, keeping row order.

    The first ``int(portion * len(data_X))`` rows form the train part and
    the remaining rows form the test part.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    cut = int(portion * len(data_X))
    head, tail = slice(None, cut), slice(cut, None)
    return data_X[head], data_X[tail], data_y[head], data_y[tail]

def feature_subset(feature, X_train):
    """Return all rows of ``X_train`` restricted to the column index ``feature``."""
    return X_train[:, feature]

def binary_class(data):
    """Build and save a random 60/40 train/test split of the class 1 and 2 rows.

    Rows whose 5th column is below 3 are kept, split with
    ``data_split_random`` and written to
    'datasets/iris/binary_iristrain.txt' and
    'datasets/iris/binary_iristest.txt'.

    Returns:
        ``(sub_train, sub_test)``, each with the four feature columns
        followed by the label column.
    """
    keep = data[:, 4] < 3
    two_class = data[keep]
    # [:, :4] selects all rows and every column except the 5th.
    features = two_class[:, :4].astype(float)
    labels = two_class[:, 4]
    X_tr, X_te, y_tr, y_te = data_split_random(features, labels, 0.6)
    # The 1-D label vector is reshaped into a single column so that
    # np.hstack can append it to the right of the feature matrix.
    train_rows = np.hstack((X_tr, y_tr.reshape(-1, 1)))
    test_rows = np.hstack((X_te, y_te.reshape(-1, 1)))
    np.savetxt('datasets/iris/binary_iristrain.txt', train_rows)
    np.savetxt('datasets/iris/binary_iristest.txt', test_rows)
    return train_rows, test_rows

def minmaxnormal(data_X):
    """Rescale every column of ``data_X`` linearly into the range [0, 1].

    Each column is shifted by its minimum and divided by its range
    (max - min). A constant column has a range of zero; it is mapped to all
    zeros instead of the NaN values a 0/0 division would produce.

    Parameters:
        data_X: numeric numpy array; statistics are taken along axis 0.

    Returns:
        Array of the same shape as ``data_X`` with scaled values.
    """
    min_X = data_X.min(0)   # column minimum
    span = data_X.max(0) - min_X
    # Divide constant columns by 1 so they come out as 0 rather than NaN.
    safe_span = np.where(span == 0, 1, span)
    return (data_X - min_X) / safe_span

def _plot_feature_histograms(prefix, features):
    """Draw one histogram per column of ``features``, named ``prefix`` + column index."""
    for i in range(features.shape[1]):
        hist_plot(prefix + str(i), features[:, i])


def _print_split_class_dist(labels, num_classes, split_name):
    """Count the classes in ``labels`` and print one line per species for ``split_name``."""
    class_dist(labels, num_classes)
    print(f"There are {placeholder[0]} Iris_setosa samples in the {split_name} set")
    print(f"There are {placeholder[1]} Iris_versicolor samples in the {split_name} set")
    print(f"There are {placeholder[2]} Iris_virginica samples in the {split_name} set")


def main():
    """Run the iris exercises Q1-Q9 in order.

    Loads the data, prints column statistics and class counts, shows feature
    histograms, builds random and deterministic train/test splits, repeats
    the reports on the random split and on the binary (class 1/2) subset,
    and finally min-max normalises the features.
    """
    # class_dist() stores its counts in the module-level ``placeholder``, so
    # the name must be bound at module scope; as a local of main() it was
    # invisible to class_dist() and the call raised NameError.
    global placeholder

    ### Q1 convert the class labels to 1, 2, and 3

    iris = import_data()
    iris_X = iris[:, :4].astype(float)
    iris_y = iris[:, 4]

    ### Q2 report mean and standard deviation for each column in the features (4 features)

    mean_X, sd_X = mean_std_col(iris_X)
    print('Column means are ', mean_X)
    print('Column standard deviations are ', sd_X)

    ### Q3 report the class distribution (i.e. number of instances for each class)

    num_classes = 3
    placeholder = [None] * num_classes   # one slot per class: [None, None, None]

    class_dist(iris_y, num_classes)

    print(f"{placeholder[0]} Iris_setosa")
    print(f"{placeholder[1]} Iris_versicolor")
    print(f"{placeholder[2]} Iris_virginica")

    ### Q4 show histogram for each feature

    _plot_feature_histograms('figure', iris_X)

    ### Q5 split data into a train and test

    portion = 0.6

    # i) random
    r_X_train, r_X_test, r_y_train, r_y_test = data_split_random(iris_X, iris_y, portion)

    # ii) deterministic
    d_X_train, d_X_test, d_y_train, d_y_test = data_split_nonrandom(iris_X, iris_y, portion)

    ### Q6 repeat Q2-Q5 with random set

    mean_train, sd_train = mean_std_col(r_X_train)
    print("Column means for train: ", mean_train)
    print("Column standard deviations for train: ", sd_train)

    mean_test, sd_test = mean_std_col(r_X_test)
    print("Column means for test: ", mean_test)
    print("Column standard deviations for test: ", sd_test)

    _print_split_class_dist(r_y_train, num_classes, "Train")
    _print_split_class_dist(r_y_test, num_classes, "Test")

    _plot_feature_histograms('train figure', r_X_train)
    _plot_feature_histograms('test figure', r_X_test)

    ### Q7 create a dataset with any one of the 4 features

    # np.newaxis turns the selected 1-D column back into an (n, 1) matrix.
    subset_1 = feature_subset(1, r_X_train)[:, np.newaxis]

    ### Q8 create a subset of the dataset where you consider only instances
    ### that feature class 1 or 2, treat this problem as a binary
    ### classification problem later

    sub_train, sub_test = binary_class(iris)
    sub_x_train = sub_train[:, :4].astype(float)
    sub_y_train = sub_train[:, 4]
    sub_x_test = sub_test[:, :4].astype(float)
    sub_y_test = sub_test[:, 4]

    mean_train, sd_train = mean_std_col(sub_x_train)
    print('Column Means for train are ', mean_train)
    print('Column Standard Deviations for train are ', sd_train)

    mean_test, sd_test = mean_std_col(sub_x_test)
    print('Column Means for test are ', mean_test)
    print('Column Standard Deviations for test are ', sd_test)

    _print_split_class_dist(sub_y_train, num_classes, "Train")
    _print_split_class_dist(sub_y_test, num_classes, "Test")

    _plot_feature_histograms('train sub figure', sub_x_train)
    _plot_feature_histograms('test sub figure', sub_x_test)

    ### Q9 normalise the input features between [0 and 1]

    normal_x = minmaxnormal(iris_X)
    
# Run the whole exercise only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
    

In [8]:
# 1 (see data_split_random): np.random.permutation(n) returns the integers 0..n-1 in random order
order = np.random.permutation(20)
order

<IPython.core.display.Javascript object>

array([ 3, 17,  7, 10,  5,  0,  2,  1,  6, 18, 19, 15, 13, 12, 16,  8,  9,
       14, 11,  4])

In [3]:
# 2 (see data_split_random): int(portion * n) truncates to the number of rows in the train part
int(0.4 * 20)

8

In [9]:
order[:8]

array([ 3, 17,  7, 10,  5,  0,  2,  1])

In [15]:
[None] * 3

[None, None, None]

In [11]:
data = pd.read_csv("datasets/iris/iris.data", sep=",", header=None)
data.head()

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [19]:
data[0]

array([5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], dtype=object)