In [1]:
import numpy as np
import pickle
import random
import time

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Hyperparameter

In [2]:
# Baseline number of trees for the random forest experiments (50).
n_estimator = 50

# Dataloader

In [3]:
def unpickle(file):
    """Load a pickled file and return the object stored in it.

    Parameters
    ----------
    file : str or path-like
        Path of the pickle file to read (opened in binary mode).

    Returns
    -------
    object
        The unpickled object. Because ``encoding='bytes'`` is used, 8-bit
        strings pickled by Python 2 are returned as ``bytes``, which is why
        callers index the result with keys such as ``b'data'``.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while loading; only call this
    on files from a trusted source.
    """
    with open(file, 'rb') as fo:
        # Use a neutral local name so the built-in ``dict`` is not shadowed.
        batch = pickle.load(fo, encoding='bytes')
    return batch

## Cifar 10

In [4]:
# Directory containing the CIFAR-10 python batch files (absolute, machine-specific path).
path = '/home/solang/CW/CW4/data/cifar-10-batches-py/'

In [5]:
# File names of the five CIFAR-10 training batches located under `path`.
datalist = ['data_batch_1', 'data_batch_2', 'data_batch_3', 'data_batch_4', 'data_batch_5']

# Unpickle every batch file exactly once; pixels and labels are then taken
# from the same loaded object instead of reading each file twice.
batches = [unpickle(path + name) for name in datalist]

data = np.array([np.array(batch[b'data'], dtype=np.float32)
                 for batch in batches
])
# `np.int` was removed in NumPy 1.24; an explicit integer dtype keeps this cell runnable.
label = np.array([np.array(batch[b'labels'], dtype=np.int64)
                 for batch in batches
])
del batches  # the raw batch objects are no longer needed

# Merge the per-batch axis: 50000 samples of 3072 values, one label per sample.
data = data.reshape(50000, 3072)
label = label.reshape(50000,)

In [6]:
# Reproducible random subset: seed the global NumPy generator, then draw
# 20000 distinct indices from range(50000).
np.random.seed(100)
idx = np.random.choice(a=50000, size=20000, replace=False)

In [7]:
# Select the sampled rows; the same index array is applied to features and
# labels so the two stay aligned.
x_train_10, y_train_10 = data[idx], label[idx]

In [8]:
# Unpickle the CIFAR-10 test batch once and reuse it for pixels and labels.
test_batch = unpickle(path + 'test_batch')

x_test_10 = np.array([test_batch[b'data']], dtype=np.float32)
# `np.int` was removed in NumPy 1.24; an explicit integer dtype keeps this cell runnable.
y_test_10 = np.array([test_batch[b'labels']], dtype=np.int64)
del test_batch  # the raw batch object is no longer needed

# Drop the leading singleton axis: 10000 samples of 3072 values, one label each.
x_test_10 = x_test_10.reshape(10000, 3072)
y_test_10 = y_test_10.reshape(10000,)

## Cifar 100

In [9]:
# Directory containing the CIFAR-100 python files (absolute, machine-specific path).
# This rebinds `path`, which previously pointed at the CIFAR-10 directory.
path = '/home/solang/CW/CW4/data/cifar-100-python/'

In [10]:
# Unpickle the CIFAR-100 training file once and reuse it for pixels and labels.
train_100 = unpickle(path + 'train')

x_train_100 = np.array([train_100[b'data']], dtype=np.float32)
# `np.int` was removed in NumPy 1.24; an explicit integer dtype keeps this cell runnable.
y_train_100 = np.array([train_100[b'fine_labels']], dtype=np.int64)
del train_100  # the raw object is no longer needed

# Drop the leading singleton axis: 50000 samples of 3072 values, one fine label each.
x_train_100 = x_train_100.reshape(50000, 3072)
y_train_100 = y_train_100.reshape(50000,)

In [11]:
# Re-seed the global NumPy generator and draw 20000 distinct indices from
# range(50000) for the CIFAR-100 subset.
np.random.seed(100)
idx = np.random.choice(a=50000, size=20000, replace=False)

In [12]:
# Reduce both CIFAR-100 training arrays to the sampled rows.
# NOTE: the names are rebound to their own subsets, so running this cell a
# second time would index the already-reduced arrays.
x_train_100, y_train_100 = x_train_100[idx], y_train_100[idx]

In [13]:
# Unpickle the CIFAR-100 test file once and reuse it for pixels and labels.
test_100 = unpickle(path + 'test')

x_test_100 = np.array([test_100[b'data']], dtype=np.float32)
# `np.int` was removed in NumPy 1.24; an explicit integer dtype keeps this cell runnable.
y_test_100 = np.array([test_100[b'fine_labels']], dtype=np.int64)
del test_100  # the raw object is no longer needed

# Drop the leading singleton axis: 10000 samples of 3072 values, one fine label each.
x_test_100 = x_test_100.reshape(10000, 3072)
y_test_100 = y_test_100.reshape(10000,)

# Train

## cifar 10

In [14]:
# Baseline CIFAR-10 forest. The tree count is read from the `n_estimator`
# hyperparameter defined at the top of the notebook rather than a repeated
# literal, so the hyperparameter cell actually controls the baseline.
rf = RandomForestClassifier(n_estimators=n_estimator, random_state=0)

In [15]:
# Time the fit with the monotonic, high-resolution perf_counter clock;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
rf.fit(x_train_10, y_train_10)
fin = time.perf_counter()

In [16]:
# Elapsed seconds between the two timestamps taken around rf.fit.
runtime = fin - start
print("runtime : ", runtime, "(sec)")

runtime :  32.482264041900635 (sec)


In [17]:
# Score `rf` on the CIFAR-10 test arrays and print it multiplied by 100 as a percentage.
print("RF model score is ", rf.score(x_test_10, y_test_10)*100, "%")

RF model score is  42.19 %


## cifar100

In [18]:
# Baseline CIFAR-100 forest. The tree count is read from the `n_estimator`
# hyperparameter defined at the top of the notebook rather than a repeated
# literal, so the hyperparameter cell actually controls the baseline.
rf100 = RandomForestClassifier(n_estimators=n_estimator, random_state=0)

In [19]:
# Time the fit with the monotonic, high-resolution perf_counter clock;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start100 = time.perf_counter()
rf100.fit(x_train_100, y_train_100)
fin100 = time.perf_counter()

In [20]:
# Elapsed seconds between the two timestamps taken around rf100.fit.
runtime100 = fin100 - start100
print("runtime : ", runtime100, "(sec)")

runtime :  105.45937752723694 (sec)


In [21]:
# Score `rf100` on the CIFAR-100 test arrays and print it multiplied by 100 as a percentage.
print("RF model score is ", rf100.score(x_test_100, y_test_100)*100, "%")

RF model score is  16.7 %


# Task 2 - n_estimator 변경

## cifar 10

In [147]:
# Three further CIFAR-10 forests that differ only in tree count (25, 75, 100);
# every one shares random_state=0.
n25, n75, n100 = (
    RandomForestClassifier(n_estimators=trees, random_state=0)
    for trees in (25, 75, 100)
)

In [148]:
# Time the 25-tree fit with the monotonic perf_counter clock instead of the
# adjustable wall clock returned by time.time().
start25 = time.perf_counter()
n25.fit(x_train_10, y_train_10)
fin25 = time.perf_counter()

In [149]:
# Time the 75-tree fit with the monotonic perf_counter clock instead of the
# adjustable wall clock returned by time.time().
start75 = time.perf_counter()
n75.fit(x_train_10, y_train_10)
fin75 = time.perf_counter()

In [150]:
# Time the 100-tree CIFAR-10 fit with the monotonic perf_counter clock instead
# of the adjustable wall clock returned by time.time().
# NOTE: `start100` / `fin100` are also the names used when timing the CIFAR-100
# baseline (`rf100`); they are rebound here, so re-running that baseline's
# runtime cell after this one would report this fit's duration.
start100 = time.perf_counter()
n100.fit(x_train_10, y_train_10)
fin100 = time.perf_counter()

In [151]:
# Training durations in seconds for each tree count on CIFAR-10.
# The 50-tree figure is not re-measured; it reuses `runtime` from the baseline fit.
rt25, rt75, rt100 = fin25 - start25, fin75 - start75, fin100 - start100
rt50 = runtime

for message, seconds in (
    ("n25, runtime : ", rt25),
    ("n50, runtime : ", rt50),
    ("n75, runtime : ", rt75),
    ("n100, runtime : ", rt100),
):
    print(message, seconds, "(sec)")

n25, runtime :  15.953176736831665 (sec)
n50, runtime :  39.81969237327576 (sec)
n75, runtime :  47.35764193534851 (sec)
n100, runtime :  62.21765398979187 (sec)


In [152]:
# Report the CIFAR-10 test score (as a percentage) of each forest;
# the 50-tree entry is the baseline model `rf`.
for message, forest in (
    ("n25, RF model score is ", n25),
    ("n50, RF model score is ", rf),
    ("n75, RF model score is ", n75),
    ("n100, RF model score is ", n100),
):
    print(message, forest.score(x_test_10, y_test_10)*100, "%")

n25, RF model score is  39.07 %
n50, RF model score is  42.19 %
n75, RF model score is  43.3 %
n100, RF model score is  44.42 %


## cifar 100

In [153]:
# Three further CIFAR-100 forests that differ only in tree count (25, 75, 100);
# every one shares random_state=0.
n25_2, n75_2, n100_2 = (
    RandomForestClassifier(n_estimators=trees, random_state=0)
    for trees in (25, 75, 100)
)

In [154]:
# Time the 25-tree CIFAR-100 fit with the monotonic perf_counter clock instead
# of the adjustable wall clock returned by time.time().
start25_2 = time.perf_counter()
n25_2.fit(x_train_100, y_train_100)
fin25_2 = time.perf_counter()

In [155]:
# Time the 75-tree CIFAR-100 fit with the monotonic perf_counter clock instead
# of the adjustable wall clock returned by time.time().
start75_2 = time.perf_counter()
n75_2.fit(x_train_100, y_train_100)
fin75_2 = time.perf_counter()

In [156]:
# Time the 100-tree CIFAR-100 fit with the monotonic perf_counter clock instead
# of the adjustable wall clock returned by time.time().
start100_2 = time.perf_counter()
n100_2.fit(x_train_100, y_train_100)
fin100_2 = time.perf_counter()

In [157]:
# Training durations in seconds for each tree count on CIFAR-100.
# The 50-tree figure is not re-measured; it reuses `runtime100` from the baseline fit.
rt25_2, rt75_2, rt100_2 = (
    fin25_2 - start25_2,
    fin75_2 - start75_2,
    fin100_2 - start100_2,
)
rt50_2 = runtime100

for message, seconds in (
    ("n25, runtime : ", rt25_2),
    ("n50, runtime : ", rt50_2),
    ("n75, runtime : ", rt75_2),
    ("n100, runtime : ", rt100_2),
):
    print(message, seconds, "(sec)")

n25, runtime :  51.69000601768494 (sec)
n50, runtime :  122.42826867103577 (sec)
n75, runtime :  171.42675161361694 (sec)
n100, runtime :  214.25235772132874 (sec)


In [158]:
# Report the CIFAR-100 test score (as a percentage) of each forest;
# the 50-tree entry is the baseline model `rf100`.
for message, forest in (
    ("n25, RF model score is ", n25_2),
    ("n50, RF model score is ", rf100),
    ("n75, RF model score is ", n75_2),
    ("n100, RF model score is ", n100_2),
):
    print(message, forest.score(x_test_100, y_test_100)*100, "%")

n25, RF model score is  14.01 %
n50, RF model score is  16.7 %
n75, RF model score is  17.86 %
n100, RF model score is  18.72 %


# Task 2 - number of training samples

In [56]:
# Number of CIFAR-10 training samples to draw for this experiment
# (50000 equals the number of rows the loading cell reshapes `data` to).
n = 50000

In [57]:
# Point `path` back at the CIFAR-10 directory (it was rebound to CIFAR-100 earlier).
path = '/home/solang/CW/CW4/data/cifar-10-batches-py/'

# File names of the five CIFAR-10 training batches located under `path`.
datalist = ['data_batch_1', 'data_batch_2', 'data_batch_3', 'data_batch_4', 'data_batch_5']

# Unpickle every batch file exactly once; pixels and labels are then taken
# from the same loaded object instead of reading each file twice.
batches = [unpickle(path + name) for name in datalist]

data = np.array([np.array(batch[b'data'], dtype=np.float32)
                 for batch in batches
])
# `np.int` was removed in NumPy 1.24; an explicit integer dtype keeps this cell runnable.
label = np.array([np.array(batch[b'labels'], dtype=np.int64)
                 for batch in batches
])
del batches  # the raw batch objects are no longer needed

# Merge the per-batch axis: 50000 samples of 3072 values, one label per sample.
data = data.reshape(50000, 3072)
label = label.reshape(50000,)

In [58]:

# Re-seed the global NumPy generator and draw `n` distinct indices from range(50000).
np.random.seed(100)
idx = np.random.choice(a=50000, size=n, replace=False)

In [59]:
# Rebuild the CIFAR-10 training arrays from the `n` sampled rows.
# NOTE: this rebinds x_train_10 / y_train_10, so every later cell that reads
# these names (including the SVM and decision-tree fits) sees this subset.
x_train_10, y_train_10 = data[idx], label[idx]

In [60]:

# Unpickle the CIFAR-10 test batch once and reuse it for pixels and labels.
test_batch = unpickle(path + 'test_batch')

x_test_10 = np.array([test_batch[b'data']], dtype=np.float32)
# `np.int` was removed in NumPy 1.24; an explicit integer dtype keeps this cell runnable.
y_test_10 = np.array([test_batch[b'labels']], dtype=np.int64)
del test_batch  # the raw batch object is no longer needed

# Drop the leading singleton axis: 10000 samples of 3072 values, one label each.
x_test_10 = x_test_10.reshape(10000, 3072)
y_test_10 = y_test_10.reshape(10000,)

In [61]:
# Baseline forest configuration trained on the `n`-sample CIFAR-10 subset.
# The tree count comes from the `n_estimator` hyperparameter defined at the
# top of the notebook rather than a repeated literal.
rf10 = RandomForestClassifier(n_estimators=n_estimator, random_state=0)

# Time the fit with the monotonic perf_counter clock; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start10 = time.perf_counter()
rf10.fit(x_train_10, y_train_10)
fin10 = time.perf_counter()

# Elapsed seconds, then the test score printed as a percentage.
runtime10 = fin10 - start10
print("runtime : ", runtime10, "(sec)")
print("RF model score is ", rf10.score(x_test_10, y_test_10)*100, "%")

runtime :  85.51582074165344 (sec)
RF model score is  44.09 %


# Task 2 - SVM, decision tree

## SVM - cifar10

In [159]:
# Support vector classifier with a linear kernel for CIFAR-10.
# NOTE(review): the recorded run of the fit cell for this model ends in
# KeyboardInterrupt, so no runtime or score was captured for it.
sv = SVC(kernel='linear')

In [160]:
# Time the SVM fit with the monotonic perf_counter clock instead of the
# adjustable wall clock returned by time.time().
start_svm = time.perf_counter()
sv.fit(x_train_10, y_train_10)
fin_svm = time.perf_counter()

KeyboardInterrupt: 

In [None]:
# Elapsed seconds between the two timestamps taken around sv.fit.
rt_svm = fin_svm - start_svm
print("SVM, runtime : ", rt_svm, "(sec)")

In [None]:
# Score `sv` on the CIFAR-10 test arrays and print it multiplied by 100 as a percentage.
print("SVM model score is ", sv.score(x_test_10, y_test_10)*100, "%")

## SVM - cifar100

In [None]:
# Support vector classifier with a linear kernel for CIFAR-100.
sv2 = SVC(kernel='linear')

In [None]:
# Time the CIFAR-100 SVM fit with the monotonic perf_counter clock instead of
# the adjustable wall clock returned by time.time().
start_svm2 = time.perf_counter()
sv2.fit(x_train_100, y_train_100)
fin_svm2 = time.perf_counter()

In [None]:
# Elapsed seconds between the two timestamps taken around sv2.fit.
rt_svm2 = fin_svm2 - start_svm2
print("SVM, runtime : ", rt_svm2, "(sec)")

In [None]:
# Score `sv2` on the CIFAR-100 test arrays and print it multiplied by 100 as a percentage.
print("SVM model score is ", sv2.score(x_test_100, y_test_100)*100, "%")

## Decision Tree - cifar10

In [None]:
# Single decision tree for CIFAR-10, with a fixed random_state for repeatability.
dt = DecisionTreeClassifier(random_state=0)

In [None]:
# Time the decision-tree fit with the monotonic perf_counter clock instead of
# the adjustable wall clock returned by time.time().
start_dt = time.perf_counter()
dt.fit(x_train_10, y_train_10)
fin_dt = time.perf_counter()

In [None]:
# Elapsed seconds between the two timestamps taken around dt.fit.
rt_dt = fin_dt - start_dt
print("Decision Tree, runtime : ", rt_dt, "(sec)")

In [None]:
# Score `dt` on the CIFAR-10 test arrays and print it multiplied by 100 as a percentage.
print("Decision Tree model score is ", dt.score(x_test_10, y_test_10)*100, "%")

## Decision Tree - cifar100

In [None]:
# Single decision tree for CIFAR-100, with a fixed random_state for repeatability.
dt2 = DecisionTreeClassifier(random_state=0)

In [None]:
# Time the CIFAR-100 decision-tree fit with the monotonic perf_counter clock
# instead of the adjustable wall clock returned by time.time().
start_dt2 = time.perf_counter()
dt2.fit(x_train_100, y_train_100)
fin_dt2 = time.perf_counter()

In [None]:
# Elapsed seconds between the two timestamps taken around dt2.fit.
rt_dt2 = fin_dt2 - start_dt2
print("Decision Tree, runtime : ", rt_dt2, "(sec)")

In [None]:
# Score `dt2` on the CIFAR-100 test arrays and print it multiplied by 100 as a percentage.
print("Decision Tree model score is ", dt2.score(x_test_100, y_test_100)*100, "%")