In [1]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification
from data_configs.configs import *
import numpy as np
import statistics


config = albalone_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
knn_model = KNN(config)
null_model = NullModelClassification(config=config)

## Data Preprocessing ##

In [2]:
raw_data = data_processor.load_data()

In [3]:
raw_data

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


#### Impute Missing Values ####

In [4]:
data_1 = data_processor.impute_missing_values(raw_data)

In [5]:
null_values = raw_data.isnull().sum()
print(null_values)

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64


In [6]:
null_values = data_1.isnull().sum()
print(null_values)

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64


#### Encode Nominal and Ordinal Features ####

In [7]:
data_2 = data_processor.encode_nominal_features(data_1)
data_2

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,0,0,1
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,0,0,1
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,1,0,0
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,0,0,1
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,1,0,0
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,0,0,1
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,0,0,1
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,1,0,0


In [8]:
data_3 = data_processor.encode_ordinal_features(data_2)
data_3

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,0,0,1
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,0,0,1
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,1,0,0
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,0,0,1
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,1,0,0
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,0,0,1
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,0,0,1
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,1,0,0


In [9]:
# features = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
features=['Length', 'Diameter', 'Height', 'Whole weight','Shucked weight', 'Viscera weight', 'Shell weight']
data_standardized = data_processor.standardize_data(data_3,data_3, features=features)
data_standardized.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,-5.919772e-16,-2.993908e-16,4.006053e-16,7.824986e-17,-5.103252e-18,2.687713e-16,2.857821e-16,9.933684,0.312904,0.321283,0.365813
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.224169,0.463731,0.467025,0.481715
min,-3.738706,-3.555842,-3.335554,-1.68589,-1.614537,-1.642976,-1.70493,1.0,0.0,0.0,0.0
25%,-0.6161238,-0.583246,-0.5861373,-0.7896632,-0.781065,-0.7945464,-0.7818159,8.0,0.0,0.0,0.0
50%,0.1749304,0.1724987,0.01156191,-0.05963053,-0.1052765,-0.08752154,-0.03470378,9.0,0.0,0.0,0.0
75%,0.7578124,0.7267114,0.6092612,0.6612257,0.642596,0.6605564,0.6477544,11.0,1.0,1.0,1.0
max,2.42319,2.439733,23.68045,4.071783,5.084779,5.285867,5.503983,29.0,1.0,1.0,1.0


## Cross Validation ##

In [10]:
data_train, data_val = cross_validator.random_partition(data_3,val_size=0.2)

In [11]:
print(len(data_train))
(len(data_val))

3341


836

In [12]:
for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):

    print(train_set)
    print(test_set)

      Length  Diameter  Height  Whole weight  Shucked weight  Viscera weight  \
1918   0.600     0.465   0.160        1.1330          0.4660          0.2885   
3326   0.550     0.415   0.135        0.8095          0.2985          0.2015   
2761   0.550     0.425   0.145        0.8900          0.4325          0.1710   
2201   0.645     0.490   0.215        1.4060          0.4265          0.2285   
3233   0.610     0.480   0.190        1.2955          0.5215          0.3225   
...      ...       ...     ...           ...             ...             ...   
1470   0.530     0.430   0.130        0.7045          0.3460          0.1415   
1060   0.265     0.195   0.055        0.0840          0.0365          0.0175   
576    0.560     0.425   0.140        0.9175          0.4005          0.1975   
2816   0.325     0.240   0.075        0.1525          0.0720          0.0645   
809    0.520     0.410   0.115        0.8070          0.2855          0.1790   

      Shell weight  Rings  Sex_F  Sex_I



## Training ##

#### Distance Calculation ####

In [13]:
data_val_point = data_val.iloc[1].values
data_val_point

array([ 0.63 ,  0.505,  0.155,  1.105,  0.492,  0.226,  0.325, 11.   ,
        0.   ,  0.   ,  1.   ])

In [14]:
data_train_points = data_train.values
data_train_points

array([[0.57 , 0.45 , 0.15 , ..., 1.   , 0.   , 0.   ],
       [0.6  , 0.465, 0.16 , ..., 1.   , 0.   , 0.   ],
       [0.55 , 0.415, 0.135, ..., 0.   , 1.   , 0.   ],
       ...,
       [0.58 , 0.49 , 0.195, ..., 0.   , 1.   , 0.   ],
       [0.635, 0.49 , 0.17 , ..., 1.   , 0.   , 0.   ],
       [0.565, 0.435, 0.185, ..., 0.   , 0.   , 1.   ]])

In [15]:
print(knn_model.calc_distance(data_train_points, data_val_point))

[2.45819776 1.417258   1.77264992 ... 7.14552647 2.45597124 9.00194897]


#### Nearest Neighbors ####

In [16]:
knn_model.k_nearest_neighbors(data_val_point,data_train,k=5)

[(10.000052762360806, 11),
 (10.000054424851896, 14),
 (10.000070062254563, 10),
 (10.000079612183095, 10),
 (10.000093724560786, 11)]

#### KNN Classification ####

In [17]:
knn_model.knn_classifier(test_set,train_set,k=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class,Predicted Class
1317,3,0,0,2,1,0,unacc,unacc
867,2,0,0,0,1,0,unacc,unacc
250,0,2,1,0,2,1,unacc,unacc
796,1,3,1,1,1,1,unacc,acc
281,0,2,2,1,0,2,acc,acc
...,...,...,...,...,...,...,...,...
374,0,3,1,2,1,2,acc,acc
420,0,3,3,1,2,0,unacc,unacc
706,1,2,2,0,1,1,unacc,unacc
814,1,3,2,0,1,1,unacc,unacc


In [17]:
gamma = 1/(statistics.stdev(data_train[config['target_column']]))
knn_model.knn_regression(test_set,train_set,k=5,gamma=gamma)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M,Predicted Value
4114,0.570,0.450,0.150,0.9645,0.5310,0.1890,0.209,9,1,0,0,8.600107
2201,0.645,0.490,0.215,1.4060,0.4265,0.2285,0.510,25,1,0,0,15.598192
2122,0.435,0.350,0.120,0.4585,0.1920,0.1000,0.130,11,1,0,0,10.199897
3844,0.570,0.450,0.155,0.9100,0.3260,0.1895,0.355,14,0,0,1,14.799093
2556,0.380,0.285,0.085,0.2370,0.1150,0.0405,0.070,6,0,1,0,6.800019
...,...,...,...,...,...,...,...,...,...,...,...,...
908,0.335,0.250,0.075,0.1825,0.0705,0.0440,0.055,7,0,1,0,6.400003
1060,0.265,0.195,0.055,0.0840,0.0365,0.0175,0.025,7,0,1,0,5.200004
576,0.560,0.425,0.140,0.9175,0.4005,0.1975,0.260,10,0,1,0,10.600757
809,0.520,0.410,0.115,0.8070,0.2855,0.1790,0.235,12,1,0,0,10.999904


## Edited KNN ##

In [None]:
knn_model.edited_knn_classification(train_set,test_set,k=1)

In [18]:
epsilon = statistics.stdev(data_3[config['target_column']])
knn_model.edited_knn_regression(train_set, test_set, epsilon=epsilon,gamma=gamma)

Editing training set...
MSE improved from inf to 7.025748502994012. Conitnuing...
Editing training set...
MSE improved from 7.025748502994012 to 6.9640718562874255. Conitnuing...
Editing training set...
Zero-one loss degraded from 6.9640718562874255 to 6.9640718562874255. Stopping editing. 


Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
1918,0.600,0.465,0.160,1.1330,0.4660,0.2885,0.298,11,1,0,0
3326,0.550,0.415,0.135,0.8095,0.2985,0.2015,0.280,12,0,1,0
2761,0.550,0.425,0.145,0.8900,0.4325,0.1710,0.236,10,0,1,0
3233,0.610,0.480,0.190,1.2955,0.5215,0.3225,0.365,12,1,0,0
2185,0.445,0.340,0.120,0.4475,0.1930,0.1035,0.130,9,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1470,0.530,0.430,0.130,0.7045,0.3460,0.1415,0.189,9,0,1,0
3492,0.555,0.440,0.145,0.8500,0.4165,0.1685,0.230,8,0,0,1
2816,0.325,0.240,0.075,0.1525,0.0720,0.0645,0.043,6,0,1,0
2706,0.725,0.530,0.190,1.7315,0.8300,0.3980,0.405,11,1,0,0
