#Reduced Dataset

## python imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt  # To visualize
import pandas as pd  # To read data
from sklearn import linear_model, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [20]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

seed = 13
kfold = KFold(n_splits=3, shuffle=True, random_state=seed)

def bestParams(algorithm, hp_candidates):
  grid = GridSearchCV(estimator=algorithm, param_grid=hp_candidates, cv=kfold, scoring='r2')
  grid.fit(X_train, y_train)
  return grid

##Mount Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Reading Data

In [3]:
data = pd.read_csv('/content/drive/MyDrive/AV_V/all_and_classification.csv')  # load data set


data = data.drop('run', axis=1)
data['forks'] = data['forks'].replace(['steady state'], 1)
data['forks'] = data['forks'].replace(['no steady state'], 0)
data = data.loc[data['forks'] == 1]
data = data.drop('forks', axis=1)
data = data.loc[data['steady_state_starts']<= 625]
data


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,steady_state_starts,file_name
1,0.188219,0.075891,0.073269,0.073335,0.073138,0.074449,0.104530,0.079888,0.071762,0.072090,...,0.073138,0.073204,0.073859,0.073204,0.073466,0.073269,0.073400,0.073597,480,eclipse__eclipse-collections#org.eclipse.colle...
2,0.190317,0.075170,0.072417,0.072221,0.073073,0.073531,0.113967,0.096076,0.088605,0.087949,...,0.076218,0.074908,0.074908,0.074842,0.073925,0.075497,0.073925,0.074514,421,eclipse__eclipse-collections#org.eclipse.colle...
4,0.185860,0.074908,0.072155,0.072942,0.072810,0.074252,0.104530,0.095683,0.087753,0.085721,...,0.072745,0.076349,0.072745,0.072614,0.074056,0.072417,0.071959,0.073925,385,eclipse__eclipse-collections#org.eclipse.colle...
6,0.191103,0.077136,0.074187,0.073531,0.072614,0.074121,0.117703,0.094700,0.088146,0.085656,...,0.076808,0.076481,0.073794,0.075366,0.073662,0.074908,0.074187,0.073531,408,eclipse__eclipse-collections#org.eclipse.colle...
7,0.188219,0.074646,0.072876,0.073073,0.073925,0.074252,0.090472,0.094634,0.087359,0.086114,...,0.074187,0.074514,0.075104,0.075104,0.074514,0.075563,0.074711,0.074646,433,eclipse__eclipse-collections#org.eclipse.colle...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5848,0.000042,0.000024,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,...,0.000017,0.000017,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,53,apache__logging-log4j2#org.apache.logging.log4...
5849,0.000045,0.000023,0.000031,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,...,0.000017,0.000017,0.000017,0.000018,0.000017,0.000017,0.000017,0.000017,321,apache__logging-log4j2#org.apache.logging.log4...
5851,0.001629,0.002516,0.002812,0.002784,0.002766,0.002760,0.003237,0.001095,0.001196,0.001301,...,0.001093,0.001142,0.001118,0.001779,0.002299,0.001127,0.001089,0.001095,395,yellowstonegames__SquidLib#squidpony.performan...
5855,0.001660,0.002597,0.002875,0.002770,0.002834,0.002764,0.002709,0.001095,0.001221,0.001445,...,0.001089,0.001090,0.001092,0.001093,0.001092,0.001106,0.001109,0.001106,190,yellowstonegames__SquidLib#squidpony.performan...


##Dividing data (train and test) based on file name 

In [4]:
data_file_name = data['file_name']
data_file_name

# data = data.drop('file_name', axis=1)


1       eclipse__eclipse-collections#org.eclipse.colle...
2       eclipse__eclipse-collections#org.eclipse.colle...
4       eclipse__eclipse-collections#org.eclipse.colle...
6       eclipse__eclipse-collections#org.eclipse.colle...
7       eclipse__eclipse-collections#org.eclipse.colle...
                              ...                        
5848    apache__logging-log4j2#org.apache.logging.log4...
5849    apache__logging-log4j2#org.apache.logging.log4...
5851    yellowstonegames__SquidLib#squidpony.performan...
5855    yellowstonegames__SquidLib#squidpony.performan...
5856    yellowstonegames__SquidLib#squidpony.performan...
Name: file_name, Length: 4234, dtype: object

In [5]:
data_file_name_unique = data_file_name.drop_duplicates(keep="first")

In [6]:
from sklearn.model_selection import train_test_split
data_file_name_train, data_file_name_test = train_test_split(data_file_name_unique, test_size=0.3,
                                                    random_state=1)

In [7]:
data_file_name_train 

5090    r2dbc__r2dbc-h2#io.r2dbc.h2.StagedResultSizeBe...
1250    apache__logging-log4j2#org.apache.logging.log4...
4260    apache__tinkerpop#org.apache.tinkerpop.gremlin...
5790    imglib__imglib2#net.imglib2.loops.SyncedPositi...
3750    hazelcast__hazelcast#com.hazelcast.internal.ut...
                              ...                        
1330    openzipkin__zipkin#zipkin2.internal.ReadBuffer...
1481    eclipse-vertx__vert.x#io.vertx.benchmarks.Json...
730     apache__logging-log4j2#org.apache.logging.log4...
2470    netty__netty#io.netty.handler.codec.mqtt.MqttC...
371     protostuff__protostuff#io.protostuff.benchmark...
Name: file_name, Length: 392, dtype: object

In [8]:
data_file_name_test

3780    yellowstonegames__SquidLib#squidpony.performan...
5429    prestodb__presto#com.facebook.presto.operator....
3660    crate__crate#io.crate.execution.engine.aggrega...
5110    imglib__imglib2#net.imglib2.loops.SyncedPositi...
5001    ReactiveX__RxJava#io.reactivex.rxjava3.core.Fl...
                              ...                        
5310    apache__arrow#org.apache.arrow.vector.Variable...
3900    apache__logging-log4j2#org.apache.logging.log4...
3230    r2dbc__r2dbc-h2#io.r2dbc.h2.StatementBenchmark...
3271    prestodb__presto#com.facebook.presto.geospatia...
181     apache__arrow#org.apache.arrow.adapter.jdbc.Jd...
Name: file_name, Length: 168, dtype: object

In [9]:
train_data = pd.merge(data, data_file_name_train, left_on='file_name', right_on='file_name')
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,steady_state_starts,file_name
0,1.275195e-07,9.92695e-08,6.811809e-08,6.398774e-08,6.178975e-08,5.95337e-08,5.963519e-08,6.025651e-08,5.934733e-08,6.251917e-08,...,6.158755e-08,6.190392e-08,6.259095e-08,6.171464e-08,6.165376e-08,6.169033e-08,6.183229e-08,6.294976e-08,2,JCTools__JCTools#org.jctools.channels.spsc.Sps...
1,1.35583e-07,8.028709e-08,7.046209e-08,6.442911e-08,6.405469e-08,6.284861e-08,6.377896e-08,6.720911e-08,6.983024e-08,6.887329e-08,...,6.338043e-08,6.344368e-08,6.281152e-08,6.257241e-08,6.313669e-08,6.334688e-08,6.329215e-08,6.335572e-08,1,JCTools__JCTools#org.jctools.channels.spsc.Sps...
2,2.332804e-07,7.014395e-08,6.537276e-08,6.246478e-08,6.28974e-08,6.446362e-08,6.377569e-08,6.242375e-08,6.24037e-08,6.766667e-08,...,6.821299e-08,6.777635e-08,6.819488e-08,7.000788e-08,6.753254e-08,6.754947e-08,7.025719e-08,6.747891e-08,1,JCTools__JCTools#org.jctools.channels.spsc.Sps...
3,1.29803e-07,7.878544e-08,7.095477e-08,6.755758e-08,6.43872e-08,6.28883e-08,6.709468e-08,6.686679e-08,6.531514e-08,6.705712e-08,...,6.661401e-08,6.985692e-08,6.698712e-08,6.670719e-08,6.695537e-08,6.678397e-08,6.67775e-08,6.69956e-08,1,JCTools__JCTools#org.jctools.channels.spsc.Sps...
4,6.84424e-05,7.579114e-05,5.004239e-05,4.391429e-05,3.679732e-05,3.567909e-05,2.911061e-05,4.205884e-05,5.189324e-05,5.188036e-05,...,3.142454e-05,3.141652e-05,3.142139e-05,3.255179e-05,3.147647e-05,3.148359e-05,3.159481e-05,3.147299e-05,337,apache__camel#org.apache.camel.converter.Conve...


In [10]:
test_data = pd.merge(data, data_file_name_test, left_on='file_name', right_on='file_name')
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,steady_state_starts,file_name
0,0.188219,0.075891,0.073269,0.073335,0.073138,0.074449,0.10453,0.079888,0.071762,0.07209,...,0.073138,0.073204,0.073859,0.073204,0.073466,0.073269,0.0734,0.073597,480,eclipse__eclipse-collections#org.eclipse.colle...
1,0.190317,0.07517,0.072417,0.072221,0.073073,0.073531,0.113967,0.096076,0.088605,0.087949,...,0.076218,0.074908,0.074908,0.074842,0.073925,0.075497,0.073925,0.074514,421,eclipse__eclipse-collections#org.eclipse.colle...
2,0.18586,0.074908,0.072155,0.072942,0.07281,0.074252,0.10453,0.095683,0.087753,0.085721,...,0.072745,0.076349,0.072745,0.072614,0.074056,0.072417,0.071959,0.073925,385,eclipse__eclipse-collections#org.eclipse.colle...
3,0.191103,0.077136,0.074187,0.073531,0.072614,0.074121,0.117703,0.0947,0.088146,0.085656,...,0.076808,0.076481,0.073794,0.075366,0.073662,0.074908,0.074187,0.073531,408,eclipse__eclipse-collections#org.eclipse.colle...
4,0.188219,0.074646,0.072876,0.073073,0.073925,0.074252,0.090472,0.094634,0.087359,0.086114,...,0.074187,0.074514,0.075104,0.075104,0.074514,0.075563,0.074711,0.074646,433,eclipse__eclipse-collections#org.eclipse.colle...


In [11]:
train_data = train_data.drop('file_name', axis=1)
test_data = test_data.drop('file_name', axis=1)

In [12]:
X_train = train_data.drop('steady_state_starts', axis=1) 
y_train  = train_data['steady_state_starts']
X_test = test_data.drop('steady_state_starts', axis=1)
y_test = test_data['steady_state_starts']

### Standardizing the data

In [13]:
sc = StandardScaler()
sc.fit(X_train.T)
X_train = pd.DataFrame(sc.transform(X_train.T))
X_train = X_train.T

In [14]:
sc = StandardScaler()
sc.fit(X_test.T)
X_test = pd.DataFrame(sc.transform(X_test.T))
X_test = X_test.T

##Displaying data

In [15]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,2.493405,1.438427,0.275097,0.120852,0.038770,-0.045481,-0.041691,-0.018489,-0.052441,0.066009,...,0.093044,0.079742,0.031218,0.043033,0.068690,0.035965,0.033691,0.035057,0.040358,0.082089
1,19.850395,3.919269,1.088619,-0.649525,-0.757397,-1.104877,-0.836836,0.151414,0.906579,0.630874,...,-1.129076,-0.653716,-0.951656,-0.933434,-1.115561,-1.184451,-1.021878,-0.961322,-0.977089,-0.958776
2,7.365034,0.222137,0.013231,-0.114094,-0.095152,-0.026576,-0.056697,-0.115891,-0.116769,0.113669,...,0.113874,0.113235,0.137590,0.118472,0.136797,0.216178,0.107796,0.108538,0.227095,0.105448
3,25.698245,5.467191,2.361931,1.014769,-0.242448,-0.836839,0.831208,0.740838,0.125529,0.816314,...,1.934241,0.912192,0.640598,1.926576,0.788552,0.677549,0.775965,0.707994,0.705428,0.791918
4,6.930946,8.373801,3.318279,2.115085,0.717736,0.498182,-0.791478,1.750784,3.681676,3.679147,...,-0.335963,-0.336712,-0.337158,-0.338733,-0.337777,-0.115834,-0.326962,-0.325566,-0.303728,-0.327647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2983,5.648390,1.205169,2.998285,2.901957,2.907753,2.900437,2.923936,2.937857,2.897029,3.058809,...,-0.223138,-0.351034,-0.360448,-0.347575,-0.221094,-0.253940,-0.255956,-0.257513,-0.261596,-0.260585
2984,6.128965,1.137952,2.819034,2.755642,2.697177,2.690744,2.728799,2.686920,2.706652,2.759365,...,-0.339418,-0.169677,-0.202522,-0.214544,-0.196766,-0.049341,-0.312705,-0.234998,-0.301529,-0.323573
2985,1.435363,4.310081,5.271857,5.179254,5.121581,5.101479,6.648961,-0.297599,0.030163,0.371756,...,-0.328705,-0.331775,-0.304673,-0.145808,-0.222837,1.920006,3.607995,-0.192926,-0.316728,-0.298249
2986,1.874207,5.394570,6.439192,6.042358,6.282721,6.022390,5.812937,-0.251018,0.221640,1.063728,...,-0.206482,-0.147103,-0.271931,-0.269170,-0.260052,-0.257041,-0.261558,-0.209188,-0.197348,-0.209019


In [16]:
y_train

0         2
1         1
2         1
3         1
4       337
       ... 
2983     53
2984    321
2985    395
2986    190
2987    140
Name: steady_state_starts, Length: 2988, dtype: int64

In [17]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,14.768957,0.099609,-0.242733,-0.234175,-0.259850,-0.088679,3.839693,0.621680,-0.439580,-0.396787,...,-0.217058,-0.157148,-0.259850,-0.251292,-0.165706,-0.251292,-0.217058,-0.242733,-0.225616,-0.199940
1,15.188710,-0.112622,-0.478391,-0.504517,-0.391303,-0.330342,5.042978,2.665480,1.672679,1.585591,...,-0.931248,-1.027044,0.026719,-0.147457,-0.147457,-0.156166,-0.278089,-0.069078,-0.278089,-0.199710
2,16.410016,0.019216,-0.387407,-0.271229,-0.290592,-0.077599,4.395260,3.088255,1.916793,1.616666,...,-1.229699,-1.249062,-0.300274,0.232209,-0.300274,-0.319637,-0.106643,-0.348681,-0.416452,-0.126006
3,16.643856,0.205260,-0.220120,-0.314649,-0.446990,-0.229573,6.056606,2.738638,1.793348,1.434138,...,-0.276838,2.606297,0.157996,0.110731,-0.276838,-0.049968,-0.295743,-0.116138,-0.220120,-0.314649
4,12.432917,-0.392268,-0.592084,-0.569882,-0.473675,-0.436672,1.394969,1.864905,1.043442,0.902831,...,-0.436672,-0.495876,-0.444072,-0.407070,-0.340464,-0.340464,-0.407070,-0.288660,-0.384868,-0.392268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1241,19.137939,3.263034,1.706507,1.562867,1.498745,1.511159,1.476804,1.534067,1.521579,1.638698,...,-0.127747,-0.126131,-0.084798,-0.104667,-0.111291,-0.114262,-0.115615,-0.116412,0.383745,-0.127419
1242,19.934724,3.509199,2.092171,1.990867,1.926031,2.479410,1.841468,1.809790,1.893525,0.505208,...,-0.105531,-0.100797,-0.093698,-0.108237,-0.106846,-0.103998,-0.106333,-0.105603,-0.105744,-0.105014
1243,19.660820,3.364129,1.706101,1.543446,1.494779,1.478590,1.458402,2.085710,2.197708,1.399425,...,-0.108854,-0.119680,-0.099818,-0.123788,-0.127690,-0.123647,-0.128348,-0.123541,-0.128068,-0.126601
1244,24.398501,4.705596,2.365611,2.217600,2.163380,2.062343,2.010119,1.981302,2.008285,2.152084,...,-0.127656,-0.119580,-0.115194,-0.121251,-0.124015,-0.114678,-0.123691,-0.119007,-0.125609,-0.126870


In [18]:
y_test

0       480
1       421
2       385
3       408
4       433
       ... 
1241     16
1242      8
1243     16
1244      9
1245     47
Name: steady_state_starts, Length: 1246, dtype: int64

##Finding best hyperparameter values

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [23]:
# Select an algorithm
algorithm = KNeighborsRegressor()

# Define our candidate hyperparameters
#The maximum k value in square root of number of samples. Hence 60 is provided.
hp_candidates = [{'n_neighbors': range(2,60), 'weights': ['distance']}]
grid = bestParams(algorithm, hp_candidates)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

0.11026621202442677
KNeighborsRegressor(n_neighbors=54, weights='distance')
{'n_neighbors': 54, 'weights': 'distance'}


In [24]:

# Select an algorithm
algorithm = KNeighborsRegressor()
# Define our candidate hyperparameters
#finding the best algorithm
hp_candidates = [{'n_neighbors': [54], 'weights': ['distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}]
grid = bestParams(algorithm, hp_candidates)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

0.11026621270323966
KNeighborsRegressor(algorithm='ball_tree', n_neighbors=54, weights='distance')
{'algorithm': 'ball_tree', 'n_neighbors': 54, 'weights': 'distance'}


#Full Dataset

## Reading Data

In [25]:
data = pd.read_csv('/content/drive/MyDrive/AV_V/all_and_classification.csv')  # load data set


data = data.drop('run', axis=1)
data['forks'] = data['forks'].replace(['steady state'], 1)
data['forks'] = data['forks'].replace(['no steady state'], 0)
data = data.loc[data['forks'] == 1]
data = data.drop('forks', axis=1)
data.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,steady_state_starts,file_name
0,0.188482,0.075366,0.073728,0.072942,0.072614,0.07517,0.107217,0.087884,0.077857,0.076677,...,0.076481,0.076284,0.076349,0.076349,0.098763,0.076612,0.076153,0.076874,1765,eclipse__eclipse-collections#org.eclipse.colle...
1,0.188219,0.075891,0.073269,0.073335,0.073138,0.074449,0.10453,0.079888,0.071762,0.07209,...,0.073138,0.073204,0.073859,0.073204,0.073466,0.073269,0.0734,0.073597,480,eclipse__eclipse-collections#org.eclipse.colle...
2,0.190317,0.07517,0.072417,0.072221,0.073073,0.073531,0.113967,0.096076,0.088605,0.087949,...,0.076218,0.074908,0.074908,0.074842,0.073925,0.075497,0.073925,0.074514,421,eclipse__eclipse-collections#org.eclipse.colle...
3,0.190579,0.067994,0.073597,0.0734,0.073335,0.075235,0.106168,0.085983,0.078447,0.077464,...,0.074383,0.074973,0.074121,0.074842,0.083755,0.074646,0.07458,0.074711,1477,eclipse__eclipse-collections#org.eclipse.colle...
4,0.18586,0.074908,0.072155,0.072942,0.07281,0.074252,0.10453,0.095683,0.087753,0.085721,...,0.072745,0.076349,0.072745,0.072614,0.074056,0.072417,0.071959,0.073925,385,eclipse__eclipse-collections#org.eclipse.colle...


##Dividing data (train and test) based on file name 

In [26]:
data_file_name = data['file_name']
data_file_name

# data = data.drop('file_name', axis=1)


0       eclipse__eclipse-collections#org.eclipse.colle...
1       eclipse__eclipse-collections#org.eclipse.colle...
2       eclipse__eclipse-collections#org.eclipse.colle...
3       eclipse__eclipse-collections#org.eclipse.colle...
4       eclipse__eclipse-collections#org.eclipse.colle...
                              ...                        
5852    yellowstonegames__SquidLib#squidpony.performan...
5855    yellowstonegames__SquidLib#squidpony.performan...
5856    yellowstonegames__SquidLib#squidpony.performan...
5857    yellowstonegames__SquidLib#squidpony.performan...
5859    yellowstonegames__SquidLib#squidpony.performan...
Name: file_name, Length: 5219, dtype: object

In [27]:
data_file_name_unique = data_file_name.drop_duplicates(keep="first")

In [28]:
from sklearn.model_selection import train_test_split
data_file_name_train, data_file_name_test = train_test_split(data_file_name_unique, test_size=0.3,
                                                    random_state=1)

In [29]:
data_file_name_train 

4820    cantaloupe-project__cantaloupe#edu.illinois.li...
2240    RoaringBitmap__RoaringBitmap#org.roaringbitmap...
1021    raphw__byte-buddy#net.bytebuddy.benchmark.Supe...
520     jdbi__jdbi#org.jdbi.v3.benchmark.QualifiersBen...
3290    RoaringBitmap__RoaringBitmap#org.roaringbitmap...
                              ...                        
1290    cantaloupe-project__cantaloupe#edu.illinois.li...
1440    openzipkin__zipkin#zipkin2.internal.ReadBuffer...
720     openzipkin__zipkin#zipkin2.codec.ProtoCodecBen...
2350    yellowstonegames__SquidLib#squidpony.performan...
370     protostuff__protostuff#io.protostuff.benchmark...
Name: file_name, Length: 410, dtype: object

In [30]:
data_file_name_test

5120    apache__camel#org.apache.camel.itest.jmh.TypeC...
2250    zalando__logbook#org.zalando.logbook.HeaderBen...
230     zalando__logbook#org.zalando.logbook.HttpLogFo...
4510    protostuff__protostuff#io.protostuff.benchmark...
2420    jgrapht__jgrapht#org.jgrapht.perf.shortestpath...
                              ...                        
3640    eclipse__jetty.project#org.eclipse.jetty.util....
1570    cantaloupe-project__cantaloupe#edu.illinois.li...
160     netty__netty#io.netty.buffer.CompositeByteBufR...
5690    r2dbc__r2dbc-h2#io.r2dbc.h2.StagedResultSizeBe...
2180    netty__netty#io.netty.microbench.buffer.ByteBu...
Name: file_name, Length: 176, dtype: object

### Standardizing the data

In [31]:
data_without_file_name_and_y = data.drop('file_name', axis=1)
data_without_file_name_and_y = data_without_file_name_and_y.drop('steady_state_starts', axis=1)
sc = StandardScaler()
sc.fit(data_without_file_name_and_y.T)
data_std = pd.DataFrame(sc.transform(data_without_file_name_and_y.T))
data_std = data_std.T

In [32]:
data_std

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,15.729217,-0.446799,-0.681098,-0.793562,-0.840422,-0.474915,4.107977,1.343247,-0.090664,-0.259360,...,-0.437427,-0.362451,-0.287476,-0.315592,-0.306220,-0.306220,2.898993,-0.268732,-0.334336,-0.231244
1,14.768957,0.099609,-0.242733,-0.234175,-0.259850,-0.088679,3.839693,0.621680,-0.439580,-0.396787,...,-0.217058,-0.157148,-0.259850,-0.251292,-0.165706,-0.251292,-0.217058,-0.242733,-0.225616,-0.199940
2,15.188710,-0.112622,-0.478391,-0.504517,-0.391303,-0.330342,5.042978,2.665480,1.672679,1.585591,...,-0.931248,-1.027044,0.026719,-0.147457,-0.147457,-0.156166,-0.278089,-0.069078,-0.278089,-0.199710
3,17.012778,-1.129463,-0.300187,-0.329284,-0.338983,-0.057708,4.520286,1.532951,0.417550,0.272062,...,-0.135301,-0.183797,-0.183797,-0.096505,-0.222594,-0.115903,1.203180,-0.145001,-0.154700,-0.135301
4,16.410016,0.019216,-0.387407,-0.271229,-0.290592,-0.077599,4.395260,3.088255,1.916793,1.616666,...,-1.229699,-1.249062,-0.300274,0.232209,-0.300274,-0.319637,-0.106643,-0.348681,-0.416452,-0.126006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5214,3.136889,4.660223,-0.256262,0.023670,3.913239,5.662721,5.629702,5.735517,5.617517,9.586280,...,-0.304541,-0.287405,-0.299302,-0.122155,-0.275074,-0.277208,-0.276181,-0.275390,-0.278868,-0.225861
5215,1.874207,5.394570,6.439192,6.042358,6.282721,6.022390,5.812937,-0.251018,0.221640,1.063728,...,-0.206482,-0.147103,-0.271931,-0.269170,-0.260052,-0.257041,-0.261558,-0.209188,-0.197348,-0.209019
5216,3.137438,5.971989,-0.265681,-0.004903,4.313052,6.089951,6.076222,6.266721,6.106255,1.425192,...,1.058439,-0.349428,-0.313637,-0.315297,-0.318951,-0.315796,-0.195490,-0.215483,0.241000,4.485716
5217,1.904558,5.604098,6.647746,6.508702,6.354552,6.375264,5.634126,-0.244200,0.286686,0.940583,...,-0.254390,-0.279379,-0.287792,-0.238617,-0.311022,-0.097469,-0.274239,-0.276012,-0.174671,-0.277607


In [33]:
data_std = data_std.merge(data_file_name, left_index=True, right_index=True)
data_std = data_std.merge(data['steady_state_starts'], left_index=True, right_index=True)
data_std.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,file_name,steady_state_starts
0,15.729217,-0.446799,-0.681098,-0.793562,-0.840422,-0.474915,4.107977,1.343247,-0.090664,-0.25936,...,-0.287476,-0.315592,-0.30622,-0.30622,2.898993,-0.268732,-0.334336,-0.231244,eclipse__eclipse-collections#org.eclipse.colle...,1765
1,14.768957,0.099609,-0.242733,-0.234175,-0.25985,-0.088679,3.839693,0.62168,-0.43958,-0.396787,...,-0.25985,-0.251292,-0.165706,-0.251292,-0.217058,-0.242733,-0.225616,-0.19994,eclipse__eclipse-collections#org.eclipse.colle...,480
2,15.18871,-0.112622,-0.478391,-0.504517,-0.391303,-0.330342,5.042978,2.66548,1.672679,1.585591,...,0.026719,-0.147457,-0.147457,-0.156166,-0.278089,-0.069078,-0.278089,-0.19971,eclipse__eclipse-collections#org.eclipse.colle...,421
3,17.012778,-1.129463,-0.300187,-0.329284,-0.338983,-0.057708,4.520286,1.532951,0.41755,0.272062,...,-0.183797,-0.096505,-0.222594,-0.115903,1.20318,-0.145001,-0.1547,-0.135301,eclipse__eclipse-collections#org.eclipse.colle...,1477
4,16.410016,0.019216,-0.387407,-0.271229,-0.290592,-0.077599,4.39526,3.088255,1.916793,1.616666,...,-0.300274,0.232209,-0.300274,-0.319637,-0.106643,-0.348681,-0.416452,-0.126006,eclipse__eclipse-collections#org.eclipse.colle...,385


In [34]:
train_data = pd.merge(data_std, data_file_name_train, left_on='file_name', right_on='file_name')
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,file_name,steady_state_starts
0,19.850395,3.919269,1.088619,-0.649525,-0.757397,-1.104877,-0.836836,0.151414,0.906579,0.630874,...,-0.951656,-0.933434,-1.115561,-1.184451,-1.021878,-0.961322,-0.977089,-0.958776,JCTools__JCTools#org.jctools.channels.spsc.Sps...,2
1,2.562516,0.504552,0.342755,-0.072082,-0.006673,0.018932,-0.016947,-0.211087,-0.074588,0.022937,...,-0.147025,-0.215504,-0.149828,-0.150955,-0.100794,-0.152613,-0.152049,-0.159221,JCTools__JCTools#org.jctools.channels.spsc.Sps...,1
2,25.698245,5.467191,2.361931,1.014769,-0.242448,-0.836839,0.831208,0.740838,0.125529,0.816314,...,0.640598,1.926576,0.788552,0.677549,0.775965,0.707994,0.705428,0.791918,JCTools__JCTools#org.jctools.channels.spsc.Sps...,1344
3,1.958411,0.311209,0.068767,-0.073209,0.103914,0.044457,0.084911,0.125495,0.065075,-0.084904,...,0.074025,-0.080087,-0.084907,-0.082503,0.042396,0.037655,0.114309,0.041861,JCTools__JCTools#org.jctools.channels.spsc.Sps...,1
4,6.930946,8.373801,3.318279,2.115085,0.717736,0.498182,-0.791478,1.750784,3.681676,3.679147,...,-0.337158,-0.338733,-0.337777,-0.115834,-0.326962,-0.325566,-0.303728,-0.327647,JCTools__JCTools#org.jctools.channels.spsc.Sps...,1


In [35]:
test_data = pd.merge(data_std, data_file_name_test, left_on='file_name', right_on='file_name')
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,file_name,steady_state_starts
0,15.729217,-0.446799,-0.681098,-0.793562,-0.840422,-0.474915,4.107977,1.343247,-0.090664,-0.25936,...,-0.287476,-0.315592,-0.30622,-0.30622,2.898993,-0.268732,-0.334336,-0.231244,eclipse__eclipse-collections#org.eclipse.colle...,1765
1,14.768957,0.099609,-0.242733,-0.234175,-0.25985,-0.088679,3.839693,0.62168,-0.43958,-0.396787,...,-0.25985,-0.251292,-0.165706,-0.251292,-0.217058,-0.242733,-0.225616,-0.19994,eclipse__eclipse-collections#org.eclipse.colle...,480
2,15.18871,-0.112622,-0.478391,-0.504517,-0.391303,-0.330342,5.042978,2.66548,1.672679,1.585591,...,0.026719,-0.147457,-0.147457,-0.156166,-0.278089,-0.069078,-0.278089,-0.19971,eclipse__eclipse-collections#org.eclipse.colle...,421
3,17.012778,-1.129463,-0.300187,-0.329284,-0.338983,-0.057708,4.520286,1.532951,0.41755,0.272062,...,-0.183797,-0.096505,-0.222594,-0.115903,1.20318,-0.145001,-0.1547,-0.135301,eclipse__eclipse-collections#org.eclipse.colle...,1477
4,16.410016,0.019216,-0.387407,-0.271229,-0.290592,-0.077599,4.39526,3.088255,1.916793,1.616666,...,-0.300274,0.232209,-0.300274,-0.319637,-0.106643,-0.348681,-0.416452,-0.126006,eclipse__eclipse-collections#org.eclipse.colle...,385


In [36]:
train_data = train_data.drop('file_name', axis=1)
test_data = test_data.drop('file_name', axis=1)

In [37]:
X_train = train_data.drop('steady_state_starts', axis=1) 
y_train  = train_data['steady_state_starts']
X_test = test_data.drop('steady_state_starts', axis=1)
y_test = test_data['steady_state_starts']

In [38]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,19.850395,3.919269,1.088619,-0.649525,-0.757397,-1.104877,-0.836836,0.151414,0.906579,0.630874,...,-1.129076,-0.653716,-0.951656,-0.933434,-1.115561,-1.184451,-1.021878,-0.961322,-0.977089,-0.958776
1,2.562516,0.504552,0.342755,-0.072082,-0.006673,0.018932,-0.016947,-0.211087,-0.074588,0.022937,...,-0.153156,-0.145483,-0.147025,-0.215504,-0.149828,-0.150955,-0.100794,-0.152613,-0.152049,-0.159221
2,25.698245,5.467191,2.361931,1.014769,-0.242448,-0.836839,0.831208,0.740838,0.125529,0.816314,...,1.934241,0.912192,0.640598,1.926576,0.788552,0.677549,0.775965,0.707994,0.705428,0.791918
3,1.958411,0.311209,0.068767,-0.073209,0.103914,0.044457,0.084911,0.125495,0.065075,-0.084904,...,0.033093,0.039486,0.074025,-0.080087,-0.084907,-0.082503,0.042396,0.037655,0.114309,0.041861
4,6.930946,8.373801,3.318279,2.115085,0.717736,0.498182,-0.791478,1.750784,3.681676,3.679147,...,-0.335963,-0.336712,-0.337158,-0.338733,-0.337777,-0.115834,-0.326962,-0.325566,-0.303728,-0.327647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3308,3.136889,4.660223,-0.256262,0.023670,3.913239,5.662721,5.629702,5.735517,5.617517,9.586280,...,-0.304541,-0.287405,-0.299302,-0.122155,-0.275074,-0.277208,-0.276181,-0.275390,-0.278868,-0.225861
3309,1.874207,5.394570,6.439192,6.042358,6.282721,6.022390,5.812937,-0.251018,0.221640,1.063728,...,-0.206482,-0.147103,-0.271931,-0.269170,-0.260052,-0.257041,-0.261558,-0.209188,-0.197348,-0.209019
3310,3.137438,5.971989,-0.265681,-0.004903,4.313052,6.089951,6.076222,6.266721,6.106255,1.425192,...,1.058439,-0.349428,-0.313637,-0.315297,-0.318951,-0.315796,-0.195490,-0.215483,0.241000,4.485716
3311,1.904558,5.604098,6.647746,6.508702,6.354552,6.375264,5.634126,-0.244200,0.286686,0.940583,...,-0.254390,-0.279379,-0.287792,-0.238617,-0.311022,-0.097469,-0.274239,-0.276012,-0.174671,-0.277607


In [39]:
y_train

0          2
1          1
2       1344
3          1
4          1
        ... 
3308      34
3309       3
3310       0
3311      43
3312       3
Name: steady_state_starts, Length: 3313, dtype: int64

In [40]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,15.729217,-0.446799,-0.681098,-0.793562,-0.840422,-0.474915,4.107977,1.343247,-0.090664,-0.259360,...,-0.437427,-0.362451,-0.287476,-0.315592,-0.306220,-0.306220,2.898993,-0.268732,-0.334336,-0.231244
1,14.768957,0.099609,-0.242733,-0.234175,-0.259850,-0.088679,3.839693,0.621680,-0.439580,-0.396787,...,-0.217058,-0.157148,-0.259850,-0.251292,-0.165706,-0.251292,-0.217058,-0.242733,-0.225616,-0.199940
2,15.188710,-0.112622,-0.478391,-0.504517,-0.391303,-0.330342,5.042978,2.665480,1.672679,1.585591,...,-0.931248,-1.027044,0.026719,-0.147457,-0.147457,-0.156166,-0.278089,-0.069078,-0.278089,-0.199710
3,17.012778,-1.129463,-0.300187,-0.329284,-0.338983,-0.057708,4.520286,1.532951,0.417550,0.272062,...,-0.135301,-0.183797,-0.183797,-0.096505,-0.222594,-0.115903,1.203180,-0.145001,-0.154700,-0.135301
4,16.410016,0.019216,-0.387407,-0.271229,-0.290592,-0.077599,4.395260,3.088255,1.916793,1.616666,...,-1.229699,-1.249062,-0.300274,0.232209,-0.300274,-0.319637,-0.106643,-0.348681,-0.416452,-0.126006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1337,6.646015,2.175950,2.976109,0.682660,1.710550,0.451408,0.264107,-0.098240,-0.116836,-0.101741,...,-0.129195,-0.114032,-0.121038,-0.120507,-0.131157,-0.129075,-0.129540,-0.133763,-0.123670,-0.121364
1338,5.013476,0.547533,2.417344,2.338879,2.334768,2.364399,2.357454,2.299245,2.350712,2.410073,...,-0.188026,-0.179000,-0.191830,-0.250155,-0.187138,-0.243065,-0.256042,-0.201937,-0.348957,5.649865
1339,3.968026,0.439111,1.868381,1.843385,1.824646,1.801049,1.793119,1.798880,1.806291,1.799633,...,-0.304525,-0.293686,-0.387285,-0.403954,-0.388946,-0.389759,-0.403468,-0.391349,-0.394913,-0.403236
1340,15.634223,2.942906,4.797148,4.684763,4.539094,4.691964,4.730527,4.725243,4.865671,4.833379,...,-0.177700,-0.042306,-0.147614,-0.244292,-0.228255,-0.231545,-0.247713,0.328117,-0.182624,-0.202054


In [41]:
y_test

0       1765
1        480
2        421
3       1477
4        385
        ... 
1337     147
1338    1883
1339     218
1340    2169
1341     425
Name: steady_state_starts, Length: 1342, dtype: int64

##Finding best hyperparameter values

In [42]:
# Select an algorithm
algorithm = KNeighborsRegressor()

# Define our candidate hyperparameters
#The maximum k value in square root of number of samples. Hence 60 is provided.
hp_candidates = [{'n_neighbors': range(2,60), 'weights': ['distance']}]
grid = bestParams(algorithm, hp_candidates)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

-0.01618979591200313
KNeighborsRegressor(n_neighbors=56, weights='distance')
{'n_neighbors': 56, 'weights': 'distance'}


In [43]:

# Select an algorithm
algorithm = KNeighborsRegressor()
# Define our candidate hyperparameters
#finding the best algorithm
hp_candidates = [{'n_neighbors': [56], 'weights': ['distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}]
grid = bestParams(algorithm, hp_candidates)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

-0.01618979591200313
KNeighborsRegressor(n_neighbors=56, weights='distance')
{'algorithm': 'auto', 'n_neighbors': 56, 'weights': 'distance'}
