#Reduced Dataset

## python imports

In [5]:
import numpy as np
import matplotlib.pyplot as plt  # To visualize
import pandas as pd  # To read data
from sklearn import linear_model, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [6]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

seed = 13
kfold = KFold(n_splits=3, shuffle=True, random_state=seed)

def bestParams(algorithm, hp_candidates):
  grid = GridSearchCV(estimator=algorithm, param_grid=hp_candidates, cv=kfold, scoring='r2')
  grid.fit(X_train, y_train)
  return grid

##Mount Drive

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Reading Data

In [8]:
data = pd.read_csv('/content/drive/MyDrive/AV_V/all_and_classification.csv')  # load data set


data = data.drop('run', axis=1)
data['forks'] = data['forks'].replace(['steady state'], 1)
data['forks'] = data['forks'].replace(['no steady state'], 0)
data = data.loc[data['forks'] == 1]
data = data.drop('forks', axis=1)
data = data.loc[data['steady_state_starts']<= 625]
data


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,steady_state_starts,file_name
1,0.188219,0.075891,0.073269,0.073335,0.073138,0.074449,0.104530,0.079888,0.071762,0.072090,...,0.073138,0.073204,0.073859,0.073204,0.073466,0.073269,0.073400,0.073597,480,eclipse__eclipse-collections#org.eclipse.colle...
2,0.190317,0.075170,0.072417,0.072221,0.073073,0.073531,0.113967,0.096076,0.088605,0.087949,...,0.076218,0.074908,0.074908,0.074842,0.073925,0.075497,0.073925,0.074514,421,eclipse__eclipse-collections#org.eclipse.colle...
4,0.185860,0.074908,0.072155,0.072942,0.072810,0.074252,0.104530,0.095683,0.087753,0.085721,...,0.072745,0.076349,0.072745,0.072614,0.074056,0.072417,0.071959,0.073925,385,eclipse__eclipse-collections#org.eclipse.colle...
6,0.191103,0.077136,0.074187,0.073531,0.072614,0.074121,0.117703,0.094700,0.088146,0.085656,...,0.076808,0.076481,0.073794,0.075366,0.073662,0.074908,0.074187,0.073531,408,eclipse__eclipse-collections#org.eclipse.colle...
7,0.188219,0.074646,0.072876,0.073073,0.073925,0.074252,0.090472,0.094634,0.087359,0.086114,...,0.074187,0.074514,0.075104,0.075104,0.074514,0.075563,0.074711,0.074646,433,eclipse__eclipse-collections#org.eclipse.colle...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5848,0.000042,0.000024,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,...,0.000017,0.000017,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,53,apache__logging-log4j2#org.apache.logging.log4...
5849,0.000045,0.000023,0.000031,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,...,0.000017,0.000017,0.000017,0.000018,0.000017,0.000017,0.000017,0.000017,321,apache__logging-log4j2#org.apache.logging.log4...
5851,0.001629,0.002516,0.002812,0.002784,0.002766,0.002760,0.003237,0.001095,0.001196,0.001301,...,0.001093,0.001142,0.001118,0.001779,0.002299,0.001127,0.001089,0.001095,395,yellowstonegames__SquidLib#squidpony.performan...
5855,0.001660,0.002597,0.002875,0.002770,0.002834,0.002764,0.002709,0.001095,0.001221,0.001445,...,0.001089,0.001090,0.001092,0.001093,0.001092,0.001106,0.001109,0.001106,190,yellowstonegames__SquidLib#squidpony.performan...


##Dividing data (train and test) based on file name 

In [9]:
data_file_name = data['file_name']

1       eclipse__eclipse-collections#org.eclipse.colle...
2       eclipse__eclipse-collections#org.eclipse.colle...
4       eclipse__eclipse-collections#org.eclipse.colle...
6       eclipse__eclipse-collections#org.eclipse.colle...
7       eclipse__eclipse-collections#org.eclipse.colle...
                              ...                        
5848    apache__logging-log4j2#org.apache.logging.log4...
5849    apache__logging-log4j2#org.apache.logging.log4...
5851    yellowstonegames__SquidLib#squidpony.performan...
5855    yellowstonegames__SquidLib#squidpony.performan...
5856    yellowstonegames__SquidLib#squidpony.performan...
Name: file_name, Length: 4234, dtype: object

In [10]:
data_file_name_unique = data_file_name.drop_duplicates(keep="first")

In [11]:
from sklearn.model_selection import train_test_split
data_file_name_train, data_file_name_test = train_test_split(data_file_name_unique, test_size=0.3,
                                                    random_state=1)

In [14]:
train_data = pd.merge(data, data_file_name_train, left_on='file_name', right_on='file_name')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,steady_state_starts,file_name
0,1.275195e-07,9.92695e-08,6.811809e-08,6.398774e-08,6.178975e-08,5.95337e-08,5.963519e-08,6.025651e-08,5.934733e-08,6.251917e-08,...,6.158755e-08,6.190392e-08,6.259095e-08,6.171464e-08,6.165376e-08,6.169033e-08,6.183229e-08,6.294976e-08,2,JCTools__JCTools#org.jctools.channels.spsc.Sps...
1,1.35583e-07,8.028709e-08,7.046209e-08,6.442911e-08,6.405469e-08,6.284861e-08,6.377896e-08,6.720911e-08,6.983024e-08,6.887329e-08,...,6.338043e-08,6.344368e-08,6.281152e-08,6.257241e-08,6.313669e-08,6.334688e-08,6.329215e-08,6.335572e-08,1,JCTools__JCTools#org.jctools.channels.spsc.Sps...
2,2.332804e-07,7.014395e-08,6.537276e-08,6.246478e-08,6.28974e-08,6.446362e-08,6.377569e-08,6.242375e-08,6.24037e-08,6.766667e-08,...,6.821299e-08,6.777635e-08,6.819488e-08,7.000788e-08,6.753254e-08,6.754947e-08,7.025719e-08,6.747891e-08,1,JCTools__JCTools#org.jctools.channels.spsc.Sps...
3,1.29803e-07,7.878544e-08,7.095477e-08,6.755758e-08,6.43872e-08,6.28883e-08,6.709468e-08,6.686679e-08,6.531514e-08,6.705712e-08,...,6.661401e-08,6.985692e-08,6.698712e-08,6.670719e-08,6.695537e-08,6.678397e-08,6.67775e-08,6.69956e-08,1,JCTools__JCTools#org.jctools.channels.spsc.Sps...
4,6.84424e-05,7.579114e-05,5.004239e-05,4.391429e-05,3.679732e-05,3.567909e-05,2.911061e-05,4.205884e-05,5.189324e-05,5.188036e-05,...,3.142454e-05,3.141652e-05,3.142139e-05,3.255179e-05,3.147647e-05,3.148359e-05,3.159481e-05,3.147299e-05,337,apache__camel#org.apache.camel.converter.Conve...


In [15]:
test_data = pd.merge(data, data_file_name_test, left_on='file_name', right_on='file_name')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,steady_state_starts,file_name
0,0.188219,0.075891,0.073269,0.073335,0.073138,0.074449,0.10453,0.079888,0.071762,0.07209,...,0.073138,0.073204,0.073859,0.073204,0.073466,0.073269,0.0734,0.073597,480,eclipse__eclipse-collections#org.eclipse.colle...
1,0.190317,0.07517,0.072417,0.072221,0.073073,0.073531,0.113967,0.096076,0.088605,0.087949,...,0.076218,0.074908,0.074908,0.074842,0.073925,0.075497,0.073925,0.074514,421,eclipse__eclipse-collections#org.eclipse.colle...
2,0.18586,0.074908,0.072155,0.072942,0.07281,0.074252,0.10453,0.095683,0.087753,0.085721,...,0.072745,0.076349,0.072745,0.072614,0.074056,0.072417,0.071959,0.073925,385,eclipse__eclipse-collections#org.eclipse.colle...
3,0.191103,0.077136,0.074187,0.073531,0.072614,0.074121,0.117703,0.0947,0.088146,0.085656,...,0.076808,0.076481,0.073794,0.075366,0.073662,0.074908,0.074187,0.073531,408,eclipse__eclipse-collections#org.eclipse.colle...
4,0.188219,0.074646,0.072876,0.073073,0.073925,0.074252,0.090472,0.094634,0.087359,0.086114,...,0.074187,0.074514,0.075104,0.075104,0.074514,0.075563,0.074711,0.074646,433,eclipse__eclipse-collections#org.eclipse.colle...


In [16]:
train_data = train_data.drop('file_name', axis=1)
test_data = test_data.drop('file_name', axis=1)

In [17]:
X_train = train_data.drop('steady_state_starts', axis=1) 
y_train  = train_data['steady_state_starts']
X_test = test_data.drop('steady_state_starts', axis=1)
y_test = test_data['steady_state_starts']

### Standardizing the data

In [18]:
sc = StandardScaler()
sc.fit(X_train.T)
X_train = pd.DataFrame(sc.transform(X_train.T))
X_train = X_train.T

In [19]:
sc = StandardScaler()
sc.fit(X_test.T)
X_test = pd.DataFrame(sc.transform(X_test.T))
X_test = X_test.T

##Finding best hyperparameter values

In [24]:
from sklearn import linear_model
import numpy as np

In [32]:
parameters = {'alpha' : np.linspace(1, 21, 21)}
lassoReg = linear_model.Lasso()

grid = bestParams(lassoReg, parameters)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

0.0511402242264977
Lasso(alpha=6.0)
{'alpha': 6.0}


#Full Dataset

## Reading Data

In [34]:
data = pd.read_csv('/content/drive/MyDrive/AV_V/all_and_classification.csv')  # load data set


data = data.drop('run', axis=1)
data['forks'] = data['forks'].replace(['steady state'], 1)
data['forks'] = data['forks'].replace(['no steady state'], 0)
data = data.loc[data['forks'] == 1]
data = data.drop('forks', axis=1)


##Dividing data (train and test) based on file name 

In [35]:
data_file_name = data['file_name']


In [36]:
data_file_name_unique = data_file_name.drop_duplicates(keep="first")

In [37]:
from sklearn.model_selection import train_test_split
data_file_name_train, data_file_name_test = train_test_split(data_file_name_unique, test_size=0.3,
                                                    random_state=1)

### Standardizing the data

In [38]:
data_without_file_name_and_y = data.drop('file_name', axis=1)
data_without_file_name_and_y = data_without_file_name_and_y.drop('steady_state_starts', axis=1)
sc = StandardScaler()
sc.fit(data_without_file_name_and_y.T)
data_std = pd.DataFrame(sc.transform(data_without_file_name_and_y.T))
data_std = data_std.T

In [39]:
data_std = data_std.merge(data_file_name, left_index=True, right_index=True)
data_std = data_std.merge(data['steady_state_starts'], left_index=True, right_index=True)

In [40]:
train_data = pd.merge(data_std, data_file_name_train, left_on='file_name', right_on='file_name')

In [41]:
test_data = pd.merge(data_std, data_file_name_test, left_on='file_name', right_on='file_name')

In [42]:
train_data = train_data.drop('file_name', axis=1)
test_data = test_data.drop('file_name', axis=1)

In [43]:
X_train = train_data.drop('steady_state_starts', axis=1) 
y_train  = train_data['steady_state_starts']
X_test = test_data.drop('steady_state_starts', axis=1)
y_test = test_data['steady_state_starts']

##Finding best hyperparameter values

In [47]:
parameters = {'alpha' : np.linspace(1, 200, 200)}
lassoReg = linear_model.Lasso()

grid = bestParams(lassoReg, parameters)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)

0.00017435351516363143
Lasso(alpha=73.0)
{'alpha': 73.0}
