#Reduced Dataset

## python imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt  # To visualize
import pandas as pd  # To read data
from sklearn import linear_model, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

##Mount Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Reading Data

In [3]:
data = pd.read_csv('/content/drive/MyDrive/AV_V/all_and_classification.csv')  # load data set


data = data.drop('run', axis=1)
data['forks'] = data['forks'].replace(['steady state'], 1)
data['forks'] = data['forks'].replace(['no steady state'], 0)
data = data.loc[data['forks'] == 1]
data = data.drop('forks', axis=1)
data = data.loc[data['steady_state_starts']<= 625]
data


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2992,2993,2994,2995,2996,2997,2998,2999,steady_state_starts,file_name
1,0.188219,0.075891,0.073269,0.073335,0.073138,0.074449,0.104530,0.079888,0.071762,0.072090,...,0.073138,0.073204,0.073859,0.073204,0.073466,0.073269,0.073400,0.073597,480,eclipse__eclipse-collections#org.eclipse.colle...
2,0.190317,0.075170,0.072417,0.072221,0.073073,0.073531,0.113967,0.096076,0.088605,0.087949,...,0.076218,0.074908,0.074908,0.074842,0.073925,0.075497,0.073925,0.074514,421,eclipse__eclipse-collections#org.eclipse.colle...
4,0.185860,0.074908,0.072155,0.072942,0.072810,0.074252,0.104530,0.095683,0.087753,0.085721,...,0.072745,0.076349,0.072745,0.072614,0.074056,0.072417,0.071959,0.073925,385,eclipse__eclipse-collections#org.eclipse.colle...
6,0.191103,0.077136,0.074187,0.073531,0.072614,0.074121,0.117703,0.094700,0.088146,0.085656,...,0.076808,0.076481,0.073794,0.075366,0.073662,0.074908,0.074187,0.073531,408,eclipse__eclipse-collections#org.eclipse.colle...
7,0.188219,0.074646,0.072876,0.073073,0.073925,0.074252,0.090472,0.094634,0.087359,0.086114,...,0.074187,0.074514,0.075104,0.075104,0.074514,0.075563,0.074711,0.074646,433,eclipse__eclipse-collections#org.eclipse.colle...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5848,0.000042,0.000024,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,...,0.000017,0.000017,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,53,apache__logging-log4j2#org.apache.logging.log4...
5849,0.000045,0.000023,0.000031,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,0.000030,...,0.000017,0.000017,0.000017,0.000018,0.000017,0.000017,0.000017,0.000017,321,apache__logging-log4j2#org.apache.logging.log4...
5851,0.001629,0.002516,0.002812,0.002784,0.002766,0.002760,0.003237,0.001095,0.001196,0.001301,...,0.001093,0.001142,0.001118,0.001779,0.002299,0.001127,0.001089,0.001095,395,yellowstonegames__SquidLib#squidpony.performan...
5855,0.001660,0.002597,0.002875,0.002770,0.002834,0.002764,0.002709,0.001095,0.001221,0.001445,...,0.001089,0.001090,0.001092,0.001093,0.001092,0.001106,0.001109,0.001106,190,yellowstonegames__SquidLib#squidpony.performan...


##Dividing data (train and test) based on file name 

In [4]:
data_file_name = data['file_name']

In [5]:
data_file_name_unique = data_file_name.drop_duplicates(keep="first")

In [6]:
from sklearn.model_selection import train_test_split
data_file_name_train, data_file_name_test = train_test_split(data_file_name_unique, test_size=0.3,
                                                    random_state=1)

In [7]:
train_data = pd.merge(data, data_file_name_train, left_on='file_name', right_on='file_name')

In [8]:
test_data = pd.merge(data, data_file_name_test, left_on='file_name', right_on='file_name')

In [9]:
train_data = train_data.drop('file_name', axis=1)
test_data = test_data.drop('file_name', axis=1)

In [10]:
X_train = train_data.drop('steady_state_starts', axis=1) 
y_train  = train_data['steady_state_starts']
X_test = test_data.drop('steady_state_starts', axis=1)
y_test = test_data['steady_state_starts']

### Standardizing the data

In [11]:
sc = StandardScaler()
sc.fit(X_train.T)
X_train = pd.DataFrame(sc.transform(X_train.T))
X_train = X_train.T

In [12]:
sc = StandardScaler()
sc.fit(X_test.T)
X_test = pd.DataFrame(sc.transform(X_test.T))
X_test = X_test.T

##Finding best hyperparameter values

In [13]:
from sklearn.gaussian_process import GaussianProcessRegressor

In [14]:
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

kernel = DotProduct() + WhiteKernel()
gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)
gpr.fit(X_train, y_train)
y_pred_gpr = gpr.predict(X_test)
gpr.score(X_train, y_train)

gpr_mae = mean_absolute_error(y_test, y_pred_gpr)
gpr_mse = mean_squared_error(y_test, y_pred_gpr)
gpr_rmse = mean_squared_error(y_test, y_pred_gpr, squared=False)
gpr_r2 = r2_score(y_test, y_pred_gpr)

print("MAE:", gpr_mae)
print("MSE:", gpr_mse)
print("RMSE:", gpr_rmse)
print("R2 Score:", gpr_r2)

MAE: 114.86565199726608
MSE: 22958.088045056036
RMSE: 151.51926625038823
R2 Score: -0.039033641855409806


In [15]:
from sklearn.gaussian_process.kernels import RBF, ConstantKernel

kernel = RBF() + ConstantKernel(constant_value=2)
gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)
gpr.fit(X_train, y_train)
y_pred_gpr = gpr.predict(X_test)
gpr.score(X_train, y_train)

gpr_mae = mean_absolute_error(y_test, y_pred_gpr)
gpr_mse = mean_squared_error(y_test, y_pred_gpr)
gpr_rmse = mean_squared_error(y_test, y_pred_gpr, squared=False)
gpr_r2 = r2_score(y_test, y_pred_gpr)

print("MAE:", gpr_mae)
print("MSE:", gpr_mse)
print("RMSE:", gpr_rmse)
print("R2 Score:", gpr_r2)



MAE: 115.17578824821196
MSE: 22135.60161693318
RMSE: 148.78038048389706
R2 Score: -0.0018096767276529757


In [19]:
from sklearn.gaussian_process.kernels import ExpSineSquared

kernel = ExpSineSquared(length_scale=1, periodicity=1)
gpr = GaussianProcessRegressor(kernel=kernel, alpha=10000, random_state=0)
gpr.fit(X_train, y_train)
y_pred_gpr = gpr.predict(X_test)
gpr.score(X_train, y_train)

gpr_mae = mean_absolute_error(y_test, y_pred_gpr)
gpr_mse = mean_squared_error(y_test, y_pred_gpr)
gpr_rmse = mean_squared_error(y_test, y_pred_gpr, squared=False)
gpr_r2 = r2_score(y_test, y_pred_gpr)

print("MAE:", gpr_mae)
print("MSE:", gpr_mse)
print("RMSE:", gpr_rmse)
print("R2 Score:", gpr_r2)



MAE: 98.61034435540202
MSE: 28278.481096071162
RMSE: 168.16206794658288
R2 Score: -0.27982317785899324


#Full Dataset

## Reading Data

In [20]:
data = pd.read_csv('/content/drive/MyDrive/AV_V/all_and_classification.csv')  # load data set


data = data.drop('run', axis=1)
data['forks'] = data['forks'].replace(['steady state'], 1)
data['forks'] = data['forks'].replace(['no steady state'], 0)
data = data.loc[data['forks'] == 1]
data = data.drop('forks', axis=1)


##Dividing data (train and test) based on file name 

In [21]:
data_file_name = data['file_name']


In [22]:
data_file_name_unique = data_file_name.drop_duplicates(keep="first")

In [23]:
from sklearn.model_selection import train_test_split
data_file_name_train, data_file_name_test = train_test_split(data_file_name_unique, test_size=0.3,
                                                    random_state=1)

### Standardizing the data

In [24]:
data_without_file_name_and_y = data.drop('file_name', axis=1)
data_without_file_name_and_y = data_without_file_name_and_y.drop('steady_state_starts', axis=1)
sc = StandardScaler()
sc.fit(data_without_file_name_and_y.T)
data_std = pd.DataFrame(sc.transform(data_without_file_name_and_y.T))
data_std = data_std.T

In [25]:
data_std = data_std.merge(data_file_name, left_index=True, right_index=True)
data_std = data_std.merge(data['steady_state_starts'], left_index=True, right_index=True)

In [26]:
train_data = pd.merge(data_std, data_file_name_train, left_on='file_name', right_on='file_name')

In [27]:
test_data = pd.merge(data_std, data_file_name_test, left_on='file_name', right_on='file_name')

In [28]:
train_data = train_data.drop('file_name', axis=1)
test_data = test_data.drop('file_name', axis=1)

In [29]:
X_train = train_data.drop('steady_state_starts', axis=1) 
y_train  = train_data['steady_state_starts']
X_test = test_data.drop('steady_state_starts', axis=1)
y_test = test_data['steady_state_starts']

##Finding best hyperparameter values

In [30]:
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

kernel = DotProduct() + WhiteKernel()
gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)
gpr.fit(X_train, y_train)
y_pred_gpr = gpr.predict(X_test)
gpr.score(X_train, y_train)

gpr_mae = mean_absolute_error(y_test, y_pred_gpr)
gpr_mse = mean_squared_error(y_test, y_pred_gpr)
gpr_rmse = mean_squared_error(y_test, y_pred_gpr, squared=False)
gpr_r2 = r2_score(y_test, y_pred_gpr)

print("MAE:", gpr_mae)
print("MSE:", gpr_mse)
print("RMSE:", gpr_rmse)
print("R2 Score:", gpr_r2)



MAE: 483.8444367680692
MSE: 449462.27448700916
RMSE: 670.4194765122871
R2 Score: -0.004614182711674353


In [31]:
from sklearn.gaussian_process.kernels import RBF, ConstantKernel

kernel = RBF() + ConstantKernel(constant_value=2)
gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)
gpr.fit(X_train, y_train)
y_pred_gpr = gpr.predict(X_test)
gpr.score(X_train, y_train)

gpr_mae = mean_absolute_error(y_test, y_pred_gpr)
gpr_mse = mean_squared_error(y_test, y_pred_gpr)
gpr_rmse = mean_squared_error(y_test, y_pred_gpr, squared=False)
gpr_r2 = r2_score(y_test, y_pred_gpr)

print("MAE:", gpr_mae)
print("MSE:", gpr_mse)
print("RMSE:", gpr_rmse)
print("R2 Score:", gpr_r2)



MAE: 483.99180930189715
MSE: 447452.1295035199
RMSE: 668.9186269670773
R2 Score: -0.00012121350302418676


In [32]:
from sklearn.gaussian_process.kernels import ExpSineSquared

kernel = ExpSineSquared(length_scale=1, periodicity=1)
gpr = GaussianProcessRegressor(kernel=kernel, alpha=10000, random_state=0)
gpr.fit(X_train, y_train)
y_pred_gpr = gpr.predict(X_test)
gpr.score(X_train, y_train)

gpr_mae = mean_absolute_error(y_test, y_pred_gpr)
gpr_mse = mean_squared_error(y_test, y_pred_gpr)
gpr_rmse = mean_squared_error(y_test, y_pred_gpr, squared=False)
gpr_r2 = r2_score(y_test, y_pred_gpr)

print("MAE:", gpr_mae)
print("MSE:", gpr_mse)
print("RMSE:", gpr_rmse)
print("R2 Score:", gpr_r2)



MAE: 384.7450991114654
MSE: 531188.3314463728
RMSE: 728.8266813491207
R2 Score: -0.18728392070511735
