In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from si.data.dataset import Dataset
from si.io.CSV import read_csv
from si.model_selection.cross_validate import cross_validate
from si.model_selection.grid_search import grid_search_cv
from si.model_selection.randomized_search import randomized_search_cv
from si.model_selection.split import train_test_split

# Models
from si.linear_model.logistic_regression import LogisticRegression

# Cross validation

In [2]:
# Load the breast-cancer (binary) dataset.
# The CSV location can be overridden with the BREAST_BIN_CSV environment
# variable so the notebook is not tied to a single machine. When the variable
# is unset, the original local path is used and behaviour is unchanged.
BREAST_BIN_CSV = os.environ.get(
    "BREAST_BIN_CSV",
    "C:/Users/rober/si/datasets/breast/breast-bin.csv",
)

# Positional arguments are kept exactly as in the original call: "," is the
# separator; False and -1 are presumably "no feature names" and "label in the
# last column" -- TODO confirm against si.io.CSV.read_csv.
data = read_csv(BREAST_BIN_CSV, ",", False, -1)

# Standardise the feature matrix in place (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on the whole dataset before any train/test
# split is made, so held-out samples contribute to the scaling statistics.
# Consider fitting the scaler on the training portion only.
data.X = StandardScaler().fit_transform(data.X)

In [3]:
# Score a default LogisticRegression over 5 cross-validation rounds, each
# called with test_size=0.3 (presumably a 30% hold-out -- confirm against
# si.model_selection.cross_validate).
model = LogisticRegression()
scores = cross_validate(model, data, cv=5, test_size=0.3)

# Tabulate the per-round results so seed / train / test line up in columns.
scores_dataframe = pd.DataFrame(data=scores)
print(scores_dataframe)

   seed     train      test
0   778  0.965306  0.971292
1   712  0.967347  0.976077
2   921  0.971429  0.956938
3   232  0.967347  0.966507
4   480  0.963265  0.971292


# Grid search

In [4]:
# Hyper-parameter grid handed to grid_search_cv: two candidate values for each
# of the three LogisticRegression settings.
parameters = dict(
    l2_penalty=[1, 10],
    alpha=[0.001, 0.0001],
    max_iter=[1000, 2000],
)

model = LogisticRegression()

# scale=False: the features were already standardised when the data was
# loaded (presumably that is why scaling is disabled here -- confirm).
# verbose=True makes the search print the seeds and scores per parameter set.
# The third return value rebinds `model` (presumably to the best fitted
# estimator -- confirm against si.model_selection.grid_search).
all_scores, best_scores, model = grid_search_cv(
    model,
    data,
    parameters,
    cv=3,
    test_size=0.3,
    scale = False,
    verbose=True,
)

print()
print("BEST PARAMS:")
print(best_scores)


Params:			 {'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 1000}
Seeds:			 [731, 213, 491]
Train scores:	 [0.9714285714285714, 0.9673469387755103, 0.9755102040816327]
Test scores:	 [0.9569377990430622, 0.9665071770334929, 0.9473684210526315]

Params:			 {'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 2000}
Seeds:			 [538, 254, 154]
Train scores:	 [0.9714285714285714, 0.9673469387755103, 0.9653061224489796]
Test scores:	 [0.9569377990430622, 0.9712918660287081, 0.9712918660287081]

Params:			 {'l2_penalty': 1, 'alpha': 0.0001, 'max_iter': 1000}
Seeds:			 [948, 245, 27]
Train scores:	 [0.9693877551020408, 0.963265306122449, 0.9653061224489796]
Test scores:	 [0.9617224880382775, 0.9760765550239234, 0.9712918660287081]

Params:			 {'l2_penalty': 1, 'alpha': 0.0001, 'max_iter': 2000}
Seeds:			 [27, 779, 898]
Train scores:	 [0.9653061224489796, 0.9693877551020408, 0.963265306122449]
Test scores:	 [0.9712918660287081, 0.9617224880382775, 0.9760765550239234]

Params:			 {'l2_penalty': 10, 'alpha'

In [6]:
# Rebuild a 0.3 split using the seed stored in the first best_scores entry
# (presumably the split on which the best model was evaluated -- confirm),
# then score the model returned by the search on the test portion.
best_seed = best_scores[0]["seed"]
train, test = train_test_split(data, 0.3, best_seed)
model.score(test)

0.9760765550239234

# Randomized search

In [7]:
# Candidate pools for randomized_search_cv:
#   l2_penalty -- 10 evenly spaced points from 1 to 10, cast to int
#   alpha      -- 100 evenly spaced floats running from 0.001 down to 0.0001
#   max_iter   -- 200 evenly spaced points from 1000 to 2000, cast to int
parameters = dict(
    l2_penalty=np.linspace(1,10,10).astype(int),
    alpha=np.linspace(0.001, 0.0001, 100),
    max_iter=np.linspace(1000, 2000, 200).astype(int),
)

model = LogisticRegression()

# n_iter=10 is passed as the number of parameter draws (presumably one random
# combination per iteration -- confirm against
# si.model_selection.randomized_search). scale=False as the features were
# already standardised when the data was loaded.
all_scores, best_scores, model = randomized_search_cv(
    model,
    data,
    parameters,
    cv=3,
    n_iter=10,
    test_size=0.3,
    scale=False,
)

print()
print("BEST PARAMS:")
print(best_scores)


Params:			 {'l2_penalty': 8, 'alpha': 0.0002181818181818182, 'max_iter': 1608}
Seeds:			 [331, 919, 455]
Train scores:	 [0.9673469387755103, 0.9653061224489796, 0.9714285714285714]
Test scores:	 [0.9665071770334929, 0.9712918660287081, 0.9569377990430622]

Params:			 {'l2_penalty': 5, 'alpha': 0.0006090909090909092, 'max_iter': 1165}
Seeds:			 [342, 894, 211]
Train scores:	 [0.9755102040816327, 0.9673469387755103, 0.9591836734693877]
Test scores:	 [0.9473684210526315, 0.9665071770334929, 0.9856459330143541]

Params:			 {'l2_penalty': 6, 'alpha': 0.0009272727272727273, 'max_iter': 1924}
Seeds:			 [269, 847, 81]
Train scores:	 [0.9653061224489796, 0.9653061224489796, 0.9714285714285714]
Test scores:	 [0.9712918660287081, 0.9712918660287081, 0.9569377990430622]

Params:			 {'l2_penalty': 1, 'alpha': 0.0008, 'max_iter': 1979}
Seeds:			 [495, 64, 844]
Train scores:	 [0.9673469387755103, 0.9734693877551021, 0.9693877551020408]
Test scores:	 [0.9665071770334929, 0.9521531100478469, 0.9617224

In [9]:
# Rebuild a 0.3 split using the seed stored in the first best_scores entry
# from the randomized search (presumably the split on which the best model was
# evaluated -- confirm), then score the returned model on the test portion.
best_seed = best_scores[0]["seed"]
train, test = train_test_split(data, 0.3, best_seed)
model.score(test)

0.9904306220095693