In [1]:
##########################################
#### HERE ARE SOME EXAMPLES AND TESTS ####
##########################################


from core.models import *
from core.optimizers import *
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import classification_report
import pip

import numpy as np
import pandas as pd

In [2]:
# Get some data for training and testing the model.
# Iris is a small, well-known dataset, so it is a convenient choice.

iris = load_iris()                # load the whole dataset
X = iris.data[:, :2]              # keep only the first two feature columns
y = (iris.target != 0) * 1        # binarise the target: class 0 -> 0, classes 1 and 2 -> 1


# Train/test split. A fixed random_state makes the split (and therefore the
# metrics reported further down) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

In [3]:
# Optimiser instance handed to the model below; only learning_rate is set explicitly.
optimizer = RMSPropOptimizer(learning_rate=0.03)

# Build the project's LogisticRegression and fit it on the training split.
# fit() is the last expression of the cell, so its return value is echoed as cell output.
model = LogisticRegression(optimizer=optimizer, num_iterations=200, fit_intercept=True)
model.fit(X_train, y_train)

<core.models.LogisticRegression at 0x27fa26663c8>

In [4]:
# Predict on the held-out split and print the per-class precision / recall / F1 report.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        28

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38



In [5]:
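# NOTE: disabled plotting sketch, kept for reference: scatter of the test points
# coloured by prediction, plus a line derived from model.theta.
# from matplotlib import pyplot as plt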
# import seaborn as sns
# import pandas as pd

# df = pd.DataFrame(X_test)
# df['y'] = y_test
# df['pred'] = model.predict(X_test)

# a, b = model.theta[0], model.theta[1]
# foo = lambda x: a * x + b

# sns.scatterplot(data=df, x=0, y=1, hue='pred')
# plt.plot([4, 8], [foo(4), foo(8)])
# plt.show()

In [6]:
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler


def remove_correlated_features(X, corr_threshold=0.9):
    """Drop, in place, every column that is highly correlated with an earlier column.

    The pairwise correlation matrix of ``X`` is computed with ``X.corr()``.
    For each pair of columns ``(i, j)`` with ``i < j`` whose correlation is
    ``>= corr_threshold``, the later column ``j`` is marked for removal.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is MODIFIED IN PLACE: the marked columns are dropped.
    corr_threshold : float, default 0.9
        Correlation at or above which the later column of a pair is dropped.

    Returns
    -------
    pandas.Index
        Labels of the columns that were removed from ``X``.

    Notes
    -----
    The comparison is signed, so strongly *negative* correlations are kept.
    """
    corr = X.corr()

    # Strict upper triangle (k=1): every pair (i, j) with i < j is looked at
    # exactly once and the diagonal (self-correlation of 1.0) is ignored.
    too_correlated = np.triu(corr.to_numpy() >= corr_threshold, k=1)

    # Column j is dropped as soon as any earlier column i correlates with it.
    drop_mask = too_correlated.any(axis=0)

    # Take the labels from the correlation matrix itself so the boolean mask
    # always has the matching length.
    columns_dropped = corr.columns[drop_mask]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped


def remove_less_significant_features(X, Y, sl=0.05):
    """Backward elimination: repeatedly drop, in place, the least significant column.

    On each pass an OLS model ``sm.OLS(Y, X)`` is fitted and the column with
    the largest p-value is removed when that p-value exceeds ``sl``. The loop
    stops at the first pass where no p-value exceeds ``sl``, or after as many
    passes as ``X`` initially had columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is MODIFIED IN PLACE: rejected columns are dropped.
    Y : array-like
        Regression target passed to ``sm.OLS`` as the endogenous variable.
    sl : float, default 0.05
        Significance level; a column is dropped when its p-value is above it.

    Returns
    -------
    numpy.ndarray
        Labels of the dropped columns, in the order they were removed.
        Empty when nothing was dropped (including when ``X`` has no columns).
    """
    columns_dropped = np.array([])

    # The number of passes is fixed up front from the initial column count,
    # so a frame without columns performs no fit at all and simply returns
    # the empty result instead of failing.
    for _ in range(len(X.columns)):
        regression_ols = sm.OLS(Y, X).fit()
        max_col = regression_ols.pvalues.idxmax()
        max_val = regression_ols.pvalues.max()
        if max_val > sl:
            X.drop(max_col, axis='columns', inplace=True)
            columns_dropped = np.append(columns_dropped, [max_col])
        else:
            # Every remaining feature is significant at level `sl`.
            break

    # The previous trailing `regression_ols.summary()` call was removed: its
    # result was discarded, and it raised AttributeError when no fit had run.
    return columns_dropped

In [10]:
print("reading dataset...")
# read data into a pandas (pd) data frame
data = pd.read_csv('./datasets/breast_cancer.csv')

# drop last column (extra column added by pd)
# and unnecessary first column (id)
data = data.drop(columns=data.columns[[-1, 0]])

print("applying feature engineering...")
# convert categorical labels to numbers
diag_map = {'M': 1.0, 'B': -1.0}
data['diagnosis'] = data['diagnosis'].map(diag_map)

# put features & outputs in different data frames
# (assumes 'diagnosis' is the first remaining column, so iloc[:, 1:] is every feature)
Y = data.loc[:, 'diagnosis']
# .copy(): the two filters below drop columns from X in place; working on an
# independent copy keeps `data` intact and avoids pandas' SettingWithCopyWarning
# that mutating a slice can trigger.
X = data.iloc[:, 1:].copy()

# filter features (both helpers modify X in place)
remove_correlated_features(X)
remove_less_significant_features(X, Y)

# normalize data for better convergence and to prevent overflow
# NOTE(review): feature selection and scaling are fitted on the full dataset,
# before the split, so test-set information leaks into preprocessing. Revisit
# once evaluation below is enabled.
X_normalized = MinMaxScaler().fit_transform(X.values)
X = pd.DataFrame(X_normalized)

# split data into train and test set
# (fixed random_state so the split and the training log are reproducible)
print("splitting dataset into train and test sets...")
X_train, X_test, y_train, y_test = tts(
    X, Y, test_size=0.2, random_state=42)

# train the model
print("training started...")
classifier = SVM()
classifier.fit(X_train.to_numpy(), y_train.to_numpy())
print("training finished.")

# testing the model
print("testing the model...")
# TODO: prediction currently fails here; it looks like a dimension mismatch,
# still to be investigated. Possibly related to passing a DataFrame here while
# fit() received NumPy arrays - unverified.
# y_pr = classifier.predict(X_test)
# classifier.evaluate(y_test, y_pr)




reading dataset...
applying feature engineering...
splitting dataset into train and test sets...
training started...
Epoch is: 1 and Cost is: 7097.942267072146
Epoch is: 2 and Cost is: 6448.434998545462
Epoch is: 4 and Cost is: 5390.669680472762
Epoch is: 8 and Cost is: 3842.295840012008
Epoch is: 16 and Cost is: 2704.12416434212
Epoch is: 32 and Cost is: 1947.0545098690081
Epoch is: 64 and Cost is: 1529.1760504792114
Epoch is: 128 and Cost is: 1278.5002229709903
Epoch is: 256 and Cost is: 1141.0804777565406
Epoch is: 512 and Cost is: 1053.6175599556097
Epoch is: 1023 and Cost is: 1008.4292013948714
training finished.
testing the model...
