In [1]:
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pandas as pd
from copy import copy

In [2]:
# Load the heart dataset CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../datasets/heart.csv')

In [3]:
# Tally the distinct row-wise missing-value patterns across all columns.
df.isna().value_counts()

age    sex    cp     trestbps  chol   fbs    restecg  thalach  exang  oldpeak  slope  ca     thal   target
False  False  False  False     False  False  False    False    False  False    False  False  False  False     1025
dtype: int64

In [4]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [5]:
# Display the loaded frame for a visual sanity check.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [6]:
# Positional split of the frame: every column but the last is a feature,
# the last column is the label. Both are taken as plain NumPy arrays.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [7]:
# 80/20 hold-out split, stratified on the label; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  train_size=0.8,
  stratify=y,
  random_state=42,
)

# Comparar o RMSE das seguintes implementações do scikit-learn

In [8]:
# Candidate regressors, keyed by the short name used when building the
# hyper-parameter grids. Every estimator starts from its default settings.
models = dict(
  lr=LinearRegression(),
  sgd=SGDRegressor(),
  ridge=Ridge(),
  lasso=Lasso(),
  enet=ElasticNet(),
)

In [9]:
# Two-step pipeline: standardise the 13 feature columns (positions 0-12), then
# pass them to the 'est' step. 'est' is left as None here and is filled in
# through the 'est' entry of the parameter grids.
pipe = Pipeline(steps=[
  ('pre', ColumnTransformer(transformers=[
    ('std', StandardScaler(), list(range(13))),
  ])),
  ('est', None),
])

In [10]:
# One sub-grid per estimator. The 'est' entry selects which model occupies the
# pipeline's final step; the 'est__*' entries are that model's own
# hyper-parameters.
params = [
  # LinearRegression
  dict(
    est__fit_intercept=[True, False],
    est__copy_X=[True, False],
    est__positive=[True, False],
    est=[models['lr']],
  ),
  # SGDRegressor
  dict(
    est__penalty=['l1', 'l2', 'elasticnet'],
    est__alpha=[1e-3, 1e-4, 1e-5],
    est__max_iter=[1500, 1000, 500],
    est__early_stopping=[True, False],
    est__random_state=[42],
    est=[models['sgd']],
  ),
  # Ridge
  dict(
    est__alpha=[0.5, 1.0, 1.5],
    est__max_iter=[None, 1000, 5000],
    est__random_state=[42],
    est=[models['ridge']],
  ),
  # Lasso
  dict(
    est__alpha=[0.5, 1.0, 1.5],
    est__max_iter=[500, 1000, 1500],
    est__random_state=[42],
    est=[models['lasso']],
  ),
  # ElasticNet
  dict(
    est__alpha=[0.5, 1.0, 1.5],
    est__l1_ratio=[0.1, 0.5, 0.9],
    est__max_iter=[500, 1000, 1500],
    est__random_state=[42],
    est=[models['enet']],
  ),
]

In [11]:
# 3-fold grid search over all sub-grids, scored with negated RMSE (higher is
# better). Training-fold scores are kept so they can be compared with the
# validation scores; n_jobs=-1 uses every available core.
grid_search = GridSearchCV(
  estimator=pipe,
  param_grid=params,
  scoring='neg_root_mean_squared_error',
  cv=3,
  n_jobs=-1,
  return_train_score=True,
)

In [12]:
# Run the cross-validated search on the training split only.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('pre',
                                        ColumnTransformer(transformers=[('std',
                                                                         StandardScaler(),
                                                                         [0, 1,
                                                                          2, 3,
                                                                          4, 5,
                                                                          6, 7,
                                                                          8, 9,
                                                                          10,
                                                                          11,
                                                                          12])])),
                                       ('est', None)]),
             n_jobs=-1,
             param_grid=[{'est': [LinearRegr

In [13]:
# Predict the held-out split with the fitted grid search.
y_pred = grid_search.predict(X_test)

In [14]:
# Hold-out RMSE, printed with a negative sign -- presumably so it reads on the
# same scale as the 'neg_root_mean_squared_error' scores of the grid search.
test_rmse = mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5
print(-1 * test_rmse)

-0.35477331148305363


In [15]:
# Collect the per-candidate cross-validation results into a DataFrame for inspection.
res = pd.DataFrame(grid_search.cv_results_)

In [16]:
# Show the raw results table (one row per evaluated candidate).
res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_est,param_est__copy_X,param_est__fit_intercept,param_est__positive,param_est__alpha,param_est__early_stopping,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.002906,0.000347,0.000976,0.000068,LinearRegression(),True,True,True,,,...,-0.408159,-0.446001,-0.414480,0.023583,69,-0.420502,-0.409594,-0.391547,-0.407215,0.011940
1,0.050557,0.008624,0.000947,0.000007,LinearRegression(),True,True,False,,,...,-0.338478,-0.391365,-0.359093,0.023111,64,-0.352692,-0.354747,-0.329169,-0.345536,0.011604
2,0.002279,0.000241,0.000944,0.000074,LinearRegression(),True,False,True,,,...,-0.669386,-0.676608,-0.659517,0.019291,106,-0.676007,-0.635802,-0.654674,-0.655494,0.016424
3,0.041822,0.002362,0.000849,0.000132,LinearRegression(),True,False,False,,,...,-0.630327,-0.633520,-0.626791,0.007375,104,-0.636046,-0.601932,-0.619388,-0.619122,0.013928
4,0.001672,0.000299,0.000675,0.000170,LinearRegression(),False,True,True,,,...,-0.408159,-0.446001,-0.414480,0.023583,69,-0.420502,-0.409594,-0.391547,-0.407215,0.011940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,0.002304,0.000053,0.000916,0.000002,ElasticNet(),,,,1.5,,...,-0.502042,-0.501060,-0.501676,0.000438,80,-0.499141,-0.499812,-0.499391,-0.499448,0.000277
103,0.002346,0.000088,0.000751,0.000119,ElasticNet(),,,,1.5,,...,-0.502042,-0.501060,-0.501676,0.000438,80,-0.499141,-0.499812,-0.499391,-0.499448,0.000277
104,0.002466,0.000361,0.000943,0.000085,ElasticNet(),,,,1.5,,...,-0.502042,-0.501060,-0.501676,0.000438,80,-0.499141,-0.499812,-0.499391,-0.499448,0.000277
105,0.002122,0.000363,0.000714,0.000074,ElasticNet(),,,,1.5,,...,-0.502042,-0.501060,-0.501676,0.000438,80,-0.499141,-0.499812,-0.499391,-0.499448,0.000277


In [17]:
# Order the candidates from best to worst rank and renumber the rows 0..n-1,
# so that row position and index label coincide from here on.
res = res.sort_values(by='rank_test_score').reset_index(drop=True)

In [18]:
# Show the results again, now ordered by rank_test_score (best candidates first).
res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_est,param_est__copy_X,param_est__fit_intercept,param_est__positive,param_est__alpha,param_est__early_stopping,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.005747,0.000215,0.000849,0.000093,"SGDRegressor(alpha=0.001, early_stopping=True,...",,,,0.001,True,...,-0.337899,-0.392025,-0.357491,0.024493,1,-0.353714,-0.355147,-0.329395,-0.346085,0.011816
1,0.025344,0.029307,0.000666,0.000140,"SGDRegressor(alpha=0.001, early_stopping=True,...",,,,0.001,True,...,-0.337899,-0.392025,-0.357491,0.024493,1,-0.353714,-0.355147,-0.329395,-0.346085,0.011816
2,0.006671,0.001625,0.000854,0.000037,"SGDRegressor(alpha=0.001, early_stopping=True,...",,,,0.001,True,...,-0.337899,-0.392025,-0.357491,0.024493,1,-0.353714,-0.355147,-0.329395,-0.346085,0.011816
3,0.005198,0.001107,0.000760,0.000125,"SGDRegressor(alpha=0.001, early_stopping=True,...",,,,0.001,True,...,-0.337899,-0.391988,-0.357498,0.024464,4,-0.353693,-0.355145,-0.329398,-0.346079,0.011810
4,0.004993,0.000223,0.000818,0.000156,"SGDRegressor(alpha=0.001, early_stopping=True,...",,,,0.001,True,...,-0.337899,-0.391988,-0.357498,0.024464,4,-0.353693,-0.355145,-0.329398,-0.346079,0.011810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,0.002472,0.000012,0.000944,0.000028,ElasticNet(),,,,0.5,,...,-0.502042,-0.501060,-0.501676,0.000438,80,-0.499141,-0.499812,-0.499391,-0.499448,0.000277
103,0.041822,0.002362,0.000849,0.000132,LinearRegression(),True,False,False,,,...,-0.630327,-0.633520,-0.626791,0.007375,104,-0.636046,-0.601932,-0.619388,-0.619122,0.013928
104,0.002078,0.000304,0.000832,0.000100,LinearRegression(),False,False,False,,,...,-0.630327,-0.633520,-0.626791,0.007375,104,-0.636046,-0.601932,-0.619388,-0.619122,0.013928
105,0.002279,0.000241,0.000944,0.000074,LinearRegression(),True,False,True,,,...,-0.669386,-0.676608,-0.659517,0.019291,106,-0.676007,-0.635802,-0.654674,-0.655494,0.016424


In [19]:
# Derive a short model name from the estimator's repr by keeping everything
# before the first '(' (e.g. "Ridge()" -> "Ridge").
# `n` and `expand` are passed by keyword: newer pandas releases accept only
# `pat` positionally in Series.str.split and reject the positional form.
res['model'] = res['param_est'].astype(str).str.split('(', n=1, expand=True)[0]

In [20]:
# For each model name, find the index label of its best-ranked row
# (the row with the smallest rank_test_score).
best_of_each = res.groupby('model')['rank_test_score'].idxmin()

In [21]:
# Show the best candidate of every model. `best_of_each` comes from idxmin(),
# which yields index *labels*, so the rows are selected with label-based .loc
# rather than positional .iloc (the two only agree while the index is 0..n-1).
res.loc[best_of_each]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_est,param_est__copy_X,param_est__fit_intercept,param_est__positive,param_est__alpha,param_est__early_stopping,...,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,model
65,0.00271,0.000264,0.000909,1.3e-05,ElasticNet(),,,,0.5,,...,-0.397702,-0.383052,0.010605,66,-0.380034,-0.38449,-0.361602,-0.375375,0.009907,ElasticNet
93,0.002485,4.9e-05,0.000994,0.000114,Lasso(),,,,1.5,,...,-0.50106,-0.501676,0.000438,80,-0.499141,-0.499812,-0.499391,-0.499448,0.000277,Lasso
63,0.050557,0.008624,0.000947,7e-06,LinearRegression(),True,True,False,,,...,-0.391365,-0.359093,0.023111,64,-0.352692,-0.354747,-0.329169,-0.345536,0.011604,LinearRegression
54,0.002292,0.000129,0.000912,3.3e-05,Ridge(),,,,1.5,,...,-0.391244,-0.35905,0.023055,55,-0.352692,-0.354747,-0.329169,-0.345536,0.011604,Ridge
0,0.005747,0.000215,0.000849,9.3e-05,"SGDRegressor(alpha=0.001, early_stopping=True,...",,,,0.001,True,...,-0.392025,-0.357491,0.024493,1,-0.353714,-0.355147,-0.329395,-0.346085,0.011816,SGDRegressor


# Implemente o algoritmo Regressão Logística

In [22]:
from logistic_regression import LogisticRegression
from sklearn.linear_model import LogisticRegression as LogisticRegressionSkl
from sklearn.metrics import f1_score

# Treine e avalie (de acordo com a métrica F1-Score), usando a Regressão Logística implementada por você

In [23]:
# Instantiate the custom LogisticRegression (imported from the local
# logistic_regression module) with its default arguments.
my_lr = LogisticRegression()

In [24]:
# Train the custom model on the training split.
# NOTE(review): X_train is passed as-is, without the scaling used in the
# regression pipeline -- confirm whether the custom class scales internally.
my_lr.fit(X_train, y_train)

In [25]:
# Predict the held-out split with the custom model.
y_pred = my_lr.predict(X_test)

In [26]:
# F1-score of the custom model on the held-out split.
f1_score(y_pred=y_pred, y_true=y_test)

0.4539007092198582

# Compare o resultado de sua implementação com a implementação LogisticRegression do scikit-learn em um grid search que varia, para a implementação do scikit-learn, os seguintes hiperparâmetros: penalty, C, solver, max_iter

In [27]:
# scikit-learn baseline with all default hyper-parameters.
skl_lr = LogisticRegressionSkl()

In [28]:
# Train the baseline on the training split.
# NOTE(review): the features are passed unscaled and the recorded output shows
# the solver hitting its iteration limit; consider scaling the data or raising
# max_iter, as the warning itself suggests.
skl_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [29]:
# Predict the held-out split with the scikit-learn baseline.
y_pred = skl_lr.predict(X_test)

In [30]:
# F1-score of the scikit-learn baseline on the held-out split.
f1_score(y_pred=y_pred, y_true=y_test)

0.8508771929824561

In [31]:
# Hyper-parameter grid for scikit-learn's LogisticRegression: penalty, C,
# solver and max_iter are varied, the seed is pinned.
# NOTE(review): recent scikit-learn releases spell the unpenalised option as
# None instead of the string 'none' -- confirm against the installed version.
lr_params = dict(
  penalty=['l2', 'none'],
  C=[0.5, 1.0, 1.5],
  solver=['newton-cg', 'lbfgs', 'sag'],
  max_iter=[50, 100, 150],
  random_state=[42],
)

In [32]:
# 3-fold grid search over `lr_params`, selecting by F1-score. Training-fold
# scores are kept as well; n_jobs=-1 uses every available core.
grid_search_lr = GridSearchCV(
  estimator=LogisticRegressionSkl(verbose=0),
  param_grid=lr_params,
  scoring='f1',
  cv=3,
  n_jobs=-1,
  return_train_score=True,
)

In [33]:
# Run the logistic-regression grid search on the training split.
# NOTE(review): the recorded output repeats the iteration-limit warning; the
# features are unscaled and the grid uses small max_iter values.
grid_search_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.5, 1.0, 1.5], 'max_iter': [50, 100, 150],
                         'penalty': ['l2', 'none'], 'random_state': [42],
                         'solver': ['newton-cg', 'lbfgs', 'sag']},
             return_train_score=True, scoring='f1')

In [34]:
# Hyper-parameter combination with the best mean cross-validated F1-score.
grid_search_lr.best_params_

{'C': 1.5,
 'max_iter': 50,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs'}

In [35]:
# Predict the held-out split with the fitted logistic-regression grid search.
y_pred = grid_search_lr.predict(X_test)

In [36]:
# F1-score of the tuned scikit-learn model on the held-out split.
f1_score(y_pred=y_pred, y_true=y_test)

0.8533333333333333

In [37]:
# Re-create the custom model for the side-by-side comparison.
# NOTE(review): this and the next three cells repeat the earlier custom-model
# run with the same arguments; a shared helper would avoid the duplication.
my_lr = LogisticRegression()

In [38]:
# Train the fresh custom model on the same training split.
my_lr.fit(X_train, y_train)

In [39]:
# Predict the held-out split with the custom model.
y_pred = my_lr.predict(X_test)

In [40]:
# F1-score of the custom model, to compare against the scikit-learn scores.
f1_score(y_pred=y_pred, y_true=y_test)

0.4539007092198582