# Regressão Logística: interpretação

Bibliografia: An Introduction to Statistical Learning (2ª ed.)

In [185]:
from catboost.datasets import titanic

from math import exp
import pandas as pd

from sklearn.linear_model import LogisticRegression
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools.tools import add_constant

In [148]:
# Data: titanic() returns two objects; only the first one is used here.
df, _ = titanic()

# Drop the rows whose Age is missing (missing Cabin values are left untouched).
df = df.dropna(subset=['Age'])
df.info()
df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  714 non-null    int64  
 1   Survived     714 non-null    int64  
 2   Pclass       714 non-null    int64  
 3   Name         714 non-null    object 
 4   Sex          714 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        714 non-null    int64  
 7   Parch        714 non-null    int64  
 8   Ticket       714 non-null    object 
 9   Fare         714 non-null    float64
 10  Cabin        185 non-null    object 
 11  Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 72.5+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Regressão Logística

In [177]:
# Fare is the only predictor (selected with a list, so X is 2-D);
# Survived is the 1-D target.
lista_X = ['Fare']
X = df.loc[:, lista_X].to_numpy()
y = df.loc[:, 'Survived'].to_numpy()

In [184]:
# Model 1: scikit-learn logistic regression. Hyperparameters are left at their
# defaults EXCEPT fit_intercept=False, which fixes the intercept at 0 so the
# model reduces to P(Survived=1) = sigmoid(coef * Fare).
model1 = LogisticRegression(fit_intercept=False)
model1.fit(X, y)

print(round(model1.score(X, y), 3))                 # mean accuracy on the data used for fitting
print(model1.predict(X)[:5])                        # predicted classes, first 5 rows
print(model1.predict_log_proba(X)[:5, 1].round(3))  # log-probability of class 1, first 5 rows
print(model1.predict_proba(X)[:5, 1].round(3))      # probability of class 1, first 5 rows
print(model1.intercept_.round(3))
print(model1.coef_.round(3))

# "Why does predict_proba need the [:, 1] selection?"
# https://stackoverflow.com/questions/36681449/scikit-learn-return-value-of-logisticregression-predict-proba

0.413
[1 1 1 1 1]
[-0.678 -0.557 -0.677 -0.59  -0.677]
[0.507 0.573 0.508 0.555 0.508]
[0.]
[[0.004]]


In [187]:
#calculando manualmente a probabilidade

def probab_survived_fare(coef, fare, intercept=0.0):
    """Print the logistic-model survival probability for a fare and the predicted class.

    Computes p = 1 / (1 + exp(-(intercept + coef * fare))), prints p rounded to
    three decimals, then prints 'Survived = 1' when p >= 0.5 and
    'Survived = 0' otherwise.

    Args:
        coef: Coefficient that multiplies the fare in the linear predictor.
        fare: Fare value to evaluate.
        intercept: Constant term of the linear predictor. Defaults to 0.0,
            i.e. a model fitted without an intercept.

    Returns:
        None. The probability and the predicted class are only printed.
    """
    z = intercept + coef * fare

    # Only ever exponentiate a non-positive number: exp() of a large positive
    # argument raises OverflowError, while a large negative one underflows to 0.0.
    if z >= 0:
        prob = 1 / (1 + exp(-z))
    else:
        exp_z = exp(z)
        prob = exp_z / (1 + exp_z)
    print(round(prob, 3))

    if prob >= 0.5:
        print('Survived = 1')
    else:
        print('Survived = 0')

        
# Show the first two passengers, whose fares are checked below.
print(df.head(2))

# Mr. Braund (row label 0, Survived=0)
braund = df.at[0, 'Fare']
probab_survived_fare(model1.coef_[0, 0], braund)

# Mrs. Cumings (row label 1, Survived=1)
cummings = df.at[1, 'Fare']
probab_survived_fare(model1.coef_[0, 0], cummings)


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   

   Parch     Ticket     Fare Cabin Embarked  
0      0  A/5 21171   7.2500   NaN        S  
1      0   PC 17599  71.2833   C85        C  
0.507
Survived = 1
0.573
Survived = 1


In [179]:
# Model 2: statsmodels Logit, fitted with fit()'s default settings.
# X is passed as-is, so the design matrix has no constant (intercept) column;
# passing add_constant(X) as exog instead would include one.
model2 = Logit(endog=y, exog=X).fit()


model2.summary()

Optimization terminated successfully.
         Current function value: 0.685885
         Iterations 4


0,1,2,3
Dep. Variable:,y,No. Observations:,714.0
Model:,Logit,Df Residuals:,713.0
Method:,MLE,Df Model:,0.0
Date:,"Sun, 06 Mar 2022",Pseudo R-squ.:,-0.01548
Time:,18:08:20,Log-Likelihood:,-489.72
converged:,True,LL-Null:,-482.26
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.0041,0.001,3.011,0.003,0.001,0.007
