Here I am following the example from StatQuest: https://github.com/StatQuest/logistic_regression_demo/blob/master/logistic_regression_demo.R

In [118]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [2]:
# Processed Cleveland heart-disease data file hosted on the UCI machine-learning archive.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

In [7]:
# header=None: the first line is read as data, so columns are numbered until they are renamed.
df = pd.read_csv(url, header=None)

In [9]:
# Column names for the header-less data file, listed in file order.
cols = [
      "age",
  "sex",# 0 = female, 1 = male
  "cp", # chest pain
  # 1 = typical angina,
  # 2 = atypical angina,
  # 3 = non-anginal pain,
  # 4 = asymptomatic
  "trestbps", # resting blood pressure (in mm Hg)
  "chol", # serum cholesterol in mg/dl
  "fbs",  # fasting blood sugar flag, 1 = TRUE, 0 = FALSE
  # NOTE(review): this note originally read "if less than 120 mg/dl"; the UCI
  # attribute documentation reportedly defines the flag as > 120 mg/dl - confirm.
  "restecg", # resting electrocardiographic results
  # NOTE(review): the loaded data holds 0, 1 and 2 in this column, so the labels
  # below look shifted by one (presumably 0 = normal, 1 = ST-T, 2 = LVH) - confirm.
  # 1 = normal
  # 2 = having ST-T wave abnormality
  # 3 = showing probable or definite left ventricular hypertrophy
  "thalach", # maximum heart rate achieved
  "exang",   # exercise induced angina, 1 = yes, 0 = no
  "oldpeak", # ST depression induced by exercise relative to rest
  "slope", # the slope of the peak exercise ST segment
  # 1 = upsloping
  # 2 = flat
  # 3 = downsloping
  "ca", # number of major vessels (0-3) colored by fluoroscopy
  "thal", # this is short for thallium heart scan
  # 3 = normal (no cold spots)
  # 6 = fixed defect (cold spots during rest and exercise)
  # 7 = reversible defect (when cold spots only appear during exercise)
  "hd" # (the predicted attribute) - diagnosis of heart disease
  # 0 if less than or equal to 50% diameter narrowing
  # 1 if greater than 50% diameter narrowing
  # NOTE: the raw column actually holds the codes 0-4 (see the unique-value
  # listing further down); healthy() later collapses them into two labels.
]

In [11]:
# Replace the positional column labels with the descriptive names from cols.
df.columns = cols

In [13]:
# Peek at the first rows to check that the names line up with the values.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,hd
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [17]:
# Dtypes and non-null counts; note that ca and thal load as object rather than float.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  hd        303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [63]:
# Check the dtype of every column.
df.dtypes

age         float64
sex          object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalach     float64
exang        object
oldpeak     float64
slope        object
ca           object
thal         object
hd           object
dtype: object

In [61]:
def rstr(df):
    """Return ``df.shape`` paired with a per-column listing of distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    tuple
        ``(df.shape, per_column)`` where ``per_column`` is whatever
        ``df.apply`` produces when each column is mapped to a one-element
        list holding that column's ``unique()`` values.
    """
    def _distinct(column):
        # Kept inside a one-element list so every column yields a result of length one.
        return [column.unique()]

    return df.shape, df.apply(_distinct)

In [62]:
# Shape plus the distinct values found in each column.
rstr(df)

((303, 14),
 age         [[63.0, 67.0, 37.0, 41.0, 56.0, 62.0, 57.0, 53...
 sex                                              [[1.0, 0.0]]
 cp                                     [[1.0, 4.0, 3.0, 2.0]]
 trestbps    [[145.0, 160.0, 120.0, 130.0, 140.0, 172.0, 15...
 chol        [[233.0, 286.0, 229.0, 250.0, 204.0, 236.0, 26...
 fbs                                              [[1.0, 0.0]]
 restecg                                     [[2.0, 0.0, 1.0]]
 thalach     [[150.0, 108.0, 129.0, 187.0, 172.0, 178.0, 16...
 exang                                            [[0.0, 1.0]]
 oldpeak     [[2.3, 1.5, 2.6, 3.5, 1.4, 0.8, 3.6, 0.6, 3.1,...
 slope                                       [[3.0, 2.0, 1.0]]
 ca                                [[0.0, 3.0, 2.0, 1.0, nan]]
 thal                                   [[6.0, 3.0, 7.0, nan]]
 hd                                          [[0, 2, 1, 3, 4]]
 dtype: object)

In [28]:
# Turn every '?' entry into NaN so pandas treats it as missing.
# Only a few rows are affected, so they could simply be dropped.
df = df.replace('?',np.nan)

In [67]:
# Inspect the distinct values again now that '?' has been replaced.
rstr(df)

((297, 14),
 age         [[63.0, 67.0, 37.0, 41.0, 56.0, 62.0, 57.0, 53...
 sex                                              [[1.0, 0.0]]
 cp                                     [[1.0, 4.0, 3.0, 2.0]]
 trestbps    [[145.0, 160.0, 120.0, 130.0, 140.0, 172.0, 15...
 chol        [[233.0, 286.0, 229.0, 250.0, 204.0, 236.0, 26...
 fbs                                              [[1.0, 0.0]]
 restecg                                     [[2.0, 0.0, 1.0]]
 thalach     [[150.0, 108.0, 129.0, 187.0, 172.0, 178.0, 16...
 exang                                            [[0.0, 1.0]]
 oldpeak     [[2.3, 1.5, 2.6, 3.5, 1.4, 0.8, 3.6, 0.6, 3.1,...
 slope                                       [[3.0, 2.0, 1.0]]
 ca                                     [[0.0, 3.0, 2.0, 1.0]]
 thal                                        [[6.0, 3.0, 7.0]]
 hd                                          [[0, 2, 1, 3, 4]]
 dtype: object)

In [64]:
# Store the coded (categorical) columns as generic Python objects.
# ca and thal are not listed: they are already object dtype.
for factor_col in ("sex", "cp", "fbs", "restecg", "exang", "slope", "hd"):
    df[factor_col] = df[factor_col].astype(object)



In [72]:
def healthy(num):
    """Collapse a raw ``hd`` diagnosis code into a two-level label.

    The comparison is numeric, so the code may arrive as an int (``0``),
    a float (``0.0``) or a string (``'0'``, ``'0.0'``). Comparing against the
    string ``'0'`` alone labels every row 'Unhealthy' when the column holds
    integers.

    Parameters
    ----------
    num : int, float, str or None
        Raw diagnosis code, or a label this function produced earlier.

    Returns
    -------
    str
        ``'Healthy'`` when the code equals zero, ``'Unhealthy'`` otherwise.
        Missing or non-numeric input also gives ``'Unhealthy'``.
    """
    # Existing labels pass through unchanged, so re-running the cell that
    # applies this function does not flip 'Healthy' rows to 'Unhealthy'.
    if isinstance(num, str) and num in ('Healthy', 'Unhealthy'):
        return num
    try:
        code = float(num)
    except (TypeError, ValueError):
        # None or text that is not a number: not a confirmed zero.
        return 'Unhealthy'
    return 'Healthy' if code == 0 else 'Unhealthy'

In [74]:
# Relabel the diagnosis column. assign() builds a new frame, which avoids the
# SettingWithCopyWarning that setting the attribute on df raises.
df = df.assign(hd=df["hd"].apply(healthy))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [75]:
# Confirm the diagnosis column now holds the two text labels.
df.hd

0        Healthy
1      Unhealthy
2      Unhealthy
3        Healthy
4        Healthy
         ...    
297    Unhealthy
298    Unhealthy
299    Unhealthy
300    Unhealthy
301    Unhealthy
Name: hd, Length: 297, dtype: object

In [66]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [68]:
# Row count after dropping the incomplete rows.
len(df)

297

In [77]:
# Distinct values per column after relabelling hd and dropping incomplete rows.
rstr(df)

((297, 14),
 age         [[63.0, 67.0, 37.0, 41.0, 56.0, 62.0, 57.0, 53...
 sex                                              [[1.0, 0.0]]
 cp                                     [[1.0, 4.0, 3.0, 2.0]]
 trestbps    [[145.0, 160.0, 120.0, 130.0, 140.0, 172.0, 15...
 chol        [[233.0, 286.0, 229.0, 250.0, 204.0, 236.0, 26...
 fbs                                              [[1.0, 0.0]]
 restecg                                     [[2.0, 0.0, 1.0]]
 thalach     [[150.0, 108.0, 129.0, 187.0, 172.0, 178.0, 16...
 exang                                            [[0.0, 1.0]]
 oldpeak     [[2.3, 1.5, 2.6, 3.5, 1.4, 0.8, 3.6, 0.6, 3.1,...
 slope                                       [[3.0, 2.0, 1.0]]
 ca                                     [[0.0, 3.0, 2.0, 1.0]]
 thal                                        [[6.0, 3.0, 7.0]]
 hd                                     [[Healthy, Unhealthy]]
 dtype: object)

In [79]:
# Counts of each diagnosis label (rows) per sex code (columns).
pd.crosstab(df.hd,df.sex)

sex,0.0,1.0
hd,Unnamed: 1_level_1,Unnamed: 2_level_1
Healthy,71,89
Unhealthy,25,112


In [80]:
# Counts of each diagnosis label (rows) per chest-pain code (columns).
pd.crosstab(df.hd,df.cp)

cp,1.0,2.0,3.0,4.0
hd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Healthy,16,40,65,39
Unhealthy,7,9,18,103


In [82]:
# Counts of each diagnosis label (rows) per fasting-blood-sugar flag (columns).
pd.crosstab(df.hd,df.fbs)

fbs,0.0,1.0
hd,Unnamed: 1_level_1,Unnamed: 2_level_1
Healthy,137,23
Unhealthy,117,20


In [83]:
# Counts of each diagnosis label (rows) per resting-ECG code (columns).
# The resulting table has only 4 rows with code 1, so that level is very sparse.
pd.crosstab(df.hd,df.restecg)

restecg,0.0,1.0,2.0
hd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Healthy,92,1,67
Unhealthy,55,3,79


In [116]:
def rstr(df):
    """Print a compact per-column overview of ``df``.

    The first line reports the number of rows and columns. Each following
    line gives one column's name, its dtype and up to five of its distinct
    values. The layout apparently mimics the output of R's ``str()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The overview is written to standard output only.
    """
    dims = df.shape
    print(f"'data.frame': {dims[0]} obs. of {dims[1]} variables")
    for name in list(df.keys()):
        column = df[name]
        # Cap the listing at five values so wide-ranging columns stay on one line.
        print(name, column.dtypes, column.unique()[:5])
    

In [119]:
# Summarise the cleaned frame with the print-based helper.
rstr(df)

'data.frame': 297 obs. of 14 variables
age float64 [63. 67. 37. 41. 56.]
sex object [1.0 0.0]
cp object [1.0 4.0 3.0 2.0]
trestbps float64 [145. 160. 120. 130. 140.]
chol float64 [233. 286. 229. 250. 204.]
fbs object [1.0 0.0]
restecg object [2.0 0.0 1.0]
thalach float64 [150. 108. 129. 187. 172.]
exang object [0.0 1.0]
oldpeak float64 [2.3 1.5 2.6 3.5 1.4]
slope object [3.0 2.0 1.0]
ca object ['0.0' '3.0' '2.0' '1.0']
thal object ['6.0' '3.0' '7.0']
hd object ['Healthy' 'Unhealthy']


In [148]:
# Estimator with default settings; the repr printed after fitting shows penalty='l2'
# and C=1.0, i.e. a regularised fit.
# NOTE(review): R's glm() applies no penalty, so coefficients will presumably differ - confirm.
clf = LogisticRegression()

In [155]:
# A single predictor (sex) as an (n_samples, 1) feature matrix, with the
# diagnosis label as the target vector.
X = df[["sex"]].values
y = df["hd"].values

In [156]:
# Fit the single-predictor model: sex as the feature, diagnosis label as the target.
clf.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Honestly, it is quite hard to get output comparable to R's from scikit-learn. What I would like to see here is the model summary (what `summary()` prints for a `glm` fit in R), and getting that needs a workaround.

Maybe we could try calling R from Python:
- http://blog.yhat.com/tutorials/rpy2-combing-the-power-of-r-and-python.html
- https://sites.google.com/site/aslugsguidetopython/data-analysis/pandas/calling-r-from-python