# Logistic Regression with One Hot Replacement
Simple logistic regression with a few OHR (one-hot replacement, i.e. one-hot encoding of a categorical column) insertions.

In [1]:
# Load Libraries 
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import sklearn

In [2]:
# Report the interpreter and library versions this notebook was run with,
# so the recorded outputs can be tied to a specific environment.
for label, version in (
    ("Python Version: ", sys.version),
    ("Pandas Version: ", pd.__version__),
    ("Sklearn Version: ", sklearn.__version__),
):
    print(label, version)

Python Version:  3.6.9 (default, Mar 10 2023, 16:46:00) 
[GCC 8.4.0]
Pandas Version:  1.1.5
Sklearn Version:  0.24.2


In [3]:
# Read the heart-disease CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; consider reading
# it from a configurable data directory so the notebook runs elsewhere.
fileIn = "/mnt/thumb/LLM_Stuff/data/heart_disease_uci.csv"
df = pd.read_csv(filepath_or_buffer=fileIn)
df.head(5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [4]:
# Collapse 'num' to a binary label: every value above 0 becomes 1,
# all other values are left exactly as they are.
has_positive_num = df['num'] > 0
df.loc[has_positive_num, 'num'] = 1
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,1
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [5]:
# Give the label column a descriptive name: 'num' -> 'target'.
df = df.rename({'num': 'target'}, axis='columns')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,target
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,1
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [6]:
# Remove every row that has a missing value in at least one column.
# NOTE(review): this considers all columns, not only the ones used for
# modelling later on, so rows may be discarded because of columns the
# model never sees -- confirm this is intended.
df = df.dropna()

In [7]:
# Choose the independent features (X) and the dependent variable (y).
# At this point 'sex' is still the raw string column; the cell at In [11]
# rebuilds X and y after 'sex' has been one-hot encoded.
training_features = ['age', 'sex', 'trestbps', 'chol']
X = df.loc[:, training_features]
y = df['target']

In [8]:
# List the distinct values in 'sex' to spot placeholder entries such as '?'.
sex_list = set(df['sex'])
print(sex_list)

{'Female', 'Male'}


In [10]:
# One-hot encode 'sex': the column is replaced by one indicator column per
# category (sex_Female / sex_Male in the output below).
# NOTE(review): the cell overwrites df, so it cannot be re-run on its own --
# 'sex' no longer exists after the first execution.
df = pd.get_dummies(data=df, columns=['sex'])
df

Unnamed: 0,id,age,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,target,sex_Female,sex_Male
0,1,63,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0,0,1
1,2,67,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,1,0,1
2,3,67,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1,0,1
3,4,37,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0,0,1
4,5,41,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,300,68,Cleveland,asymptomatic,144.0,193.0,True,normal,141.0,False,3.4,flat,2.0,reversable defect,1,0,1
300,301,57,Cleveland,asymptomatic,130.0,131.0,False,normal,115.0,True,1.2,flat,1.0,reversable defect,1,0,1
301,302,57,Cleveland,atypical angina,130.0,236.0,False,lv hypertrophy,174.0,False,0.0,flat,1.0,normal,1,1,0
508,509,47,Hungary,asymptomatic,150.0,226.0,False,normal,98.0,True,1.5,flat,0.0,reversable defect,1,0,1


In [11]:
# Choose the independent features (X) and the dependent variable (y),
# using the encoded 'sex_Male' indicator in place of the original 'sex' column.
training_features = ['age', 'sex_Male', 'trestbps', 'chol']
X = df.loc[:, training_features]
y = df['target']

In [12]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [13]:
# Train the model: a logistic-regression classifier with default settings,
# fitted on the training split. The fit call is left as the last expression
# so the estimator is displayed as the cell output.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression()

In [14]:
# Predict class labels for the held-out test rows.
y_pred = model.predict(X=X_test)

In [15]:
# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Print each metric under its heading.
for heading, result in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", confusion),
    ("Classification Report:\n", report),
):
    print(heading, result)

Accuracy: 0.65
Confusion Matrix:
 [[22 10]
 [11 17]]
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.69      0.68        32
           1       0.63      0.61      0.62        28

    accuracy                           0.65        60
   macro avg       0.65      0.65      0.65        60
weighted avg       0.65      0.65      0.65        60

