# Logistic Regression with One Hot Replacement
Simple regression with a few OHR insertions. 

In [1]:
# Load Libraries 
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import sklearn

In [2]:
# Report the interpreter and library versions this notebook was run with
for label, version in (
    ("Python Version: ", sys.version),
    ("Pandas Version: ", pd.__version__),
    ("Sklearn Version: ", sklearn.__version__),
):
    print(label, version)

Python Version:  3.6.9 (default, Mar 10 2023, 16:46:00) 
[GCC 8.4.0]
Pandas Version:  1.1.5
Sklearn Version:  0.24.2


In [3]:
# Read the heart-disease CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust it to
# wherever the CSV lives on your system.
fileIn = "/mnt/thumb/LLM_Stuff/data/heart_disease_uci.csv"
df = pd.read_csv(filepath_or_buffer=fileIn)

In [4]:
# Collapse the label to binary: every 'num' value above 0 becomes 1,
# all other values are left as they are.
df['num'] = df['num'].mask(df['num'] > 0, 1)
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,1
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [5]:
# Give the label column a descriptive name: 'num' -> 'target'
df = df.rename({'num': 'target'}, axis='columns')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,target
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,1
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [6]:
# Drop rows with missing values, but only in the columns the model actually uses.
# A blanket dropna() also discards rows whose only gaps are in columns that never
# reach the model (e.g. 'slope', 'ca', 'thal'), shrinking the data set for no benefit.
# NOTE(review): keep this list in sync with the raw columns behind the feature
# list used for training; a newly added feature containing NaNs would otherwise
# make model.fit() fail.
model_columns = ['age', 'sex', 'cp', 'fbs', 'trestbps', 'chol', 'exang', 'target']

# Reassign instead of inplace=True so the cell is explicit about producing a new frame.
df = df.dropna(subset=model_columns)

In [7]:
# One-hot encode the categorical predictors; each listed column is replaced
# by one indicator column per category (e.g. 'sex' -> 'sex_Female', 'sex_Male').
df = pd.get_dummies(
    data=df,
    columns=[
        'sex',
        'cp',
        'fbs',
        'exang',
    ],
)
df.head()

Unnamed: 0,id,age,dataset,trestbps,chol,restecg,thalch,oldpeak,slope,ca,...,sex_Female,sex_Male,cp_asymptomatic,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_False,fbs_True,exang_False,exang_True
0,1,63,Cleveland,145.0,233.0,lv hypertrophy,150.0,2.3,downsloping,0.0,...,0,1,0,0,0,1,0,1,1,0
1,2,67,Cleveland,160.0,286.0,lv hypertrophy,108.0,1.5,flat,3.0,...,0,1,1,0,0,0,1,0,0,1
2,3,67,Cleveland,120.0,229.0,lv hypertrophy,129.0,2.6,flat,2.0,...,0,1,1,0,0,0,1,0,0,1
3,4,37,Cleveland,130.0,250.0,normal,187.0,3.5,downsloping,0.0,...,0,1,0,0,1,0,1,0,1,0
4,5,41,Cleveland,130.0,204.0,lv hypertrophy,172.0,1.4,upsloping,0.0,...,1,0,0,1,0,0,1,0,1,0


In [None]:
# List the column names after one-hot encoding, to pick the feature names from
df.columns

In [None]:
# Choose the independent (X) and dependent (y) variables
training_features = [
    'age',
    'sex_Male',
    'cp_asymptomatic',
    'cp_atypical angina',
    'cp_non-anginal',
    'cp_typical angina',
    'fbs_True',
    'trestbps',
    'chol',
    'exang_True',
]
X = df.loc[:, training_features]
y = df.loc[:, 'target']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Fit a logistic regression on the training split.
# max_iter is raised to 10,000 iterations, well above the library default.
model = LogisticRegression(max_iter=10_000)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict class labels for the held-out test rows
y_pred = model.predict(X=X_test)

In [None]:
# Score the held-out predictions against the true labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Print each metric under its heading
for heading, metric in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", confusion),
    ("Classification Report:\n", report),
):
    print(heading, metric)