In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
file_path = 'medical_conditions_dataset.csv'  # Update with your file path
data = pd.read_csv(file_path)
data

Unnamed: 0,id,full_name,age,gender,smoking_status,bmi,blood_pressure,glucose_levels,condition
0,1,User0001,,male,Non-Smoker,,,,Pneumonia
1,2,User0002,30.0,male,Non-Smoker,,105.315064,,Diabetic
2,3,User0003,18.0,male,Non-Smoker,35.612486,,,Pneumonia
3,4,User0004,,male,Non-Smoker,,99.119829,,Pneumonia
4,5,User0005,76.0,male,Non-Smoker,,,,Diabetic
...,...,...,...,...,...,...,...,...,...
9995,9996,User9996,,male,Non-Smoker,25.029002,152.540355,137.551451,Pneumonia
9996,9997,User9997,,male,Non-Smoker,27.017487,,,Diabetic
9997,9998,User9998,23.0,male,Smoker,,148.833321,173.931480,Pneumonia
9998,9999,User9999,,female,Non-Smoker,,,,Pneumonia


In [3]:
data_cleaned = data.fillna(data.mean(numeric_only=True))
data_cleaned

Unnamed: 0,id,full_name,age,gender,smoking_status,bmi,blood_pressure,glucose_levels,condition
0,1,User0001,53.541598,male,Non-Smoker,27.423420,135.209429,135.219608,Pneumonia
1,2,User0002,30.000000,male,Non-Smoker,27.423420,105.315064,135.219608,Diabetic
2,3,User0003,18.000000,male,Non-Smoker,35.612486,135.209429,135.219608,Pneumonia
3,4,User0004,53.541598,male,Non-Smoker,27.423420,99.119829,135.219608,Pneumonia
4,5,User0005,76.000000,male,Non-Smoker,27.423420,135.209429,135.219608,Diabetic
...,...,...,...,...,...,...,...,...,...
9995,9996,User9996,53.541598,male,Non-Smoker,25.029002,152.540355,137.551451,Pneumonia
9996,9997,User9997,53.541598,male,Non-Smoker,27.017487,135.209429,135.219608,Diabetic
9997,9998,User9998,23.000000,male,Smoker,27.423420,148.833321,173.931480,Pneumonia
9998,9999,User9999,53.541598,female,Non-Smoker,27.423420,135.209429,135.219608,Pneumonia


In [4]:
label_encoder = LabelEncoder()
categorical_columns = ['gender', 'smoking_status', 'condition']  # Update with relevant columns

for col in categorical_columns:
    data_cleaned[col] = label_encoder.fit_transform(data_cleaned[col])
data_cleaned


Unnamed: 0,id,full_name,age,gender,smoking_status,bmi,blood_pressure,glucose_levels,condition
0,1,User0001,53.541598,1,0,27.423420,135.209429,135.219608,2
1,2,User0002,30.000000,1,0,27.423420,105.315064,135.219608,1
2,3,User0003,18.000000,1,0,35.612486,135.209429,135.219608,2
3,4,User0004,53.541598,1,0,27.423420,99.119829,135.219608,2
4,5,User0005,76.000000,1,0,27.423420,135.209429,135.219608,1
...,...,...,...,...,...,...,...,...,...
9995,9996,User9996,53.541598,1,0,25.029002,152.540355,137.551451,2
9996,9997,User9997,53.541598,1,0,27.017487,135.209429,135.219608,1
9997,9998,User9998,23.000000,1,1,27.423420,148.833321,173.931480,2
9998,9999,User9999,53.541598,0,0,27.423420,135.209429,135.219608,2


In [5]:
numeric_columns = ['age', 'bmi', 'blood_pressure', 'glucose_levels']  # Update with numeric columns
scaler = StandardScaler()
data_cleaned[numeric_columns] = scaler.fit_transform(data_cleaned[numeric_columns])
data_cleaned

Unnamed: 0,id,full_name,age,gender,smoking_status,bmi,blood_pressure,glucose_levels,condition
0,1,User0001,0.000000,1,0,0.000000,1.778694e-15,-1.095971e-15,2
1,2,User0002,-1.524787,1,0,0.000000,-1.870856e+00,-1.095971e-15,1
2,3,User0003,-2.302025,1,0,1.660533,1.778694e-15,-1.095971e-15,2
3,4,User0004,0.000000,1,0,0.000000,-2.258567e+00,-1.095971e-15,2
4,5,User0005,1.454628,1,0,0.000000,1.778694e-15,-1.095971e-15,1
...,...,...,...,...,...,...,...,...,...
9995,9996,User9996,0.000000,1,0,-0.485526,1.084608e+00,8.991828e-02,2
9996,9997,User9997,0.000000,1,0,-0.082313,1.778694e-15,-1.095971e-15,1
9997,9998,User9998,-1.978176,1,1,0.000000,8.526135e-01,1.492770e+00,2
9998,9999,User9999,0.000000,0,0,0.000000,1.778694e-15,-1.095971e-15,2


In [6]:
data_cleaned.isnull().sum()

id                0
full_name         0
age               0
gender            0
smoking_status    0
bmi               0
blood_pressure    0
glucose_levels    0
condition         0
dtype: int64

In [7]:
X = data_cleaned.drop(columns=['condition', 'full_name']) # Replace 'condition' with your target column
y = data_cleaned['condition']
X



Unnamed: 0,id,age,gender,smoking_status,bmi,blood_pressure,glucose_levels
0,1,0.000000,1,0,0.000000,1.778694e-15,-1.095971e-15
1,2,-1.524787,1,0,0.000000,-1.870856e+00,-1.095971e-15
2,3,-2.302025,1,0,1.660533,1.778694e-15,-1.095971e-15
3,4,0.000000,1,0,0.000000,-2.258567e+00,-1.095971e-15
4,5,1.454628,1,0,0.000000,1.778694e-15,-1.095971e-15
...,...,...,...,...,...,...,...
9995,9996,0.000000,1,0,-0.485526,1.084608e+00,8.991828e-02
9996,9997,0.000000,1,0,-0.082313,1.778694e-15,-1.095971e-15
9997,9998,-1.978176,1,1,0.000000,8.526135e-01,1.492770e+00
9998,9999,0.000000,0,0,0.000000,1.778694e-15,-1.095971e-15


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,id,age,gender,smoking_status,bmi,blood_pressure,glucose_levels
9254,9255,0.000000,1,0,0.000000,1.778694e-15,1.950133e-01
1561,1562,0.000000,1,1,2.499041,1.778694e-15,-1.100115e+00
1670,1671,0.000000,1,1,1.694279,1.778694e-15,-1.095971e-15
6087,6088,0.000000,1,0,0.000000,1.778694e-15,-1.095971e-15
6669,6670,0.000000,1,0,0.000000,-1.803147e+00,-1.095971e-15
...,...,...,...,...,...,...,...
5734,5735,-0.682778,0,0,0.000000,-2.359360e+00,-1.095971e-15
5191,5192,1.001239,1,1,0.000000,1.778694e-15,-1.973339e+00
5390,5391,-0.035079,1,0,0.000000,1.778694e-15,1.379394e+00
860,861,0.677390,1,0,0.000000,1.778694e-15,4.731224e-01


In [9]:
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
y_pred = logistic_model.predict(X_test)

# Step 8: Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")



Accuracy: 0.61
