In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector as selector

In [3]:
# Load the patient dataset into a DataFrame (path is the Colab upload location).
data = pd.read_csv(filepath_or_buffer="/content/final_dataset.csv")

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Age,Alcohol (units/week),Smoking (cigarettes/day),Stress Level (1-10),Sleep (hours/night),Cholesterol (mg/dL),BMI,Blood Sugar (mg/dL),Heart Rate (bpm)
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,48.3151,4.9744,9.9938,5.5343,6.4988,224.707,28.4885,184.765,79.7758
std,17.934369,3.179049,6.074459,2.86193,1.707656,43.60706,6.339234,66.352452,11.779695
min,18.0,0.0,0.0,1.0,4.0,150.0,18.0,70.0,60.0
25%,33.0,2.0,5.0,3.0,5.0,187.0,23.0,127.0,70.0
50%,48.0,5.0,10.0,6.0,6.0,225.0,28.0,185.0,80.0
75%,64.0,8.0,15.0,8.0,8.0,262.0,34.0,242.0,90.0
max,79.0,10.0,20.0,10.0,9.0,300.0,39.0,300.0,100.0


In [5]:
# (rows, columns) of the loaded dataset.
data.shape

(10000, 20)

In [6]:
# Per-column overview: name, non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Age                                10000 non-null  int64 
 1   Gender                             10000 non-null  object
 2   Family History of Chronic Disease  10000 non-null  object
 3   Diet                               10000 non-null  object
 4   Physical Activity                  10000 non-null  object
 5   Alcohol (units/week)               10000 non-null  int64 
 6   Smoking (cigarettes/day)           10000 non-null  int64 
 7   Stress Level (1-10)                10000 non-null  int64 
 8   Blood Pressure (mmHg)              10000 non-null  object
 9   Sleep (hours/night)                10000 non-null  int64 
 10  Cholesterol (mg/dL)                10000 non-null  int64 
 11  Past Medical Condition             10000 non-null  object
 12  BMI  

In [7]:
# Number of missing values in each column.
data.isnull().sum()

Age                                  0
Gender                               0
Family History of Chronic Disease    0
Diet                                 0
Physical Activity                    0
Alcohol (units/week)                 0
Smoking (cigarettes/day)             0
Stress Level (1-10)                  0
Blood Pressure (mmHg)                0
Sleep (hours/night)                  0
Cholesterol (mg/dL)                  0
Past Medical Condition               0
BMI                                  0
Ethnicity/Race                       0
Blood Sugar (mg/dL)                  0
Heart Rate (bpm)                     0
Work Status                          0
Marital Status                       0
Place of Residence                   0
Chronic Disease Type                 0
dtype: int64

In [8]:
# Target: the chronic disease label. Features: every remaining column.
y = data['Chronic Disease Type']
X = data.drop(columns=['Chronic Disease Type'])

In [9]:
# Split the feature columns by dtype. Deriving the lists from X (instead of
# typing column names by hand) guarantees that every name exists in the data
# and that each column is routed according to how pandas actually parsed it.
numeric_features = X.select_dtypes(exclude="object").columns.tolist()
categorical_features = X.select_dtypes(include="object").columns.tolist()

# NOTE(review): 'Blood Pressure (mmHg)' is an object column holding strings
# such as '119/70', so it is treated as categorical and one-hot encoded as a
# whole string. Consider splitting it into two numeric readings upstream.

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' makes categories unseen during fit encode
# as all zeros at predict time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its transformer. The explicit lists
# select the same columns, in the same order, as a dtype-based selector
# applied to X, and keep the feature lists as the single source of truth.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [10]:
# Random forest classifier with a fixed random_state.
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Bundle preprocessing and the classifier so raw feature frames can be passed
# straight to fit/predict.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit the whole pipeline on all rows of the dataset.
pipeline.fit(X, y)

In [12]:
# Inclusive (low, high) bounds accepted for each numeric feature.
# Alcohol and smoking start at 0 because the dataset itself contains 0 for
# both (non-drinkers / non-smokers must be enterable).
NUMERIC_RANGES = {
    'Age': (18, 80),
    'Alcohol (units/week)': (0, 10),
    'Smoking (cigarettes/day)': (0, 20),
    'Stress Level (1-10)': (1, 10),
    'Sleep (hours/night)': (0, 15),
    'Cholesterol (mg/dL)': (0, 300),
    'BMI': (0, 100),
    'Blood Sugar (mg/dL)': (0, 300),
    'Heart Rate (bpm)': (0, 109),
}


def ask_patient_value(column, categories=(), numeric_range=None, read=input):
    """Prompt until a valid value for ``column`` is entered and return it.

    Parameters
    ----------
    column : str
        Feature name shown in the prompt.
    categories : collection of str
        Allowed values when the feature is categorical (ignored when
        ``numeric_range`` is given).
    numeric_range : tuple[int, int] or None
        Inclusive ``(low, high)`` bounds; when given, the answer must be a
        non-negative whole number inside the bounds.
    read : callable
        Function used to read a line of input (defaults to ``input``).

    Returns
    -------
    int or str
        ``int`` for numeric features, the entered string for categorical ones.
    """
    while True:
        raw = read(f"Enter patient's {column}: ").strip()

        if numeric_range is not None:
            low, high = numeric_range
            # isascii() guards against characters such as '²' that satisfy
            # str.isdigit() but make int() raise ValueError.
            if raw.isascii() and raw.isdigit() and low <= int(raw) <= high:
                return int(raw)
            print("Invalid input. Please enter a valid value.")
            print(f"Valid values for {column}: whole numbers from {low} to {high}")
        elif raw in categories:
            # The same label (e.g. 'Yes') may legitimately appear in several
            # features, so no cross-feature uniqueness check is applied.
            return raw
        else:
            print("Invalid input. Please enter a valid value.")
            print(f"Valid values for {column}: {', '.join(map(str, sorted(categories)))}")


# Collect one value per feature column, in the column order of X.
patient_data = {}
for column in X.columns:
    bounds = NUMERIC_RANGES.get(column)
    # Categorical answers must match a value already present in the dataset.
    allowed = () if bounds is not None else set(data[column].unique())
    patient_data[column] = ask_patient_value(
        column, categories=allowed, numeric_range=bounds
    )


Enter patient's Age: 23
Enter patient's Gender: M
Invalid input. Please enter a valid value.
Valid values for Gender: Female, Male
Enter patient's Gender: Male
Enter patient's Family History of Chronic Disease: om
Invalid input. Please enter a valid value.
Valid values for Family History of Chronic Disease: No, Yes
Enter patient's Family History of Chronic Disease: Yes
Enter patient's Diet: UHJ
Invalid input. Please enter a valid value.
Valid values for Diet: Balanced, High fat, High protein, Low Carb
Enter patient's Diet: Balanced
Enter patient's Physical Activity: iks
Invalid input. Please enter a valid value.
Valid values for Physical Activity: Moderate (3 times/week), Sedentary, Vigorous (5 times/week)
Enter patient's Physical Activity: Sedentary
Enter patient's Alcohol (units/week): 4
Enter patient's Smoking (cigarettes/day): 5
Enter patient's Stress Level (1-10): 7
Enter patient's Blood Pressure (mmHg): 119/70
Enter patient's Sleep (hours/night): 5
Enter patient's Cholesterol (mg

In [19]:
# Fail early with a clear message if the input cell was not completed.
missing_columns = [column for column in X.columns if column not in patient_data]
if missing_columns:
    raise ValueError(f"patient_data is missing values for: {missing_columns}")

# Build a one-row frame with the training column order and dtypes. Values
# collected through input() may be strings; casting to X's dtypes makes the
# numeric features numeric instead of relying on implicit conversion inside
# the pipeline.
patient_df = (
    pd.DataFrame([patient_data], columns=X.columns)
    .astype(X.dtypes.to_dict())
)

# Make predictions for the new patient
predictions = pipeline.predict(patient_df)

# Print the predicted diseases
print("Predicted Diseases:")
for disease in predictions:
    print("-", disease)






Predicted Diseases:
- Depression
