In [17]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [18]:
# Load the raw osteoporosis table and preview the first rows.
# "None" is a genuine category in MedCondition/Medications (it shows up as text in
# the recorded preview), but pandas >= 2.0 lists "None" among its default NA
# tokens, which would turn those cells into NaN and make the later dropna()
# discard the rows. Disable the default token list and treat only empty cells as
# missing.
# NOTE(review): if the CSV marks missing values with other tokens (e.g. "NA"),
# add them to na_values.
data = pd.read_csv("osteoporosisfinal.csv", keep_default_na=False, na_values=[""])
print("Original Data:")
print(data.head())

Original Data:
        Id  Age  Gender         Hormone FHistory              Race  \
0  1734616   69  Female          Normal      Yes             Asian   
1  1419098   32  Female          Normal      Yes             Asian   
2  1797916   89  Female  Postmenopausal       No         Caucasian   
3  1805337   78  Female          Normal       No         Caucasian   
4  1351334   38    Male  Postmenopausal      Yes  African American   

        Weight CalciumIn   Activity Smoking          MedCondition  \
0  Underweight       Low  Sedentary     Yes  Rheumatoid Arthritis   
1  Underweight       Low  Sedentary      No                  None   
2       Normal  Adequate     Active      No       Hyperthyroidism   
3  Underweight  Adequate  Sedentary     Yes  Rheumatoid Arthritis   
4       Normal       Low     Active     Yes  Rheumatoid Arthritis   

       Medications Fractures  Osteoporosis  
0  Corticosteroids       Yes             1  
1             None       Yes             1  
2  Corticoster

In [19]:
# Remove the Id, Race, Medications and CalciumIn columns, then discard every row
# that still contains a missing value.
dropped_columns = ['Id', "Race", "Medications", "CalciumIn"]
data_final = data.drop(columns=dropped_columns).dropna()
print("\nCleaned Data:", data_final.head(), sep="\n")


Cleaned Data:
   Age  Gender         Hormone FHistory       Weight   Activity Smoking  \
0   69  Female          Normal      Yes  Underweight  Sedentary     Yes   
1   32  Female          Normal      Yes  Underweight  Sedentary      No   
2   89  Female  Postmenopausal       No       Normal     Active      No   
3   78  Female          Normal       No  Underweight  Sedentary     Yes   
4   38    Male  Postmenopausal      Yes       Normal     Active     Yes   

           MedCondition Fractures  Osteoporosis  
0  Rheumatoid Arthritis       Yes             1  
1                  None       Yes             1  
2       Hyperthyroidism        No             1  
3  Rheumatoid Arthritis        No             1  
4  Rheumatoid Arthritis       Yes             1  


In [20]:
# LabelEncoder maps each distinct label of a column to an integer code.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()  # encoder instance, not fitted yet

In [21]:
# Encode every categorical column with its OWN LabelEncoder and keep the fitted
# encoders. A single shared encoder is refit on each column, so after the loop it
# only remembers the last column's labels and the text -> integer mapping of all
# other columns is lost; new inputs could then not be encoded consistently at
# prediction time.
categorical_cols = ['Gender','Hormone','FHistory','Weight','Activity','Smoking','MedCondition','Fractures']
encoders = {}  # column name -> LabelEncoder fitted on that column
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    data_final[col] = encoders[col].fit_transform(data_final[col])

# Keep `le` bound to the encoder fitted on the last column, exactly as the shared
# encoder ended up before, in case a later cell still reads it.
le = encoders[categorical_cols[-1]]

In [22]:
# Preview the frame after the categorical columns were replaced by integer codes.
print("\nEncoded Data:", data_final.head(), sep="\n")


Encoded Data:
   Age  Gender  Hormone  FHistory  Weight  Activity  Smoking  MedCondition  \
0   69       0        0         1       1         1        1             2   
1   32       0        0         1       1         1        0             1   
2   89       0        1         0       0         0        0             0   
3   78       0        0         0       1         1        1             2   
4   38       1        1         1       0         0        1             2   

   Fractures  Osteoporosis  
0          1             1  
1          1             1  
2          0             1  
3          0             1  
4          1             1  


In [23]:
# Positional split of the encoded frame: every column except the last one is a
# feature, the last column is the label.
label_position = data_final.shape[1] - 1
X = data_final.iloc[:, :label_position].to_numpy()
y = data_final.iloc[:, label_position].to_numpy()

In [24]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split identical from run to run.
from sklearn.model_selection import train_test_split
split_options = dict(test_size=0.2, random_state=0, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [25]:
# Evaluation metrics from scikit-learn.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Model name -> accuracy on the held-out split; each model cell adds its entry.
results = dict()

In [26]:
# Logistic Regression: fit on the training split, predict the held-out split and
# record the accuracy.
from sklearn.linear_model import LogisticRegression
logm = LogisticRegression(max_iter=1000).fit(X_train, y_train)  # fit() returns the estimator
logm_pred = logm.predict(X_test)
results["Logistic Regression"] = accuracy_score(y_true=y_test, y_pred=logm_pred)

In [27]:
# K-Nearest Neighbours with k=5: fit, predict the held-out split, record accuracy.
# NOTE(review): KNN is distance-based and the features are passed in unscaled, so
# the wide-range Age column weighs far more than the small integer codes —
# consider evaluating a StandardScaler pipeline.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)  # fit() returns the estimator
knn_pred = knn.predict(X_test)
results["KNN"] = accuracy_score(y_true=y_test, y_pred=knn_pred)

In [28]:
# Gaussian Naive Bayes: fit, predict the held-out split, record accuracy.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB().fit(X_train, y_train)  # fit() returns the estimator
nb_pred = nb.predict(X_test)
results["Naive Bayes"] = accuracy_score(y_true=y_test, y_pred=nb_pred)


In [29]:
# Support-vector classifier with a linear kernel: fit, predict the held-out
# split, record accuracy.
from sklearn.svm import SVC
svm_model = SVC(kernel="linear").fit(X_train, y_train)  # fit() returns the estimator
svm_pred = svm_model.predict(X_test)
results["SVM"] = accuracy_score(y_true=y_test, y_pred=svm_pred)

In [30]:
# Report every recorded accuracy, rounded to two decimals, in insertion order.
print("\nModel Accuracies:")
for name, score in results.items():
    print(f"{name}: {score:.2f}")


Model Accuracies:
Logistic Regression: 0.83
KNN: 0.89
Naive Bayes: 0.87
SVM: 0.86


In [31]:
# Select the classifier with the highest recorded accuracy instead of hard-coding
# one, so the saved model stays correct if the data or the scores change.
# The keys match the names each model cell used when writing into `results`.
# NOTE(review): the choice is made on the same held-out split that is reported,
# so the winning score is slightly optimistic.
fitted_models = {
    "Logistic Regression": logm,
    "KNN": knn,
    "Naive Bayes": nb,
    "SVM": svm_model,
}
# A missing entry in `results` raises KeyError here, which flags a model cell
# that was not run. Ties keep the first model in the order listed above.
best_name = max(fitted_models, key=results.__getitem__)
best_model = fitted_models[best_name]
print(f"Best model: {best_name} ({results[best_name]:.2f})")

In [32]:
# Persist the selected model. The context manager guarantees the file is flushed
# and closed even if pickling fails; a bare open() would leave the handle open
# and the pickle possibly incomplete on disk.
with open("model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)
print("\n Model saved as model.pkl")


 Model saved as model.pkl
