In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# loading the diabetes dataset to a pandas DataFrame
pcos_lr = pd.read_csv(r"C:\Users\Muthukumar\Desktop\Final Year Project\pcos_dataset.csv")

In [3]:
# printing the first 5 rows of the dataset
pcos_lr.head()

Unnamed: 0,Age,BMI,Menstrual_Irregularity,Testosterone_Level(ng/dL),Antral_Follicle_Count,PCOS_Diagnosis
0,24,34.7,1,25.2,20,0
1,37,26.4,0,57.1,25,0
2,32,23.6,0,92.7,28,0
3,28,28.8,0,63.1,26,0
4,25,22.1,1,59.8,8,0


In [4]:
pcos_lr.shape

(1000, 6)

In [5]:
pcos_lr.describe()

Unnamed: 0,Age,BMI,Menstrual_Irregularity,Testosterone_Level(ng/dL),Antral_Follicle_Count,PCOS_Diagnosis
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,31.771,26.387,0.53,60.1595,17.469,0.199
std,8.463462,4.93554,0.499349,23.160204,7.069301,0.399448
min,18.0,18.1,0.0,20.0,5.0,0.0
25%,24.0,21.9,0.0,41.7,12.0,0.0
50%,32.0,26.4,1.0,60.0,18.0,0.0
75%,39.0,30.5,1.0,80.3,23.25,0.0
max,45.0,35.0,1.0,99.8,29.0,1.0


In [6]:
pcos_lr['PCOS_Diagnosis'].value_counts()

PCOS_Diagnosis
0    801
1    199
Name: count, dtype: int64

In [7]:
pcos_lr.groupby('PCOS_Diagnosis').mean()

Unnamed: 0_level_0,Age,BMI,Menstrual_Irregularity,Testosterone_Level(ng/dL),Antral_Follicle_Count
PCOS_Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,32.043695,25.457928,0.413233,57.842447,16.792759
1,30.673367,30.126633,1.0,69.48593,20.190955


In [8]:
X = pcos_lr.drop(columns='PCOS_Diagnosis', axis=1)
Y = pcos_lr['PCOS_Diagnosis']

In [9]:
print(X)

     Age   BMI  Menstrual_Irregularity  Testosterone_Level(ng/dL)  \
0     24  34.7                       1                       25.2   
1     37  26.4                       0                       57.1   
2     32  23.6                       0                       92.7   
3     28  28.8                       0                       63.1   
4     25  22.1                       1                       59.8   
..   ...   ...                     ...                        ...   
995   34  18.4                       1                       95.7   
996   45  28.9                       1                       28.5   
997   37  28.3                       0                       32.4   
998   41  27.3                       0                       95.6   
999   22  21.9                       1                       78.9   

     Antral_Follicle_Count  
0                       20  
1                       25  
2                       28  
3                       26  
4                        8

In [10]:
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: PCOS_Diagnosis, Length: 1000, dtype: int64


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [12]:
print(X.shape, X_train.shape, X_test.shape)

(1000, 5) (800, 5) (200, 5)


In [13]:
model = LogisticRegression()

In [14]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

In [15]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [16]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9125


In [17]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [18]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.925


In [19]:
input_data = (37,26.4,0,57.1,25)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person does not have PCOS')
else:
  print('The person has PCOS')

[0]
The person does not have PCOS




In [20]:
from sklearn.metrics import classification_report, confusion_matrix
Y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(Y_test, Y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(Y_test, Y_pred))

Accuracy: 0.925

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95       160
           1       0.86      0.75      0.80        40

    accuracy                           0.93       200
   macro avg       0.90      0.86      0.88       200
weighted avg       0.92      0.93      0.92       200


Confusion Matrix:
 [[155   5]
 [ 10  30]]


In [21]:
import pickle

with open('PCOS_LR.pkl', 'wb') as f:
    pickle.dump(model, f)

print("✅ Random Forest model saved as 'PCOS_LR.pkl'")

✅ Random Forest model saved as 'PCOS_LR.pkl'
