In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
DATA_DIR = '../data/cleaned_data.csv'
df = pd.read_csv(DATA_DIR)
df.head()

Unnamed: 0,Country,Age,BMI,Menstrual Regularity,Hirsutism,Acne Severity,Family History of PCOS,Insulin Resistance,Lifestyle Score,Stress Levels,Urban/Rural,Socioeconomic Status,Awareness of PCOS,Fertility Concerns,Undiagnosed PCOS Likelihood,Ethnicity,Diagnosis
0,Madagascar,26,Overweight,Regular,Yes,Severe,Yes,Yes,2,Low,Rural,High,Yes,No,0.107938,Hispanic,Yes
1,Vietnam,16,Underweight,Regular,Yes,,No,Yes,4,High,Rural,Middle,Yes,No,0.156729,Other,No
2,Somalia,41,Normal,Regular,No,Moderate,No,No,7,Medium,Urban,Middle,Yes,Yes,0.202901,Other,No
3,Malawi,27,Normal,Irregular,No,Mild,No,No,10,Low,Urban,High,Yes,No,0.073926,Caucasian,Yes
4,France,26,Overweight,Irregular,Yes,,No,No,7,Medium,Urban,Middle,No,No,0.229266,Caucasian,No


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 17 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Country                      120000 non-null  object 
 1   Age                          120000 non-null  int64  
 2   BMI                          120000 non-null  object 
 3   Menstrual Regularity         120000 non-null  object 
 4   Hirsutism                    120000 non-null  object 
 5   Acne Severity                59915 non-null   object 
 6   Family History of PCOS       120000 non-null  object 
 7   Insulin Resistance           120000 non-null  object 
 8   Lifestyle Score              120000 non-null  int64  
 9   Stress Levels                120000 non-null  object 
 10  Urban/Rural                  120000 non-null  object 
 11  Socioeconomic Status         120000 non-null  object 
 12  Awareness of PCOS            120000 non-null  object 
 13 

In [4]:
df.columns = [col.replace(' ','_') for col in df.columns]

In [5]:
df.isna().sum()

Country                            0
Age                                0
BMI                                0
Menstrual_Regularity               0
Hirsutism                          0
Acne_Severity                  60085
Family_History_of_PCOS             0
Insulin_Resistance                 0
Lifestyle_Score                    0
Stress_Levels                      0
Urban/Rural                        0
Socioeconomic_Status               0
Awareness_of_PCOS                  0
Fertility_Concerns                 0
Undiagnosed_PCOS_Likelihood        0
Ethnicity                          0
Diagnosis                          0
dtype: int64

In [6]:
df['Acne_Severity'].fillna('NA',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Acne_Severity'].fillna('NA',inplace=True)


In [7]:
df['Lifestyle_Score'].unique()

array([ 2,  4,  7, 10,  3,  8,  1,  5,  9,  6])

In [8]:
df.isna().sum()

Country                        0
Age                            0
BMI                            0
Menstrual_Regularity           0
Hirsutism                      0
Acne_Severity                  0
Family_History_of_PCOS         0
Insulin_Resistance             0
Lifestyle_Score                0
Stress_Levels                  0
Urban/Rural                    0
Socioeconomic_Status           0
Awareness_of_PCOS              0
Fertility_Concerns             0
Undiagnosed_PCOS_Likelihood    0
Ethnicity                      0
Diagnosis                      0
dtype: int64

In [9]:
df = df.drop(columns=['Undiagnosed_PCOS_Likelihood'])

In [10]:
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    print("*"*10,col,"*"*10)
    print(df[col].unique())
    print()

********** Country **********
['Madagascar' 'Vietnam' 'Somalia' 'Malawi' 'France' 'Rwanda' 'Tanzania'
 'United States' 'Italy' 'Australia' 'India' 'Argentina' 'Morocco'
 'Zambia' 'Romania' 'Sudan' 'Benin' 'Burkina Faso' 'Nepal' 'Mali'
 'Malaysia' 'Chile' 'Mozambique' 'Ivory Coast' 'Taiwan' 'Nigeria'
 'Zimbabwe' 'Uzbekistan' 'Germany' 'Indonesia' 'Egypt' 'Russia' 'Chad'
 'Peru' 'Bangladesh' 'Iraq' 'Canada' 'Cameroon' 'Brazil' 'North Korea'
 'Kazakhstan' 'Uganda' 'Guinea' 'Yemen' 'Saudi Arabia' 'South Korea'
 'Afghanistan' 'Spain' 'Ghana' 'Guatemala' 'China' 'Japan' 'Pakistan'
 'Kenya' 'Ethiopia' 'South Africa' 'Poland' 'Colombia' 'Burundi'
 'Venezuela' 'Philippines' 'Ukraine' 'Ecuador' 'Sri Lanka' 'Cambodia'
 'Niger' 'Thailand' 'Netherlands' 'Iran' 'Senegal' 'Turkey'
 'United Kingdom' 'Syria' 'Algeria' 'Myanmar' 'Angola' 'Mexico']

********** BMI **********
['Overweight' 'Underweight' 'Normal' 'Obese']

********** Menstrual_Regularity **********
['Regular' 'Irregular']

********** Hirsu

In [11]:
X = df.drop(columns = ['Diagnosis'])
y = df['Diagnosis']

In [12]:
cat_cols

Index(['Country', 'BMI', 'Menstrual_Regularity', 'Hirsutism', 'Acne_Severity',
       'Family_History_of_PCOS', 'Insulin_Resistance', 'Stress_Levels',
       'Urban/Rural', 'Socioeconomic_Status', 'Awareness_of_PCOS',
       'Fertility_Concerns', 'Ethnicity', 'Diagnosis'],
      dtype='object')

In [13]:
num_cols = X.select_dtypes(include = 'number').columns.to_list()
ordinal_cols = ['BMI','Acne_Severity','Stress_Levels','Socioeconomic_Status']
nominal_cols = []
for col in cat_cols[:-1]:
    if col not in ordinal_cols:
        nominal_cols.append(col)

print("Numerical_Columns : ",num_cols)
print("Ordinal Columns : ",ordinal_cols)
print("Nominal Columns : ",nominal_cols)

Numerical_Columns :  ['Age', 'Lifestyle_Score']
Ordinal Columns :  ['BMI', 'Acne_Severity', 'Stress_Levels', 'Socioeconomic_Status']
Nominal Columns :  ['Country', 'Menstrual_Regularity', 'Hirsutism', 'Family_History_of_PCOS', 'Insulin_Resistance', 'Urban/Rural', 'Awareness_of_PCOS', 'Fertility_Concerns', 'Ethnicity']


In [14]:

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,stratify=y)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(84000, 15) (84000,)
(36000, 15) (36000,)


In [15]:
transformer = ColumnTransformer([
    ('Numerical_pipeline',StandardScaler(),num_cols),
    ('Nominal_cols',OneHotEncoder(),nominal_cols),
    ('Ordinal_cols',OrdinalEncoder(),ordinal_cols),
],remainder='passthrough')

In [16]:
transformer.fit(X_train)

In [17]:
X_train_tr = transformer.transform(X_train)
X_test_tr = transformer.transform(X_test)

In [18]:
smote = SMOTE(random_state=42)
X_train_sm,y_train_sm = smote.fit_resample(X_train_tr,y_train)


In [19]:
y_train_sm.value_counts()

Diagnosis
No     75183
Yes    75183
Name: count, dtype: int64

In [20]:
# Define a list of classification models to compare
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    # 'Support Vector Machine': SVC(probability=True),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    # Train the model
    print(f"Training {name} model..")
    model.fit(X_train_tr, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_tr)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Store results
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Display results
print("Model Performance:")
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  Accuracy: {metrics['Accuracy']:.4f}")
    print(f"  Precision: {metrics['Precision']:.4f}")
    print(f"  Recall: {metrics['Recall']:.4f}")
    print(f"  F1 Score: {metrics['F1 Score']:.4f}")
    print()

# Select the best model based on F1 score (higher is better)
best_model_name = max(results, key=lambda x: results[x]['F1 Score'])
best_model_metrics = results[best_model_name]
print(f"Best Model: {best_model_name}")
print(f"  Accuracy: {best_model_metrics['Accuracy']:.4f}")
print(f"  Precision: {best_model_metrics['Precision']:.4f}")
print(f"  Recall: {best_model_metrics['Recall']:.4f}")
print(f"  F1 Score: {best_model_metrics['F1 Score']:.4f}")

Training Logistic Regression model..


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Decision Tree model..
Training Random Forest model..
Training Gradient Boosting model..


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training K-Nearest Neighbors model..
Model Performance:
Logistic Regression:
  Accuracy: 0.8950
  Precision: 0.8011
  Recall: 0.8950
  F1 Score: 0.8454

Decision Tree:
  Accuracy: 0.7992
  Precision: 0.8122
  Recall: 0.7992
  F1 Score: 0.8055

Random Forest:
  Accuracy: 0.8948
  Precision: 0.8106
  Recall: 0.8948
  F1 Score: 0.8454

Gradient Boosting:
  Accuracy: 0.8950
  Precision: 0.8011
  Recall: 0.8950
  F1 Score: 0.8454

K-Nearest Neighbors:
  Accuracy: 0.8877
  Precision: 0.8092
  Recall: 0.8877
  F1 Score: 0.8430

Best Model: Logistic Regression
  Accuracy: 0.8950
  Precision: 0.8011
  Recall: 0.8950
  F1 Score: 0.8454


In [None]:
# # Define the RandomForestClassifier
# rf_classifier = RandomForestClassifier(random_state=42)

# # Define the hyperparameter grid to search
# param_grid = {
#     'n_estimators': [100, 200, 300],  # Number of trees in the forest
#     'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
#     'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
#     'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
#     'bootstrap': [True, False]  # Whether bootstrap samples are used
# }

# # Set up GridSearchCV
# grid_search = GridSearchCV(
#     estimator=rf_classifier,
#     param_grid=param_grid,
#     scoring='accuracy',  # Metric to evaluate
#     cv=5,  # Number of cross-validation folds
#     verbose=1,  # Verbosity level
#     n_jobs=-1  # Use all available CPU cores
# )

# # Perform the grid search
# grid_search.fit(X_train_tr, y_train)

# # Get the best parameters and best score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print(f"Best Parameters: {best_params}")
# print(f"Best Cross-Validation Accuracy: {best_score:.4f}")

# # Evaluate the best model on the test set
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test_tr)

# # Calculate accuracy and print classification report
# test_accuracy = accuracy_score(y_test, y_pred)
# print(f"Test Accuracy: {test_accuracy:.4f}")
# print("Classification Report:")
# print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [43]:
model = RandomForestClassifier(
    n_estimators=400,
    n_jobs=-1,
    max_depth=10
)

In [44]:
model.fit(X_train_tr,y_train)

In [45]:
y_pred = model.predict(X_test_tr)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Accuracy : {accuracy}")
print(f"Precision : {precision}")
print(f"Recall : {recall}")
print(f"F1 - Score : {f1}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy : 0.8950555555555556
Precision : 0.8011244475308642
Recall : 0.8950555555555556
F1 - Score : 0.8454891416584418


In [34]:
import pickle

In [36]:
!mkdir '../artifacts/'

In [46]:
with open('../artifacts/preproccessor.pkl','wb') as file:
    pickle.dump(transformer,file)

In [47]:
with open('../artifacts/model.pkl','wb') as file:
    pickle.dump(model,file)