In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset. The path is relative, so 'pcos_data.csv' must be in
# the notebook's working directory.
df = pd.read_csv('pcos_data.csv')

In [None]:
# Preview the first rows to sanity-check that the file parsed as expected.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
import pandas as pd


# Flag rows that exactly repeat an earlier row. The mask is computed once and
# reused for both the count and the row selection.
duplicate_mask = df.duplicated()
num_duplicates = duplicate_mask.sum()
duplicate_rows = df[duplicate_mask]

# Report the count as well as the rows: when there are no duplicates, an
# empty frame printed on its own is easy to misread.
print(f"Duplicate rows: {num_duplicates}")
print(duplicate_rows)


In [None]:
# Summarise each column's dtype and non-null count.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Summary statistics for the frame (pandas default settings).
df.describe()

## Let's Train the Model

In [None]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [None]:
# Re-display the first rows before preparing the features.
df.head()

In [None]:
# Column headers may carry stray leading/trailing whitespace; trim it so the
# exact-name lookups in later cells match.
trimmed_headers = df.columns.str.strip()
df.columns = trimmed_headers


In [None]:
# Show the dtypes pandas inferred for the three lab-value columns.
lab_columns = ["I_beta-HCG(mIU/mL)", "II_beta-HCG(mIU/mL)", "AMH(ng/mL)"]
lab_dtypes = df[lab_columns].dtypes
print(lab_dtypes)

In [None]:
# List, per column, the distinct entries that cannot be parsed as numbers.
# Entries that are already missing also fail the parse and are reported too.
for column_name in ("I_beta-HCG(mIU/mL)", "II_beta-HCG(mIU/mL)", "AMH(ng/mL)"):
    parsed = pd.to_numeric(df[column_name], errors='coerce')
    unparseable = df.loc[parsed.isna(), column_name]
    if not unparseable.empty:
        print(f"{column_name}: {unparseable.unique()}")

In [None]:
# Coerce the lab-value columns to a numeric dtype; any entry that cannot be
# parsed (e.g. stray text) becomes NaN.
for lab_column in ("I_beta-HCG(mIU/mL)", "II_beta-HCG(mIU/mL)", "AMH(ng/mL)"):
    df[lab_column] = pd.to_numeric(df[lab_column], errors='coerce')

In [None]:
# Keep only the rows in which all three features and the target are present.
required_columns = ["I_beta-HCG(mIU/mL)", "II_beta-HCG(mIU/mL)", "AMH(ng/mL)", "PCOS"]
df = df.dropna(subset=required_columns)


In [None]:
# StandardScaler is imported here because this is the first cell that uses
# it; without the import a fresh Restart & Run All stops with a NameError.
from sklearn.preprocessing import StandardScaler

# Feature matrix (three lab values) and target label.
X = df[["I_beta-HCG(mIU/mL)", "II_beta-HCG(mIU/mL)", "AMH(ng/mL)"]]
y = df["PCOS"]

# Standardise each feature to zero mean and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Select the predictors and the label.
feature_names = ["I_beta-HCG(mIU/mL)", "II_beta-HCG(mIU/mL)", "AMH(ng/mL)"]
X = df[feature_names]
y = df["PCOS"]

# Fit the scaler, then transform, as two explicit steps.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame (original column names) for inspection.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

print(X_scaled_df)


In [None]:
# Distinct labels of the target column, followed by how often each occurs.
target = df["PCOS"]
print(target.unique())
print(target.value_counts())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Classification models.
# Estimators with a stochastic component are seeded so their accuracies are
# reproducible across kernel restarts; 42 is the seed this notebook already
# passes to train_test_split and SMOTE.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42)
}

# Classification evaluation function
def evaluate_classification(true, predicted):
    """Return the accuracy score of ``predicted`` against the labels ``true``."""
    return accuracy_score(true, predicted)

In [None]:
# Rebuild the feature matrix and the target from the cleaned frame.
predictor_columns = ["I_beta-HCG(mIU/mL)", "II_beta-HCG(mIU/mL)", "AMH(ng/mL)"]
X = df[predictor_columns]
y = df["PCOS"]

# Standardise the features (fit and transform chained explicitly).
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the scaled samples for testing; the seed fixes the shuffle.
split_parts = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit every candidate classifier on the training split and print its accuracy
# on the held-out split (PCOS is treated as a class label).
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{model_name}: {accuracy_score(y_test, y_pred):.4f}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Logistic Regression was the top scorer in the author's run (70.37%); the
# figure is hard-coded here and may differ on a rerun.
best_model = LogisticRegression()
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Compute both summaries up front, then print them under their headings.
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("=== DETAILED EVALUATION ===")
print("Classification Report:")
print(report_text)

print("\nConfusion Matrix:")
print(confusion)

In [None]:
import pandas as pd

# Collect one accuracy record per classifier.
results = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    record = {'Model': model_name, 'Accuracy': accuracy_score(y_test, y_pred)}
    results.append(record)

# Build the results table, best accuracy first.
unsorted_results = pd.DataFrame(results)
results_df = unsorted_results.sort_values('Accuracy', ascending=False)
print(results_df)

In [None]:
# Class distribution of the target before any resampling.
print("Before SMOTE:")
original_counts = y.value_counts()
print(original_counts)
# Author's recorded output: 0: 363, 1: 177 (class 1 has roughly half as many samples)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE's default strategy; the seed makes the synthetic
# samples repeatable.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_scaled, y)
X_resampled, y_resampled = resampled

print("After SMOTE:")
balanced_counts = pd.Series(y_resampled).value_counts()
print(balanced_counts)
# Author's expected output: 0: 363, 1: 363 (class 1 raised to match class 0)

In [None]:
# Split the ORIGINAL samples first, then oversample the training part only.
# Resampling before the split leaks information: synthetic points land in the
# test set, and points interpolated from test rows land in the training set,
# so the reported accuracy is optimistically biased.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_scaled, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # keep the real class ratio in both partitions
)

# Balance the training partition only; the test set stays untouched real data.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

print(f"Training set: {pd.Series(y_train).value_counts()}")
print(f"Test set: {pd.Series(y_test).value_counts()}")

In [None]:
# Refit every classifier on the current training split and record its test accuracy.
results_balanced = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    record = {'Model': model_name, 'Accuracy': accuracy_score(y_test, y_pred)}
    results_balanced.append(record)

# Rank the classifiers, best accuracy first.
unsorted_balanced = pd.DataFrame(results_balanced)
results_balanced_df = unsorted_balanced.sort_values('Accuracy', ascending=False)
print("Results with SMOTE:")
print(results_balanced_df)

In [None]:
# Put the top accuracy of the two result tables side by side.
print("=== COMPARISON ===")
best_without_smote = results_df['Accuracy'].max()
print("Without SMOTE - Best accuracy:", best_without_smote)
best_with_smote = results_balanced_df['Accuracy'].max()
print("With SMOTE - Best accuracy:", best_with_smote)