In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
try:
    df = pd.read_csv('gender_detection.csv')
    print("Data loaded successfully.")
except FileNotFoundError:
    print("Error: 'gender_detection.csv' not found. Please make sure the data file is in the correct directory.")
    df = None

if df is not None:
    # Data Preprocessing
    print("\nStarting data preprocessing...")
    # Check for missing values
    print("Missing values per column:")
    print(df.isnull().sum())

    # Encode the 'Gender' column
    df['Gender'] = df['Gender'].astype('category').cat.codes
    print("'Gender' column encoded.")

    # Separate features (X) and target (y)
    X = df.drop('Gender', axis=1)
    y = df['Gender']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Data successfully preprocessed and split.")
    print("Shape of X_train:", X_train.shape)
    print("Shape of X_test:", X_test.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of y_test:", y_test.shape)

    # Model Selection and Training
    print("\nStarting model training...")
    # Instantiate the Logistic Regression model
    model = LogisticRegression()

    # Train the model
    model.fit(X_train, y_train)
    print("Logistic Regression model trained successfully.")

    # Model Evaluation
    print("\nStarting model evaluation...")
    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Print the evaluation metrics
    print("Model evaluation metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

Data loaded successfully.

Starting data preprocessing...
Missing values per column:
Height         0
Weight         0
Voice_Pitch    0
Long_Hair      0
Makeup         0
Gender         0
dtype: int64
'Gender' column encoded.
Data successfully preprocessed and split.
Shape of X_train: (800, 5)
Shape of X_test: (200, 5)
Shape of y_train: (800,)
Shape of y_test: (200,)

Starting model training...
Logistic Regression model trained successfully.

Starting model evaluation...
Model evaluation metrics:
Accuracy: 0.9650
Precision: 1.0000
Recall: 0.6316
F1-score: 0.7742
