In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report


def load_data(filepath):
    """Read a CSV source into a pandas DataFrame.

    Args:
        filepath: Anything ``pd.read_csv`` accepts as its first argument
            (e.g. a path string or an open file-like object).

    Returns:
        The DataFrame parsed by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(filepath)
    return frame


def preprocess_data(df):
    """Separate the model inputs from the label column.

    Args:
        df: DataFrame containing the columns ``mass``, ``width``,
            ``height``, ``color_score`` and ``fruit_label``.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a DataFrame holding the four feature
        columns (in the order listed above) and ``y`` is the
        ``fruit_label`` column.

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    feature_columns = ['mass', 'width', 'height', 'color_score']
    features = df[feature_columns]
    target = df['fruit_label']
    return features, target


def split_data(X, y, test_size=0.2, random_state=42, stratify=None):
    """Split features and labels into training and test subsets.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Args:
        X: Feature table.
        y: Target labels aligned with ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split
            is reproducible; defaults to 42.
        stratify: Optional array-like forwarded to ``train_test_split``.
            Pass ``y`` to keep the class proportions the same in both
            subsets, which matters when some classes are rare. The default
            ``None`` keeps the original, non-stratified behaviour.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` as produced by
        ``train_test_split``.
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )

# Train the logistic regression classifier
def train_model(X_train, y_train):
    """Fit a ``LogisticRegression`` on the training data.

    Args:
        X_train: Training feature table.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # max_iter is raised to 10000 instead of using the library default.
    # ``fit`` returns the estimator itself, so the fitted model can be
    # returned directly.
    return LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Function to evaluate the trained model
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature table.
        y_test: True labels aligned with ``X_test``.

    Returns:
        A 6-tuple ``(accuracy, f1, recall, precision, conf_matrix,
        class_report)``. F1, recall and precision are support-weighted
        averages over the classes; ``conf_matrix`` comes from
        ``confusion_matrix`` and ``class_report`` is the text produced by
        ``classification_report``.
    """
    # Predict once and reuse the result for every metric.
    y_pred = model.predict(X_test)

    # zero_division=0 yields the same value scikit-learn falls back to by
    # default (0.0) when a class has no predicted or no true samples, but
    # without emitting an UndefinedMetricWarning for each metric.
    weighted = {"average": "weighted", "zero_division": 0}

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, **weighted)
    recall = recall_score(y_test, y_pred, **weighted)
    precision = precision_score(y_test, y_pred, **weighted)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, zero_division=0)
    return accuracy, f1, recall, precision, conf_matrix, class_report


def main(filepath="C:\\Users\\Subhaan khokhar\\Downloads\\fruits.csv"):
    """Run the full pipeline: load, split, train, evaluate and report.

    Args:
        filepath: Location of the fruits CSV file. Defaults to the path
            that was previously hard-coded, so calling ``main()`` with no
            arguments behaves as before; pass another path to run the
            pipeline on a different machine or dataset copy.
    """
    df = load_data(filepath)

    print("First few rows of the dataset:")
    print(df.head())

    # Preprocess the data
    X, y = preprocess_data(df)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Train the logistic regression model
    model = train_model(X_train, y_train)

    # Evaluate the model
    accuracy, f1, recall, precision, conf_matrix, class_report = evaluate_model(
        model, X_test, y_test
    )

    # Print evaluation metrics
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)


# Run the pipeline only when executed as a script/cell, not on import.
if __name__ == "__main__":
    main()


First few rows of the dataset:
   Unnamed: 0  fruit_label fruit_name fruit_subtype    mass  width  height  \
0           0            1      apple  granny_smith  198.04   8.47    7.29   
1           1            1      apple  granny_smith  169.98   7.84    6.41   
2           2            1      apple  granny_smith  183.37   7.58    7.62   
3           3            2   mandarin      mandarin   88.73   6.31    4.55   
4           4            2   mandarin      mandarin   80.54   6.30    4.83   

   color_score  
0         0.54  
1         0.59  
2         0.60  
3         0.82  
4         0.83  
Accuracy: 0.7966101694915254
F1 Score: 0.7907088137453381
Recall: 0.7966101694915254
Precision: 0.7986715529088411
Confusion Matrix:
[[28  0  7  0]
 [ 0  9  0  0]
 [14  0 23  3]
 [ 0  0  0 34]]
Classification Report:
              precision    recall  f1-score   support

           1       0.67      0.80      0.73        35
           2       1.00      1.00      1.00         9
           3      