<a href="https://colab.research.google.com/github/shivamsinghtomar78/ML-Projects-/blob/main/liver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
 !pip install xgboost



In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [46]:
# loading the csv data to a Pandas DataFrame
data = pd.read_csv('/content/liver.csv')

In [47]:
# Basic preprocessing
data['is_patient'] = data['is_patient'].replace({2: 0})

In [48]:
# Select important features
features = [
    'tot_bilirubin',
    'direct_bilirubin',
    'sgpt',
    'sgot',
    'alkphos',
    'albumin',
    'ag_ratio',
    'tot_proteins'
]

In [49]:
X = data[features]
y = data['is_patient']

In [50]:
# Split the data first (to prevent data leakage)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [51]:
# Create and fit imputer for missing values
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [52]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [53]:
# Dictionary to store models
models = {
    'SVM': SVC(
        kernel='rbf',
        C=10,
        gamma='scale',
        class_weight='balanced',
        random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=200,
        max_depth=6,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        min_samples_split=5,
        random_state=42
    ),
    'XGBoost': XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        scale_pos_weight=2,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )
}

In [54]:

# Train and evaluate each model
results = {}
for name, model in models.items():
    # Train model
    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    # Print results
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


SVM Results:
Accuracy: 0.72

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.94      0.66        34
           1       0.96      0.63      0.76        83

    accuracy                           0.72       117
   macro avg       0.74      0.78      0.71       117
weighted avg       0.83      0.72      0.73       117


Random Forest Results:
Accuracy: 0.73

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.71      0.60        34
           1       0.86      0.73      0.79        83

    accuracy                           0.73       117
   macro avg       0.69      0.72      0.70       117
weighted avg       0.76      0.73      0.74       117


Gradient Boosting Results:
Accuracy: 0.71

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.26      0.35        34
           1       0.75      0.89      0.81        83


Parameters: { "use_label_encoder" } are not used.




XGBoost Results:
Accuracy: 0.69

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.06      0.10        34
           1       0.71      0.95      0.81        83

    accuracy                           0.69       117
   macro avg       0.52      0.51      0.46       117
weighted avg       0.60      0.69      0.61       117



In [55]:
# Find best model
best_model = max(results.items(), key=lambda x: x[1])
print(f"\nBest performing model: {best_model[0]} with accuracy: {best_model[1]:.2f}")

# Print feature importance for Random Forest
rf_model = models['Random Forest']
importance = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_
})
print("\nFeature Importance (Random Forest):")
print(importance.sort_values('importance', ascending=False))

# Print dataset information
print("\nDataset Information:")
print(f"Total samples: {len(X)}")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")


Best performing model: Random Forest with accuracy: 0.73

Feature Importance (Random Forest):
            feature  importance
7      tot_proteins    0.174808
5           albumin    0.161123
0     tot_bilirubin    0.150104
6          ag_ratio    0.145571
1  direct_bilirubin    0.144078
3              sgot    0.085725
2              sgpt    0.074194
4           alkphos    0.064398

Dataset Information:
Total samples: 583
Training samples: 466
Testing samples: 117


In [56]:
import pickle

In [57]:
filename = 'liver_model.pkl'

In [58]:
pickle.dump(model, open(filename, 'wb'))