In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import joblib
import numpy as np

In [5]:
df = pd.read_csv("pp_Typhoid_data.csv")

In [6]:
df = df.drop(columns=['Age', 'Gastrointestinal Symptoms','Neurological Symptoms', 'Complications', 'Ongoing Infection in Society', 'Gender','Widal Test'])

KeyError: "['Age', 'Gastrointestinal Symptoms', 'Neurological Symptoms', 'Complications', 'Ongoing Infection in Society', 'Gender', 'Widal Test'] not found in axis"

In [4]:
df.to_csv('pp_Typhoid_data.csv', index=False)
from google.colab import files
files.download('pp_Typhoid_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
X = df.drop('Typhoid Status', axis=1)
y = df['Typhoid Status']

In [8]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [9]:
categorical_cols = ['Previous History of Typhoid','Typhoid Vaccination Status','Blood Culture Result',
                   'Skin Manifestations','Sanitation Facilities','Hand Hygiene','Consumption of Street Food',
                   'Location', 'Socioeconomic Status', 'Water Source Type', 'Typhidot Test','Weather Condition']
numerical_cols = ['Fever Duration (Days)', 'White Blood Cell Count', 'Platelet Count']

In [10]:
# Manual preprocessing instead of ColumnTransformer
def manual_preprocess(X_train, X_test):
    """
    Manual preprocessing to avoid ColumnTransformer compatibility issues
    """
    # Handle numerical columns
    num_imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()
    
    # Handle categorical columns
    cat_imputer = SimpleImputer(strategy='most_frequent')
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    
    # Process numerical columns
    X_train_num = num_imputer.fit_transform(X_train[numerical_cols])
    X_test_num = num_imputer.transform(X_test[numerical_cols])
    
    X_train_num_scaled = scaler.fit_transform(X_train_num)
    X_test_num_scaled = scaler.transform(X_test_num)
    
    # Process categorical columns
    X_train_cat = cat_imputer.fit_transform(X_train[categorical_cols])
    X_test_cat = cat_imputer.transform(X_test[categorical_cols])
    
    X_train_cat_encoded = encoder.fit_transform(X_train_cat)
    X_test_cat_encoded = encoder.transform(X_test_cat)
    
    # Combine numerical and categorical features
    X_train_processed = np.hstack([X_train_num_scaled, X_train_cat_encoded])
    X_test_processed = np.hstack([X_test_num_scaled, X_test_cat_encoded])
    
    return X_train_processed, X_test_processed, scaler, encoder, num_imputer, cat_imputer

# Save preprocessors for later use
def save_preprocessors(scaler, encoder, num_imputer, cat_imputer):
    """Save all preprocessors separately"""
    joblib.dump(scaler, 'models/Typhoid_scaler.pkl')
    joblib.dump(encoder, 'models/Typhoid_encoder.pkl')
    joblib.dump(num_imputer, 'models/Typhoid_num_imputer.pkl')
    joblib.dump(cat_imputer, 'models/Typhoid_cat_imputer.pkl')

In [11]:
# Split data first
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Apply manual preprocessing
X_train_processed, X_test_processed, scaler, encoder, num_imputer, cat_imputer = manual_preprocess(X_train, X_test)

# Train model directly on processed data
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
model.fit(X_train_processed, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [14]:
# Save model and preprocessors
joblib.dump(model, 'models/Typhoid_model.pkl')
save_preprocessors(scaler, encoder, num_imputer, cat_imputer)
print("✅ Model and preprocessors saved successfully!")

✅ Model and preprocessors saved successfully!


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31087 entries, 0 to 31086
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Location                     31087 non-null  object
 1   Socioeconomic Status         31087 non-null  object
 2   Water Source Type            31087 non-null  object
 3   Sanitation Facilities        31087 non-null  object
 4   Hand Hygiene                 31087 non-null  object
 5   Consumption of Street Food   31087 non-null  object
 6   Fever Duration (Days)        31087 non-null  int64 
 7   Skin Manifestations          31087 non-null  object
 8   White Blood Cell Count       31087 non-null  int64 
 9   Platelet Count               31087 non-null  int64 
 10  Blood Culture Result         31087 non-null  object
 11  Typhidot Test                31087 non-null  object
 12  Typhoid Vaccination Status   31087 non-null  object
 13  Previous History of Typhoid  31

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


pipeline.fit(X_train, y_train)


y_pred = pipeline.predict(X_test)



In [13]:
# Make predictions
y_pred = model.predict(X_test_processed)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9197491154712126
Confusion Matrix:
 [[1085    0    0    0]
 [   0  253    0    0]
 [   0    0 4381    0]
 [ 499    0    0    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.68      1.00      0.81      1085
           1       1.00      1.00      1.00       253
           2       1.00      1.00      1.00      4381
           3       0.00      0.00      0.00       499

    accuracy                           0.92      6218
   macro avg       0.67      0.75      0.70      6218
weighted avg       0.86      0.92      0.89      6218



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
