In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the raw meningitis dataset from the project-relative data directory.
df = pd.read_csv(filepath_or_buffer='data/meningitis_data.csv')

# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Columns retained for modelling: demographics, CSF lab values, clinical
# signs, vaccination history and the label (see per-column notes below).
selected_columns = [
    'age',                      # Patient age
    'sexofthepatient',          # Gender
    'whitecellcount',           # CSF WBC count
    'csfproteinresult',         # CSF protein level
    'csfglucoseresult',         # CSF glucose level
    'csf_gram_resul',           # Gram stain result
    'csfcultureresults',        # CSF culture result
    'presenceofseizure',        # Seizure presence
    'neckstiffness',            # Neck stiffness
    'historyoffever',           # Fever history
    'alteredconsciousness',     # Consciousness level
    'bulgingfontanel',          # Fontanel bulging (infants)
    'mening_received',          # Meningitis vaccine received
    'hibvaccine',               # Hib vaccine received
    'pneumoduringcampaign',     # Pneumococcal vaccine during campaign
    'target'                    # Label: bacterial meningitis (1 = yes, 0 = no)
]

# Take an explicit copy: a bare df[selected_columns] is tracked by pandas as a
# selection of the raw frame, so column assignments in later cells can raise
# SettingWithCopyWarning. The copy makes df an independent frame.
df = df[selected_columns].copy()
df.head()

In [None]:
# Dimensions of the working frame as (n_rows, n_columns).
df.shape

In [None]:
# Impute missing ages with the column mean.
# The result is assigned back rather than calling fillna(inplace=True) on the
# df['age'] selection: that chained form is deprecated in pandas 2.x and does
# not modify df once Copy-on-Write is enabled.
df['age'] = df['age'].fillna(df['age'].mean())

In [None]:
# Missing sex values default to 'M'.
# NOTE(review): 'M' is hard-coded; presumably the majority category - confirm.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['sexofthepatient'] = df['sexofthepatient'].fillna('M')

In [None]:
# Impute missing CSF white-cell counts with the column mean.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['whitecellcount'] = df['whitecellcount'].fillna(df['whitecellcount'].mean())

In [None]:
# Impute missing CSF protein results with the column mean.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['csfproteinresult'] = df['csfproteinresult'].fillna(df['csfproteinresult'].mean())

In [None]:
# Impute missing CSF glucose results with the column mean.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['csfglucoseresult'] = df['csfglucoseresult'].fillna(df['csfglucoseresult'].mean())

In [None]:
# Impute missing gram-stain results with the most frequent value.
# Series.mode() returns a Series (there can be ties), and passing a Series to
# fillna aligns on the index, so only rows whose index label matches a mode
# index (typically just label 0) would be filled. Use the first modal value as
# a scalar so every missing entry is filled.
gram_modes = df['csf_gram_resul'].mode()

# mode() is empty when the column holds no non-null values; in that case there
# is nothing sensible to impute with, so the column is left unchanged.
if not gram_modes.empty:
    # Assigned back instead of chained fillna(inplace=True), which is
    # deprecated and does not modify df under pandas Copy-on-Write.
    df['csf_gram_resul'] = df['csf_gram_resul'].fillna(gram_modes.iloc[0])

In [None]:
# Show the first five rows to inspect the frame at this point.
df.head(n=5)

In [None]:
# Impute missing CSF culture results with the column mean.
# NOTE(review): this looks like a coded/categorical result; if so the mean is
# a fractional value and the mode may be more appropriate - confirm.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['csfcultureresults'] = df['csfcultureresults'].fillna(df['csfcultureresults'].mean())

In [None]:
# Impute missing seizure indicators with the column mean.
# NOTE(review): if this is a 0/1 indicator the mean is a fractional value;
# the mode may be more appropriate - confirm.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['presenceofseizure'] = df['presenceofseizure'].fillna(df['presenceofseizure'].mean())

In [None]:
# Impute missing neck-stiffness indicators with the column mean.
# NOTE(review): if this is a 0/1 indicator the mean is a fractional value;
# the mode may be more appropriate - confirm.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['neckstiffness'] = df['neckstiffness'].fillna(df['neckstiffness'].mean())

In [None]:
# Impute missing fever-history indicators with the column mean.
# NOTE(review): if this is a 0/1 indicator the mean is a fractional value;
# the mode may be more appropriate - confirm.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['historyoffever'] = df['historyoffever'].fillna(df['historyoffever'].mean())

In [None]:
# Impute missing altered-consciousness values with the column mean.
# NOTE(review): if this is a 0/1 indicator the mean is a fractional value;
# the mode may be more appropriate - confirm.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['alteredconsciousness'] = df['alteredconsciousness'].fillna(df['alteredconsciousness'].mean())

In [None]:
# Impute missing bulging-fontanel values with the column mean.
# NOTE(review): if this is a 0/1 indicator the mean is a fractional value;
# the mode may be more appropriate - confirm.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['bulgingfontanel'] = df['bulgingfontanel'].fillna(df['bulgingfontanel'].mean())

In [None]:
# Impute missing meningitis-vaccine values with the column mean.
# NOTE(review): if this is a 0/1 indicator the mean is a fractional value;
# the mode may be more appropriate - confirm.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['mening_received'] = df['mening_received'].fillna(df['mening_received'].mean())

In [None]:
# Show the first five rows to inspect the frame at this point.
df.head(n=5)

In [None]:
# Impute missing Hib-vaccine values with the column mean.
# NOTE(review): if this is a 0/1 indicator the mean is a fractional value;
# the mode may be more appropriate - confirm.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['hibvaccine'] = df['hibvaccine'].fillna(df['hibvaccine'].mean())

In [None]:
# Impute missing pneumococcal-campaign values with the column mean.
# NOTE(review): if this is a 0/1 indicator the mean is a fractional value;
# the mode may be more appropriate - confirm.
# Assigned back instead of chained fillna(inplace=True), which is deprecated
# and does not modify df under pandas Copy-on-Write.
df['pneumoduringcampaign'] = df['pneumoduringcampaign'].fillna(df['pneumoduringcampaign'].mean())

In [None]:
# Encode sex as an integer feature: 'F' -> 0, 'M' -> 1. Any label outside the
# mapping becomes NaN (Series.map behaviour for unmatched keys).
sex_codes = {'F': 0, 'M': 1}

# Only encode while raw labels are still present. Mapping an already-encoded
# column a second time would turn every value into NaN, so this guard keeps
# the cell safe to re-run.
if df['sexofthepatient'].isin(list(sex_codes)).any():
    df['sexofthepatient'] = df['sexofthepatient'].map(sex_codes)

In [None]:
# Count rows whose label is missing (isnull is the pandas alias of isna).
df['target'].isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the label from the predictors: y is the 'target' column and
# X is every remaining column.
y = df['target']
X = df.drop('target', axis=1)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible. Stratifying on the label keeps the class proportions the same
# in the train and test partitions, so the held-out metrics are not skewed by
# an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# class_weight='balanced' weights classes inversely to their frequency in
# y_train. random_state fixes the bootstrap samples and per-split feature
# sub-sampling so the fitted forest, its metrics and its feature importances
# are reproducible across runs.
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

# Predict labels for the held-out rows, then print the per-class
# precision / recall / F1 summary.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Pair each fitted importance score with its feature (column) name.
feature_importance = pd.Series(data=model.feature_importances_, index=X.columns)

# Horizontal bar chart of the ten highest-scoring features.
feature_importance.nlargest(n=10).plot.barh()
plt.title("Top 10 Feature Importances")
plt.show()


In [None]:
# Disabled: persist the fitted model to disk with joblib.
# Uncomment both lines to write 'meningitis_model.pkl' to the working directory.
# import joblib
# joblib.dump(model, 'meningitis_model.pkl')

In [None]:
# Disabled: export of the cleaned frame, followed by a hyper-parameter grid
# search over RandomForestClassifier settings (5-fold CV) and a re-evaluation
# on the held-out split.
# NOTE(review): GridSearchCV is given no `scoring` argument here, so it would
# use the estimator's default score - confirm that is the intended selection
# metric before enabling.
# df.to_csv('meningitis_data_cleaned.csv', index=False)
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2],
#     'class_weight': ['balanced']
# }

# grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, n_jobs=-1)
# grid.fit(X_train, y_train)

# print("Best Parameters:", grid.best_params_)
# print("Best Score:", grid.best_score_)

# y_pred = grid.predict(X_test)
# print(classification_report(y_test, y_pred))