In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np
from scipy.stats import zscore

In [77]:
df=pd.read_csv('hepatitis_csv.csv')

df

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135.0,42.0,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96.0,32.0,4.0,,False,live
3,31,female,,True,False,False,False,True,False,False,False,False,False,0.7,46.0,52.0,4.0,80.0,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200.0,4.0,,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,,242.0,3.3,50.0,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126.0,142.0,4.3,,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75.0,20.0,4.1,,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81.0,19.0,4.1,48.0,True,live


In [78]:
df.dtypes
(df == '?').sum().sum()
df.isna().sum().sum()

167

In [79]:
# a. Data cleaning(Remove NA, ?, Negative values etc.)  

print("After replacing '?':", df.shape)
df.dropna(inplace=True)
print("After dropna:", df.shape)

df = df.apply(pd.to_numeric, errors='coerce')
print("After to_numeric:", df.shape)

never_negative_cols = ['age', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime']
df = df[(df[never_negative_cols] >= 0).all(axis=1)]
print("After removing negatives:", df.shape)



# df.replace('?', np.nan, inplace=True)
# display(df)
# df.dropna(inplace=True)

display(df)

After replacing '?': (155, 20)
After dropna: (80, 20)
After to_numeric: (80, 20)
After removing negatives: (80, 20)


Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
5,34,,True,False,False,False,False,True,False,False,False,False,False,0.9,95.0,28.0,4.0,75.0,False,
10,39,,False,True,False,False,False,False,True,False,False,False,False,1.3,78.0,30.0,4.4,85.0,False,
11,32,,True,True,True,False,False,True,True,False,True,False,False,1.0,59.0,249.0,3.7,54.0,False,
12,41,,True,True,True,False,False,True,True,False,False,False,False,0.9,81.0,60.0,3.9,52.0,False,
13,30,,True,False,True,False,False,True,True,False,False,False,False,2.2,57.0,144.0,4.9,78.0,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,45,,True,True,False,False,False,True,False,False,False,False,False,1.3,85.0,44.0,4.2,85.0,True,
143,49,,False,False,True,True,False,True,False,True,True,False,False,1.4,85.0,70.0,3.5,35.0,True,
145,31,,False,False,True,False,False,True,False,False,False,False,False,1.2,75.0,173.0,4.2,54.0,True,
153,53,,False,False,True,False,False,True,False,True,True,False,True,1.5,81.0,19.0,4.1,48.0,True,


In [80]:
# b. Error correcting(Outlier detection and removal)

Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1
outlier_mask = ((df[num_cols] < (Q1 - 1.5 * IQR)) | (df[num_cols] > (Q3 + 1.5 * IQR)))
df = df[~outlier_mask.any(axis=1)]  # Keep rows that are not outliers


In [81]:
# c. Data transformation 

df['class'] = df['class'].map({'live': 1, 'die': 0})
df = pd.get_dummies(df, drop_first=True)

scaler = StandardScaler()

# Separate features (exclude target variable) and target variable
features = df.drop(['class'], axis=1)
features_scaled = scaler.fit_transform(features)  # Scale features

from sklearn.impute import SimpleImputer
# Handle missing values (if any)
imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'most_frequent' based on your data
features_scaled = imputer.fit_transform(features_scaled)  # Impute missing values in scaled data

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [82]:
# d. Build Data model using regression and Naïve Bayes methods and compare accuracy of prediction
X = features_scaled  # Scaled features
y = df['class']  # Target variable


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Logistic Regression
lr=LogisticRegression()
lr.fit(X_train,y_train)
lr_pred=lr.predict(X_test)

# Naive Bayes
nb=GaussianNB()
nb.fit(X_train,y_train)
nb_pred=nb.predict(X_test)

print('Accuracy of Logistic Regression:', accuracy_score(y_test,lr_pred))
print('Accuracy of Naive Bayes:', accuracy_score(y_test,nb_pred))


ValueError: Input y contains NaN.

In [86]:
import pandas as pd
import numpy as np

# Load the dataset (Assuming the file is in CSV format)
df = pd.read_csv('hepatitis.csv')

# Check for missing values
print(df.isnull().sum())

# Replace "?" with NaN (if needed)
df.replace('?', np.nan, inplace=True)

# Remove rows with NaN values
df.dropna(inplace=True)

# Remove rows with negative values (if applicable)
# Assuming numeric columns that should not have negative values
numeric_columns = df.select_dtypes(include=[np.number]).columns
df = df[(df[numeric_columns] >= 0).all(axis=1)]

# Check the cleaned data
display(df.head())


age                 0
sex                 0
steroid             1
antivirals          0
fatigue             1
malaise             1
anorexia            1
liver_big          10
liver_firm         11
spleen_palpable     5
spiders             5
ascites             5
varices             5
bilirubin           6
alk_phosphate      29
sgot                4
albumin            16
protime            67
histology           0
class               0
dtype: int64


Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
5,34,female,True,False,False,False,False,True,False,False,False,False,False,0.9,95.0,28.0,4.0,75.0,False,live
10,39,female,False,True,False,False,False,False,True,False,False,False,False,1.3,78.0,30.0,4.4,85.0,False,live
11,32,female,True,True,True,False,False,True,True,False,True,False,False,1.0,59.0,249.0,3.7,54.0,False,live
12,41,female,True,True,True,False,False,True,True,False,False,False,False,0.9,81.0,60.0,3.9,52.0,False,live
13,30,female,True,False,True,False,False,True,True,False,False,False,False,2.2,57.0,144.0,4.9,78.0,False,live


In [85]:
from scipy import stats

# Remove outliers using Z-Score method
z_scores = np.abs(stats.zscore(df[numeric_columns]))
df_cleaned = df[(z_scores < 3).all(axis=1)]

# Check the data after outlier removal
display(df_cleaned.head())


Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
5,34,female,True,False,False,False,False,True,False,False,False,False,False,0.9,95.0,28.0,4.0,75.0,False,live
10,39,female,False,True,False,False,False,False,True,False,False,False,False,1.3,78.0,30.0,4.4,85.0,False,live
11,32,female,True,True,True,False,False,True,True,False,True,False,False,1.0,59.0,249.0,3.7,54.0,False,live
12,41,female,True,True,True,False,False,True,True,False,False,False,False,0.9,81.0,60.0,3.9,52.0,False,live
13,30,female,True,False,True,False,False,True,True,False,False,False,False,2.2,57.0,144.0,4.9,78.0,False,live


In [88]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode categorical columns (e.g., `class` for 'DIE' or 'LIVE')
label_encoder = LabelEncoder()
df_cleaned['class'] = label_encoder.fit_transform(df_cleaned['class'])

# Scale the numeric features
scaler = StandardScaler()
df_cleaned[numeric_columns] = scaler.fit_transform(df_cleaned[numeric_columns])

# Check the transformed data
display(df_cleaned.head())


Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
5,-0.559582,female,True,False,False,False,False,True,False,False,False,False,False,-0.286348,-0.063141,-0.820889,0.192428,0.461539,False,1
10,-0.11623,female,False,True,False,False,False,False,True,False,False,False,False,0.463731,-0.414608,-0.785887,0.941884,0.892232,False,1
11,-0.736922,female,True,True,True,False,False,True,True,False,True,False,False,-0.098828,-0.807423,3.046821,-0.369664,-0.442915,False,1
12,0.061111,female,True,True,True,False,False,True,True,False,False,False,False,-0.286348,-0.352584,-0.260859,0.005064,-0.529053,False,1
13,-0.914263,female,True,False,True,False,False,True,True,False,False,False,False,2.151409,-0.848772,1.209221,1.878704,0.590747,False,1


In [89]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split data into features and target
X = df_cleaned.drop('class', axis=1)
y = df_cleaned['class']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression (for classification)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predict and evaluate accuracy
y_pred_logreg = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")


ValueError: could not convert string to float: 'female'