In [3]:
import pandas as pd
import numpy as np

# Define column names for the Hepatitis dataset
columns = ['class', 'age', 'sex', 'steroid', 'antivirals', 'fatigue', 'malaise', 'anorexia',
           'liver_big', 'liver_firm', 'spleen_palpable', 'spiders', 'ascites', 'varices',
           'bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime', 'histology']

# Load the dataset
df = pd.read_csv('hepatitis.csv', header=None, names=columns)

# Replace '?' with NaN for proper handling
df.replace('?', np.nan, inplace=True)

# Convert relevant columns to numeric
numeric_cols = ['age', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# a. Create data subsets for different sex (1: male, 2: female)
male_subset = df[df['sex'] == 1]
female_subset = df[df['sex'] == 2]

# b. Merge two subsets (male and female)
merged_subsets = pd.concat([male_subset, female_subset], axis=0)

# c. Sort Data using age, SGOT, and protime
# Handle NaN values in sorting columns by filling them with a large value to sort them last
sorted_df = df.sort_values(by=['age', 'sgot', 'protime'],
                          na_position='last',
                          ascending=[True, True, True])

# d. Transposing Data
transposed_df = df.transpose()

# e. Melting Data to long format
melted_df = pd.melt(df, id_vars=['sex'],
                    value_vars=['age', 'sgot', 'protime', 'bilirubin', 'albumin'],
                    var_name='measurement', value_name='value')

# f. Casting data to wide format
wide_df = melted_df.pivot_table(index='sex', columns='measurement', values='value', aggfunc='mean')
wide_df = wide_df.reset_index()

# Print results to verify
print("Male Subset:\n", male_subset.head())
print("Female Subset:\n", female_subset.head())
print("Merged Subsets (Male and Female):\n", merged_subsets.head())
print("Sorted by Age, SGOT, Protime:\n", sorted_df.head())
print("Transposed DataFrame:\n", transposed_df)
print("Melted DataFrame:\n", melted_df.head())
print("Wide Format DataFrame:\n", wide_df)

Male Subset:
    class  age  sex steroid  antivirals fatigue malaise anorexia liver_big  \
1      2   50    1       1           2       1       2        2         1   
2      2   78    1       2           2       1       2        2         2   
3      2   31    1     NaN           1       2       2        2         2   
4      2   34    1       2           2       2       2        2         2   
5      2   34    1       2           2       2       2        2         2   

  liver_firm spleen_palpable spiders ascites varices  bilirubin  \
1          2               2       2       2       2        0.9   
2          2               2       2       2       2        0.7   
3          2               2       2       2       2        0.7   
4          2               2       2       2       2        1.0   
5          2               2       2       2       2        0.9   

   alk_phosphate   sgot  albumin  protime  histology  
1          135.0   42.0      3.5      NaN          1  
2         