In [2]:
import pandas as pd
import numpy as np

# Load your data
df = pd.read_csv('nyc.csv')
df.head()

Unnamed: 0,Year,Borough,Gender,Age,Race,HIV_diagnosed,Concurrent_diagnosed,AIDS_diagnosed,Death_Status,Poverty_Level,Transmission_Category,Education_Level,Linked_to_Care_3mo,Housing_Status,Employment_Status,Substance_Use
0,2020,Bronx,Male,44,Hispanic,True,Hepatitis B,False,Alive,Medium,MSM,Primary,True,Housed,Informal,Alcohol
1,2012,Bronx,Transgender,64,Other,True,Gonorrhea,True,Deceased,High,MSM,Primary,True,Homeless,Retired,
2,2010,Bronx,Male,54,White,True,"Gonorrhea, Tuberculosis, Hepatitis C",False,Alive,Medium,MSM,Primary,True,Housed,Employed,Alcohol
3,2013,Staten Island,Female,32,White,True,"Gonorrhea, Hepatitis C",False,Alive,Low,Heterosexual,Primary,True,Housed,Retired,Both
4,2022,Bronx,Male,70,Asian,True,"Hepatitis C, Hepatitis B",False,Alive,Medium,Heterosexual,Primary,True,Housed,Retired,


In [None]:
def handle_missing_values(df, strategy='mean', fill_value=None):
    """
    Handles missing values in a DataFrame.
    strategy: 'mean', 'median', 'mode', or 'constant'
    fill_value: value to use if strategy is 'constant'
    """
    df_cleaned = df.copy()
    for col in df_cleaned.columns:
        if df_cleaned[col].isnull().sum() > 0:
            if strategy == 'mean' and df_cleaned[col].dtype in [np.float64, np.int64]:
                df_cleaned[col].fillna(df_cleaned[col].mean(), inplace=True)
            elif strategy == 'median' and df_cleaned[col].dtype in [np.float64, np.int64]:
                df_cleaned[col].fillna(df_cleaned[col].median(), inplace=True)
            elif strategy == 'mode':
                df_cleaned[col].fillna(df_cleaned[col].mode()[0], inplace=True)
            elif strategy == 'constant':
                df_cleaned[col].fillna(fill_value, inplace=True)
    return df_cleaned

df = handle_missing_values(df, strategy='mean')

In [4]:
def handle_inconsistent_data(df, column, valid_values):
    """
    Replaces values not in valid_values with np.nan in the specified column.
    """
    df_cleaned = df.copy()
    df_cleaned[column] = df_cleaned[column].apply(lambda x: x if x in valid_values else np.nan)
    return df_cleaned


valid_genders = ['Male', 'Female', 'Transgender']
df = handle_inconsistent_data(df, 'Gender', valid_genders)

In [5]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical(df, columns, encoding_type='onehot'):
    """
    Encodes categorical columns using label or one-hot encoding.
    encoding_type: 'label' or 'onehot'
    """
    df_encoded = df.copy()
    if encoding_type == 'label':
        le = LabelEncoder()
        for col in columns:
            df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
    elif encoding_type == 'onehot':
        df_encoded = pd.get_dummies(df_encoded, columns=columns)
    return df_encoded

# Example usage:
categorical_columns = [
    'Borough', 'Gender', 'Race', 'Transmission_Category', 
    'Education_Level', 'Housing_Status', 'Employment_Status', 'Substance_Use'
]
df = encode_categorical(df, categorical_columns, encoding_type='onehot')

In [8]:
from imblearn.over_sampling import SMOTE

# Check for non-numeric columns and encode if necessary
non_numeric_cols = X.select_dtypes(include=['object']).columns
if len(non_numeric_cols) > 0:
    X = pd.get_dummies(X, columns=non_numeric_cols)

# Now run SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

print("After SMOTE, X shape:", X_res.shape)
print("After SMOTE, y value counts:\n", pd.Series(y_res).value_counts())

ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead