In [14]:
import pandas as pd

# Load your data
df = pd.read_csv("synthetic.csv")

# Combine first 10 and last 10 rows
preview_df = pd.concat([df.head(10), df.tail(10)])

# Set display options for full horizontal view
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_colwidth', None)

# Save output to text file
with open("table_preview.txt", "w", encoding="utf-8") as f:
    f.write(preview_df.to_string(index=True))

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# === Step 1: Split the data (Example using 70/30 split) ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# === Step 2: Scale features to range [0, 1] for correlation ===
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Step 3: Convert to DataFrame and attach target for correlation ===
X_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_df['target'] = y_train.values if hasattr(y_train, 'values') else y_train

# === Step 4: Calculate absolute Pearson correlation between features and target ===
correlations = X_df.corr()['target'].drop('target').abs()

# === Step 5: Select top 10 most correlated features ===
top_k = correlations.sort_values(ascending=False).head(10).index.tolist()

# === Step 6: Select those features from the scaled data ===
X_train_selected = X_train_scaled[:, X.columns.get_indexer(top_k)]
X_test_selected = X_test_scaled[:, X.columns.get_indexer(top_k)]

# === Display selected features and their correlation scores ===
print("Top 10 Correlated Features with Target:")
print(correlations[top_k])

In [None]:
# Convert to DataFrame and attach target for correlation ===
X_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_df['target'] = y_train.values if hasattr(y_train, 'values') else y_train

# Calculate absolute Pearson correlation between features and target ===
correlations = X_df.corr()['target'].drop('target').abs()

# Select top 10 most correlated features ===
top_k = correlations.sort_values(ascending=False).head(10).index.tolist()

# Select those features from the scaled data ===
X_train_selected = X_train_scaled[:, X.columns.get_indexer(top_k)]
X_test_selected = X_test_scaled[:, X.columns.get_indexer(top_k)]

# Display selected features and their correlation scores ===
print("Top 10 Correlated Features with Target:")
print(correlations[top_k])

In [None]:
# Handling missing values using pandas
import pandas as pd

df = pd.read_csv("nyc.csv")
print("Missing values per column before:")
print(df.isnull().sum())

# Drop rows with missing values
df_cleaned = df.dropna()


print("Missing values after handling:")
print(df.isnull().sum())


Missing values per column before:
Year                         0
Borough                      0
Gender                       0
Age                          0
Race                         0
HIV_diagnosed                0
Concurrent_diagnosed         0
AIDS_diagnosed               0
Death_Status                 0
Poverty_Level                0
Transmission_Category        0
Education_Level           3768
Linked_to_Care_3mo           0
Housing_Status               0
Employment_Status            0
Substance_Use            15889
dtype: int64
Missing values after handling:
Year                         0
Borough                      0
Gender                       0
Age                          0
Race                         0
HIV_diagnosed                0
Concurrent_diagnosed         0
AIDS_diagnosed               0
Death_Status                 0
Poverty_Level                0
Transmission_Category        0
Education_Level           3768
Linked_to_Care_3mo           0
Housing_Status         

In [None]:
# Handling inconsistent data

# 1. Standardize 'Gender'
df['Gender'] = df['Gender'].str.lower().str.strip().replace({
    'male': 'Male', 'm': 'Male',
    'female': 'Female', 'f': 'Female'
})

# 2. Standardize 'Transmission Category'
df['Transmission Category'] = df['Transmission Category'].str.lower().str.strip().replace({
    'heterosexual contact': 'Heterosexual',
    'hetero contact': 'Heterosexual',
    'msm': 'Men who have sex with men',
    'msm contact': 'Men who have sex with men',
    'idu': 'Injection drug use',
    'idus': 'Injection drug use'
})

# 3. Clean 'Education Level'
df['Education Level'] = df['Education Level'].str.lower().str.strip().replace({
    'highschool': 'high school',
    'primary': 'primary school',
    'bachelor': 'bachelor\'s',
    'bachelors': 'bachelor\'s'
})

# 4. Convert 'Poverty Level' to numeric
df['Poverty Level'] = df['Poverty Level'].astype(str).str.replace(',', '').str.strip()
df['Poverty Level'] = pd.to_numeric(df['Poverty Level'], errors='coerce')

# 5. Normalize boolean column
df['Linked to Care Within 3 Months'] = df['Linked to Care Within 3 Months'].astype(str).str.lower().str.strip()
df['Linked to Care Within 3 Months'] = df['Linked to Care Within 3 Months'].map({
    'yes': True, 'no': False, 'true': True, 'false': False
})

In [None]:
# List of categorical columns to encode
categorical_cols = [
    'Gender',
    'Race',
    'Transmission_Category',
    'Education_Level',
    'Housing_Status',
    'Employment_Status',
    'Substance_Use',
    'Borough',
    'Concurrent_diagnosed',
    'Death_Status'
]

# One-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("Shape after encoding:", df_encoded.shape)
df_encoded.head()





In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split first
X = df_encoded.drop('AIDS_diagnosed', axis=1)
y = df_encoded['AIDS_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop('AIDS_diagnosed', axis=1)
y = df_encoded['AIDS_diagnosed']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42
)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

selector = SelectKBest(score_func=corr, k='all')
selector.fit(X_train_resampled, y_train_resampled)

chi2_scores = pd.DataFrame({
    'Feature': X_train_resampled.columns,
    'Chi2 Score': selector.scores_,
    'P-Value': selector.pvalues_
}).sort_values(by='Chi2 Score', ascending=False)

print(chi2_scores)