# Data Mining Project II

## Imports

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from collections import Counter
from enum import IntEnum
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error, accuracy_score, classification_report
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.cluster import KMeans
from kneed import KneeLocator
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import chi2_contingency

## Data preparation

In [2]:
attributes = {
    "WEIGHTLBTC_A": "weight", #pounds
    "HEIGHTTC_A": "height", #inches
    "PREGNOW_A": "pregnant",
    "AGEP_A": "age",
    "SEX_A": "gender",
    "EDUCP_A": "education",
    "MARITAL_A": "single",
    "HOUTENURE_A": "owns_home",
    "PARSTAT_A": "parent",
    "DEPFREQ_A": "depression",
    "ANXFREQ_A": "anxiety",
    "POVRATTC_A": "poverty",
    "PHSTAT_A": "health_status",
    "LSATIS4_A": "life_sat",
    #"DRK12MN_A": "alcohol",
    #"SLPHOURS_A": "sleep",
    "NOTCOV_A": "insurance",
    "URBRRL": "living_area",
    "REGION": "region",
    "CANEV_A": "cancer",
    "HYPEV_A": "hypertension",
    "CHLEV_A": "cholesterol",
    "ASEV_A": "asthma",
    "RACEALLP_A": "race",
    "PAYWORRY_A": "medical_bill_worry"
}

dataframes = []
for i in [2, 3]:
    df = pd.read_csv(f"../data/adult2{i}.csv", sep=",")
    dataframes.append(df)
    print(f"YEAR 202{i}\nfeatures: {len(df.columns)}\nentries: {len(df)}")
    for attr in attributes.keys():
        try:
            (f"{attr}: {df[attr].unique()[:(min(10, len(df[attr].unique())))]}")
        except:
            print(f"missing {attr} for 202{i}")
df = pd.concat(dataframes)
df = df.loc[:, list(attributes.keys())]
df.rename(mapper=attributes, inplace=True, axis=1)
list(df.columns)

YEAR 2022
features: 637
entries: 27651
YEAR 2023
features: 647
entries: 29522


['weight',
 'height',
 'pregnant',
 'age',
 'gender',
 'education',
 'single',
 'owns_home',
 'parent',
 'depression',
 'anxiety',
 'poverty',
 'health_status',
 'life_sat',
 'insurance',
 'living_area',
 'region',
 'cancer',
 'hypertension',
 'cholesterol',
 'asthma',
 'race',
 'medical_bill_worry']

In [3]:
df

Unnamed: 0,weight,height,pregnant,age,gender,education,single,owns_home,parent,depression,...,life_sat,insurance,living_area,region,cancer,hypertension,cholesterol,asthma,race,medical_bill_worry
0,148,68,,85,1,4,3,1,3,5,...,2,2,2,3,2,1,2,2,1,3
1,235,74,,64,1,8,1,1,3,5,...,2,2,4,3,1,1,1,2,1,3
2,218,69,2.0,37,2,8,1,1,3,4,...,1,2,4,3,2,2,2,1,1,3
3,240,64,,72,2,5,2,1,3,5,...,1,2,4,3,2,1,2,1,1,3
4,183,66,,84,2,6,3,1,3,5,...,2,2,1,3,2,1,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29517,190,61,,77,2,5,3,1,3,3,...,2,2,4,4,2,1,1,1,1,2
29518,165,65,,59,2,7,1,1,3,4,...,1,2,4,4,2,2,1,2,1,3
29519,220,66,,66,1,8,1,1,3,4,...,2,2,4,4,2,2,2,1,1,2
29520,170,65,,53,2,7,1,1,3,5,...,1,2,4,4,2,1,2,1,1,3


In [4]:
df = df[df["weight"] <= 299]
df = df[df["height"] <= 76]
df = df[((df["pregnant"] == 2) | (df["gender"] == 1))]
df = df[df["age"] <= 84]
df = df[df["gender"] <= 2]
df = df[df["education"] <= 10]
df = df[df["single"] <= 3]
df = df[df["owns_home"] <= 3]
df = df[df["parent"] <= 3]
df = df[df["depression"] <= 5]
df = df[df["anxiety"] <= 5]
df = df[df["poverty"] < 11]
df = df[df["health_status"] < 6]
df = df[df["life_sat"] < 5]
#df = df[df["alcohol"] < 366]
#df = df[df["sleep"] < 25]
df = df[df["insurance"] <= 2]
# living area is good as it is
# region too
df = df[df["cancer"] <= 2]
df = df[df["hypertension"] <= 2]
df = df[df["cholesterol"] <= 2]
df = df[df["asthma"] <= 2]
df = df[df["race"] <= 6]
df = df[df["medical_bill_worry"] <= 3]

In [5]:
# BMI calculation
POUND_TO_KG = 0.453592
INCH_TO_M = 0.0254
df["weight"] = df["weight"] * POUND_TO_KG
df["height"] = df["height"] * INCH_TO_M
df["bmi"] = df["weight"] / df["height"] ** 2

del df["weight"]
del df["height"]
del df["pregnant"]

In [6]:
def get_optimal_cluster_number(df: pd.DataFrame, feature: str, k_number: int) -> int:
    inertia = []
    k_values = np.arange(1, k_number+1)

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(df[[feature]])
        inertia.append(kmeans.inertia_)

    knee_locator = KneeLocator(k_values, inertia, curve='convex', direction='decreasing')
    k = knee_locator.knee


    """fig, ax = plt.subplots(1, figsize=(8, 6))
    ax.plot(k_values, inertia, marker='o')
    ax.set(xlabel='Number of clusters (k)', ylabel='Inertia', title=f'Elbow Method for Optimal k for {feature}')
    ax.axvline(x=elbow_point, c="red")
    ax.set_xticks(k_values) """

    return k, k_values, inertia

def plot_cluster(df: pd.DataFrame, feature:str, k_values: np.ndarray, inertia: np.ndarray, k: int):
    fig, axs = plt.subplots(2, figsize=(8, 6))
    axs[0].plot(k_values, inertia, marker='o')
    axs[0].set(xlabel='Number of clusters (k)', ylabel='Inertia', title=f'Elbow Method for Optimal k for {feature}')
    axs[0].axvline(x=k, c="red")
    axs[0].set_xticks(k_values)

    axs[1].scatter(df[feature], df[f'{feature}_cat'], c=df[f'{feature}_cat'], cmap='viridis')
    axs[1].set_title(f"{feature} Categories using K-Means Clustering")
    axs[1].set_xlabel(feature)
    axs[1].set_ylabel("Category")

    
CONTINUOUS_FEATURES = ["age", "bmi", "poverty"]
for feature in CONTINUOUS_FEATURES:
    k, k_values, inertia = get_optimal_cluster_number(df, feature, 10)
    kmeans = KMeans(n_clusters=k, n_init=5, random_state=42)
    df[f"{feature}_cat"] = kmeans.fit_predict(df[[feature]])

In [7]:
# regrouping

"""
EDUCATION
00 (no edu), 01 (no hs), 02 (no diploma): no_hs
03 (ged), 04 (hs), 05 (no deg): highschool
06 (occupational deg), 07 (academic deg), 08 (bsc): bachelor
09 (msc): masters
10 (phd): phd
"""

"""
MARITAL status:
03: single
01 (married) 2 (cohabiting): not
"""

"""
parent
01: parent
02 (in famility but not) 03 (not in family): not
"""


def invert_encoding_order(df: pd.DataFrame, feature: str) -> None:
    """Reorders the encoding of a categorical variable, 
    so that the lowest values are encoded 0 and higher ones ascending from here."""
    df[feature] = df[feature].max() - df[feature]

def offset_encoding_to_zero(df: pd.DataFrame, feature: str) -> None:
    """Re-encodes `1` and `2` category labels into `0` and `1`"""
    df[feature] = df[feature] - 1


offset_encoding_to_zero(df, "gender") 

class EduLevel(IntEnum):
    NO_HIGHSCHOOL = 0
    HIGHSCHOOL = 1
    BACHELORS = 2
    MASTERS = 3
    PHD = 4
    
df["education"] = df["education"].replace({
    1: EduLevel.NO_HIGHSCHOOL,
    2: EduLevel.NO_HIGHSCHOOL,
    3: EduLevel.HIGHSCHOOL,
    4: EduLevel.HIGHSCHOOL,
    5: EduLevel.HIGHSCHOOL,
    6: EduLevel.BACHELORS,
    7: EduLevel.BACHELORS,
    8: EduLevel.BACHELORS,
    9: EduLevel.MASTERS,
    10: EduLevel.PHD
})

df["single"] = df["single"].replace({
    1: 0,
    2: 0,
    3: 1
})

df["owns_home"] = df["owns_home"].replace({
    1: 1,
    2: 0,
    3: 0,
})

df["parent"] = df["parent"].replace({
    1: 1,
    2: 0,
    3: 0
})


invert_encoding_order(df, "depression")

invert_encoding_order(df, "anxiety")

invert_encoding_order(df, "health_status")

invert_encoding_order(df, "life_sat")

offset_encoding_to_zero(df, "insurance") 

invert_encoding_order(df, "living_area")

offset_encoding_to_zero(df, "region") 

invert_encoding_order(df, "cancer")

invert_encoding_order(df, "hypertension")

invert_encoding_order(df, "cholesterol")

invert_encoding_order(df, "asthma")

offset_encoding_to_zero(df, "race") 

invert_encoding_order(df, "medical_bill_worry")

df["age_cat"] = df["age_cat"].replace({
    1: 0,
    2: 1,
    0: 2,
})

df["bmi_cat"] = df["bmi_cat"].replace({
    2: 0,
    0: 1,
    1: 2,
})

### Drop continuous data

In [8]:
df.drop(CONTINUOUS_FEATURES, inplace=True, axis=1)

## Crosstables

In [9]:
""" target = "anxiety"

for f in df.columns.values:
    tab = pd.crosstab(df[f], df[target], normalize="index") * 100
    plt.figure(figsize=(6, 3))
    sns.heatmap(tab, fmt='.2f', cmap='coolwarm', annot=True)
    plt.show() """

' target = "anxiety"\n\nfor f in df.columns.values:\n    tab = pd.crosstab(df[f], df[target], normalize="index") * 100\n    plt.figure(figsize=(6, 3))\n    sns.heatmap(tab, fmt=\'.2f\', cmap=\'coolwarm\', annot=True)\n    plt.show() '

### Chi-square tests

In [10]:
X = df.drop(["anxiety"], axis=1)
y = df['anxiety']

#chi2_scores, p_values = chi2(X, y)


#Calculate Cramér's V for two categorical variables
def cramers_v(x, y):    
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    return np.sqrt(phi2 / min(r - 1, k - 1))

feature_names = X.columns
chi2_scores = []
p_values = []
cramers_v_values = []

# Calculate Chi-Square, p-value, and Cramér's V for each feature
for feature in feature_names:
    chi2_score, p_value = chi2(X[[feature]], y)
    chi2_scores.append(chi2_score[0])  # chi2() returns an array
    p_values.append(p_value[0])       # chi2() returns an array
    v = cramers_v(X[feature], y)
    cramers_v_values.append(v)


results = pd.DataFrame({
    'feature': feature_names,
    'chi': chi2_scores,
    'p': p_values,
    'cramers_v': cramers_v_values
}).sort_values(by='p', ignore_index=True)

print(results)

alpha = 0.05
cramers = 0.1
significant_features = results[(results["p"] < alpha) & (results["cramers_v"] > cramers)]["feature"].values
insignificant_features =  results[(results["p"] >= alpha) | (results["cramers_v"] <= cramers)]["feature"].values

df.drop(insignificant_features, inplace=True, axis=1)

               feature           chi              p  cramers_v
0           depression  15793.186491   0.000000e+00   0.365245
1              age_cat   1425.727603  1.823056e-307   0.203438
2   medical_bill_worry   1251.239725  1.240938e-269   0.176973
3               gender   1104.225585  9.189134e-238   0.241981
4        health_status    361.803253   4.957452e-77   0.107268
5               asthma    360.400786   9.956869e-77   0.122311
6             life_sat    345.419779   1.709514e-73   0.181856
7            education    198.090764   9.667944e-42   0.065800
8         hypertension    178.677826   1.433765e-37   0.095869
9            owns_home    155.439801   1.389187e-32   0.124100
10         poverty_cat    136.038180   1.989087e-28   0.071849
11              single    119.390361   7.208863e-25   0.087790
12         cholesterol    107.485402   2.501311e-22   0.072131
13              parent     99.974694   9.959406e-21   0.071301
14              cancer     98.369083   2.187737e-20   0

In [11]:
df

Unnamed: 0,gender,owns_home,depression,anxiety,health_status,life_sat,asthma,medical_bill_worry,age_cat
1,0,1,0,1,2,2,0,0,2
2,1,1,1,2,2,3,1,0,1
10,0,1,1,1,2,2,0,0,2
11,0,1,0,0,4,3,0,0,2
15,1,0,4,4,2,1,0,1,0
...,...,...,...,...,...,...,...,...,...
29513,1,0,2,3,2,2,1,1,0
29514,1,1,1,2,4,3,0,1,0
29515,0,1,0,2,3,3,0,0,1
29516,1,1,0,3,2,3,0,2,1


In [12]:
one_hot_columns = [column for column in significant_features if len(df[column].unique()) > 2] 
df = pd.get_dummies(df, columns=one_hot_columns)

## Model training

In [13]:
X = df.drop(["anxiety"], axis=1)
y = df["anxiety"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

""" pipeline = Pipeline([
        ('classifier', RandomForestClassifier(random_state=42))
])

param_distributions = {
    'classifier__n_estimators': [50, 150, 300, 500],  
    'classifier__max_depth': [None, 10, 20, 50],
    'classifier__min_samples_split': [2, 5, 10, 20],    
    'classifier__min_samples_leaf': [1, 2, 5, 10, 20],  
    'classifier__max_features': ['sqrt', 'log2', None, 0.5, 0.75],  
    'classifier__bootstrap': [True, False],             
    'classifier__criterion': ['gini', 'entropy'],       
    'classifier__class_weight': [None, 'balanced', 'balanced_subsample'],  
    'classifier__oob_score': [True, False],             
    'classifier__warm_start': [True, False]        
}

search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=3500,           # Number of random samples
    scoring='f1_macro',  
    cv=3,    
    verbose=0,
    n_jobs=-1
)

search.fit(X_train, y_train)

print("Best Parameters:", search.best_params_)
print("Best Score:", search.best_score_)

best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Analyze feature importances
feature_importances = best_model.named_steps['classifier'].feature_importances_

# Pair each feature importance with the feature name
feature_names = X.columns
feature_importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
print("Feature Importances:\n", feature_importances_df.sort_values(by='Importance', ascending=False))

# Calculate metrics on the training data to check for overfitting
train_pred = best_model.predict(X_train)
train_report = classification_report(y_train, train_pred)
print("Training Classification Report:\n", train_report) """


""" smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train) """

model = RandomForestClassifier(
    bootstrap=True,
    random_state=42,
    class_weight='balanced',
    criterion="gini",
    max_depth=10,
    max_features=0.75,
    min_samples_split=10,
    min_samples_leaf=20,
    n_estimators=500,
    warm_start=False,
    oob_score=True    
)

# model = KNeighborsClassifier(n_neighbors=11 )

""" model = MLPClassifier(
    hidden_layer_sizes=(10, 8),  # Two hidden layers with 5 and 4 neurons
    activation='relu',           # Activation function for hidden layers
    solver='adam',               # Optimizer
    max_iter=250,                # Maximum iterations
    random_state=42,              # For reproducibility
    alpha = 0.0001,
    batch_size = 64,
    learning_rate='adaptive',
    learning_rate_init=0.001  
) """

#model = SVC(kernel="rbf", C=1, gamma="scale", decision_function_shape="ovr")

""" model = XGBClassifier(
    random_state = 42,
    learning_rate=0.2
) """

""" model = HistGradientBoostingClassifier(
    random_state=42,
    max_depth=3, 
    max_iter=400
) """

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# For RandomForestClassifier
feature_importances = model.feature_importances_

# Pair each feature importance with the feature name
feature_names = X.columns
feature_importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
print("Feature Importances:\n", feature_importances_df.sort_values(by='Importance', ascending=False, ignore_index=True))

# Calculate metrics on the training data to check for overfitting
train_pred = model.predict(X_train)
train_report = classification_report(y_train, train_pred)
print("Training Classification Report:\n", train_report)


2571 fits failed out of a total of 10500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2571 fits failed with the following error:
Traceback (most recent call last):
  File "d:\OneDrive\Dokumenti\ITU\III.semester\DAMIN\Assignments Git Repository\datamining-2\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\OneDrive\Dokumenti\ITU\III.semester\DAMIN\Assignments Git Repository\datamining-2\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\OneDrive\Dokumenti\ITU\III.semester\DAMIN\Assignments Git Repository\datami

Best Parameters: {'classifier__warm_start': False, 'classifier__oob_score': True, 'classifier__n_estimators': 500, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 20, 'classifier__max_features': 0.75, 'classifier__max_depth': 10, 'classifier__criterion': 'gini', 'classifier__class_weight': 'balanced', 'classifier__bootstrap': True}
Best Score: 0.40831495437683
              precision    recall  f1-score   support

           0       0.53      0.77      0.63      1591
           1       0.47      0.28      0.35      1650
           2       0.20      0.28      0.23       678
           3       0.34      0.24      0.28       910
           4       0.54      0.50      0.52       789

    accuracy                           0.44      5618
   macro avg       0.42      0.41      0.40      5618
weighted avg       0.44      0.44      0.43      5618

Feature Importances:
                  Feature  Importance
3           depression_0    0.372171
4           depression_1    0.1

' model.fit(X_train, y_train)\ny_pred = model.predict(X_test)\nprint(classification_report(y_true=y_test, y_pred=y_pred))\n\n# For RandomForestClassifier\nfeature_importances = model.feature_importances_\n\n# Pair each feature importance with the feature name\nfeature_names = X.columns\nfeature_importances_df = pd.DataFrame({\'Feature\': feature_names, \'Importance\': feature_importances})\nprint("Feature Importances:\n", feature_importances_df.sort_values(by=\'Importance\', ascending=False, ignore_index=True))\n\n# Calculate metrics on the training data to check for overfitting\ntrain_pred = model.predict(X_train)\ntrain_report = classification_report(y_train, train_pred)\nprint("Training Classification Report:\n", train_report)\n '