In [64]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score,classification_report,roc_curve, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.exceptions import NotFittedError
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import random
from sklearn.decomposition import PCA

In [57]:
class preprocessing:
    def __init__(self,df):
        self.raw_df = df.copy()

    def check_null_values(self, col):
        if df[col].isnull().sum() != 0:
            # Drop na value
            clean_df = self.raw_df.dropna(subset=col)
            self.clean_df = clean_df
            assert clean_df[col].isnull().sum() == 0
        return clean_df
    
    def converted_label(self):
        # Label converting to numeric ##
        label_dict  = {"WES": 0,
                "panel": 1}

        self.clean_df["label"]= self.clean_df["label"].replace(label_dict)

        assert self.clean_df["label"].dtypes == "int64"

        return self.clean_df
    

def split_pt(df, training_ratio):
    mrn_unique = pd.DataFrame(df["EMPI"].unique(), columns=["p_id"])
    p_id_train = mrn_unique.sample(n=int(len(mrn_unique)*training_ratio), replace=False) # 0.7 --> training ratio
    p_id_test = mrn_unique.drop(index=p_id_train.index.values)

    test_df = df.merge(p_id_test, right_on="p_id", left_on="EMPI")
    train_df = df.merge(p_id_train, right_on="p_id", left_on="EMPI")
    
    tf_idf = TfidfVectorizer()
    X_tf_idf_train = tf_idf.fit_transform(train_df["text"])
    X_tf_idf_test = tf_idf.transform(test_df["text"])

    return X_tf_idf_train, X_tf_idf_test, train_df, test_df


def aggregate_by_mean(df,X_tf_idf):
    aggregate_tfidf =  np.array([[]])
    y = []
    r = 0
    for pt in df["EMPI"].unique():
        pt_tfidf = X_tf_idf[df["EMPI"] == pt]
        ## Aggregation: Mean
        y.append(df[df["EMPI"] == pt]["label"].unique()[0])
        if r < 1:
            aggregate_tfidf = np.array(pt_tfidf.mean(axis=0))
        else:
            aggregate_tfidf = np.concatenate((aggregate_tfidf, np.array(pt_tfidf.mean(axis=0))),axis=0)
        r+=1
    y = np.array(y)
    return aggregate_tfidf, y


def aggregate_main(X_tf_idf_train, X_tf_idf_test, train_df, test_df, method):
    if method == "mean":
        aggregate_tfidf_train, y_train = aggregate_by_mean(train_df, X_tf_idf_train)
        aggregate_tfidf_test, y_test = aggregate_by_mean(test_df, X_tf_idf_test)
    return  aggregate_tfidf_train, y_train, aggregate_tfidf_test, y_test




In [33]:
# Loading the dataset
df = pd.read_csv("exported_data/df_withNotes.csv")

# Preprocessing
preprocessor = preprocessing(df)
clean_df = preprocessor.check_null_values("text")
clean_df = preprocessor.converted_label()
clean_df.reset_index(drop=True,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.clean_df["label"]= self.clean_df["label"].replace(label_dict)


In [58]:
X_tf_idf_train, X_tf_idf_test, train_df, test_df = split_pt(clean_df, 0.7)
aggregate_tfidf_train, y_train, aggregate_tfidf_test, y_test = aggregate_main(X_tf_idf_train, X_tf_idf_test, train_df, test_df, "mean")

In [70]:
def model_training(model, param_grid,X_train, y_train, X_test, y_test):
    pipe = Pipeline([
                      ("scaler", StandardScaler(with_mean=False)),
                    ("pca",PCA()),
                      ("clf", RandomForestClassifier())])
    search = GridSearchCV(pipe, param_grid, n_jobs=4, cv=3)
    search.fit(X_train, y_train)
    print(search.best_estimator_)
    print(search.best_params_)
    print(search.best_score_)

    ########### Predict ##################
    y_pred = search.predict(X_test)
    y_pred_prob = search.predict_proba(X_test)

    ########### Evaluation ##################
    performance_dict = {}
    performance_dict["f1"] = f1_score(y_test, y_pred)
    performance_dict["roc_panel"] = roc_auc_score(y_test, y_pred_prob[:,1])
    print(classification_report(y_test, y_pred,target_names=["WES","panel"]))

    ## Evluation metrics: roc, precision recall curve,  

    return performance_dict

param_grid = {"pca__n_components": [2, 10, 20],
            "clf__max_depth":[10, 20, 30, 40, 50, None],
              "clf__n_estimators": [100, 200,300]}

model_training(RandomForestClassifier(),
param_grid,aggregate_tfidf_train, y_train, aggregate_tfidf_test, y_test )


Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
                ('pca', PCA(n_components=10)),
                ('clf',
                 RandomForestClassifier(max_depth=50, n_estimators=200))])
{'clf__max_depth': 50, 'clf__n_estimators': 200, 'pca__n_components': 10}
0.570446735395189
              precision    recall  f1-score   support

         WES       0.56      0.90      0.69        70
       panel       0.50      0.12      0.20        56

    accuracy                           0.56       126
   macro avg       0.53      0.51      0.45       126
weighted avg       0.53      0.56      0.47       126



{'f1': 0.2, 'roc_panel': 0.5228316326530612, 'roc_wes': 0.47716836734693885}