Importing Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn_extra.cluster import KMedoids
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from itertools import islice
from sklearn.metrics.pairwise import euclidean_distances
import random
import warnings
import math
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')
warnings.simplefilter("ignore", category=DeprecationWarning)
from lasso_regression import lasso_train,lasso_test,re_train_lasso
from linear_regression import lr_train,lr_test,re_train_lr
from rf_regression import rf_train,rf_test,re_train_rf
from dt_regression import dt_train,dt_test,re_train_dt
from adaboost_regression import ab_train,ab_test,re_train_ab
from train_test_split_pr import train_test_split_dataset_personalized 

Dividing into chunks

In [2]:
def chunk(seq, size):
    """Split ``seq`` into ``size`` consecutive pieces.

    Note that ``size`` is the *number of pieces*, not the length of each
    piece: it is forwarded unchanged as the second argument of
    ``np.array_split``, which allows pieces of unequal length.

    Returns the list produced by ``np.array_split``.
    """
    return np.array_split(seq, size)

Data Preprocessing

In [3]:
def heart_disease(df):
    """One-hot encode the categorical heart-disease columns.

    The columns ``cp``, ``restecg``, ``slope``, ``ca`` and ``thal`` are cast
    to categoricals and expanded with ``pd.get_dummies(drop_first=True)``;
    the result is then restricted to a fixed column order ending in
    ``target``.

    The input frame is left untouched (the encoding is done on a copy).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the numeric columns listed in the
        final selection plus the five categorical columns above.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns.

    Raises
    ------
    KeyError
        If any expected dummy column is absent, e.g. because a category
        level does not occur in ``df``.
    """
    random.seed(2411)

    categorical_columns = ['cp', 'restecg', 'slope', 'ca', 'thal']

    # Work on a copy: assigning the categorical columns directly onto ``df``
    # would silently change the dtypes of the caller's frame.
    encoded = df.copy()
    for column in categorical_columns:
        encoded[column] = pd.Categorical(encoded[column])

    encoded = pd.get_dummies(encoded, drop_first=True)

    # Fixed selection/order; the first level of every categorical was dropped
    # by drop_first=True, hence no *_0 columns.
    return encoded[['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak',
                    'cp_1', 'cp_2', 'cp_3', 'restecg_1', 'restecg_2', 'slope_1',
                    'slope_2', 'ca_1', 'ca_2', 'ca_3', 'ca_4', 'thal_1', 'thal_2',
                    'thal_3', 'target']]

In [4]:
def breast_cancer(df):
    """Build the feature/label frame for the breast-cancer dataset.

    Drops the ``'Unnamed: 32'`` and ``id`` columns, removes ``diagnosis``
    from the features and re-attaches it as a numeric ``labels`` column
    using the mapping ``'B' -> 0`` and ``'M' -> 1``.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    trimmed = df.drop('Unnamed: 32', axis=1)
    diagnosis_codes = trimmed['diagnosis'].map({'B': 0, 'M': 1})

    features = trimmed.drop('id', axis=1)
    features = features.drop('diagnosis', axis=1)
    features['labels'] = diagnosis_codes
    return features

In [5]:
def liver_disease(df):
    """Scale and encode the liver-patient dataset.

    Steps, in order:

    1. every column except ``Dataset`` and ``Gender`` is treated as a
       numerical feature;
    2. missing ``Albumin_and_Globulin_Ratio`` values are replaced by 0;
    3. the numerical features are standardised with ``StandardScaler``;
    4. ``Gender`` is re-attached and expanded with ``pd.get_dummies``;
    5. ``Dataset`` is re-attached as ``labels`` using the mapping
       ``1 -> 0`` and ``2 -> 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with ``Dataset``, ``Gender`` and
        ``Albumin_and_Globulin_Ratio`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Scaled features, gender dummies and a ``labels`` column, indexed
        like ``df``.
    """
    source = df.copy(deep=True)
    numerical_features = source.drop(['Dataset', 'Gender'], axis=1)
    label = source.Dataset.map({1: 0, 2: 1})

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on an attribute access: the chained in-place form is not guaranteed to
    # update the parent frame under pandas Copy-on-Write, which would let
    # NaNs flow into the scaler unnoticed.
    numerical_features['Albumin_and_Globulin_Ratio'] = (
        numerical_features['Albumin_and_Globulin_Ratio'].fillna(0)
    )

    scaler = StandardScaler()
    scaled = scaler.fit_transform(numerical_features)

    # Keep the input index so the Gender/labels assignments below align row
    # for row even when ``df`` does not carry a default RangeIndex.
    dataframe = pd.DataFrame(
        scaled,
        columns=numerical_features.columns,
        index=numerical_features.index,
    )
    dataframe['Gender'] = source.Gender
    data3 = pd.get_dummies(dataframe)
    data3['labels'] = label

    return data3

Clustering

In [6]:
def clustering(df,dataset,num_of_clusters):
    """Group the rows of ``df`` into "persons" with K-Medoids and split them.

    The label column for ``dataset`` is removed before fitting, but each
    returned person frame is built from the rows of ``df`` itself and so
    still contains that label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Pre-processed data including the label column.
    dataset : str
        One of ``"heart_disease"``, ``"breast_cancer"``, ``"liver_disease"``,
        ``"thyroid"`` or ``"diabetes"``; selects the label column to drop.
    num_of_clusters : int
        Number of clusters (persons) to form.

    Returns
    -------
    tuple
        ``(test_centers, train_centers, train, test, km)`` where the centers
        are slices of ``km.cluster_centers_``, ``train``/``test`` are lists of
        per-cluster DataFrames and ``km`` is the fitted ``KMedoids`` object.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    # NOTE(review): this seeds only the stdlib ``random`` module; KMedoids is
    # constructed without a random_state -- confirm that is intended.
    random.seed(2411)

    label_columns = {
        "heart_disease": 'target',
        "breast_cancer": 'labels',
        "liver_disease": 'labels',
        "thyroid": 'classes',
        "diabetes": 'Outcome',
    }
    if dataset not in label_columns:
        # Previously an unknown name fell through every branch and failed
        # later with an unrelated "new_df is not defined" style error.
        raise ValueError(
            "unsupported dataset %r; expected one of %s"
            % (dataset, sorted(label_columns))
        )
    new_df = df.drop(label_columns[dataset], axis=1)

    km = KMedoids(n_clusters=num_of_clusters)
    km = km.fit(new_df)
    cluster_centers = km.cluster_centers_

    # One DataFrame per cluster, built row by row from the labelled frame so
    # the resulting frames are identical to what callers received before.
    persons = []
    for cluster_id in range(num_of_clusters):
        rows = [df.iloc[i] for i in range(len(new_df)) if km.labels_[i] == cluster_id]
        persons.append(pd.DataFrame(rows))

    # NOTE(review): centers are split at 30% while persons are split with 80%
    # going to train, so for some cluster counts (e.g. 10) the number of test
    # centers differs from the number of test persons -- confirm the intent.
    test_centers, train_centers = np.split(cluster_centers, [int(0.3 * len(cluster_centers))])

    # The last int(0.8 * k) persons form the training set (original order
    # preserved); the leading remainder is the test set.
    n_train = int(0.8 * len(persons))
    split_at = len(persons) - n_train
    test, train = persons[:split_at], persons[split_at:]
    return test_centers, train_centers,train,test,km

Personalized Regression Approach

In [7]:
def personalized_incremental_regressors(train,test,dataset_name=None):
    """Train per-person regressors and evaluate them incrementally.

    For every training person one model per regressor family is fitted.
    Every test person is then split into four chunks with ``chunk``: the
    first chunk is scored against the existing models via the ``*_test``
    helper, and each later chunk goes through the matching ``re_train_*``
    helper, which returns updated models and training data alongside the
    scores.

    Parameters
    ----------
    train : list
        Per-person training data; each item is wrapped in ``pd.DataFrame``.
    test : list
        Per-person test data; each item is wrapped in ``pd.DataFrame``.
    dataset_name : str, optional
        Dataset identifier forwarded to the helper functions. When omitted
        the notebook-level global ``dataset`` is used, as before.

    Returns
    -------
    tuple of dict
        ``(mae_dict, models_dict, min_person_dict)``, each keyed by regressor
        display name. ``mae_dict`` and ``min_person_dict`` hold one entry per
        evaluated chunk; ``models_dict`` holds the final model list of each
        regressor. The exact shape of the individual entries is defined by
        the helper modules (not visible here).
    """
    random.seed(2411)

    if dataset_name is None:
        # Backward compatible fallback to the global set by the main cell.
        dataset_name = dataset

    # (display name, initial fit, first-chunk evaluation, incremental re-train)
    # The order matters: helpers are invoked in exactly this sequence.
    regressor_specs = [
        ('Lasso Regression', lasso_train, lasso_test, re_train_lasso),
        ('Linear Regression', lr_train, lr_test, re_train_lr),
        ('Random Forest Regression', rf_train, rf_test, re_train_rf),
        ('Decision Tree Regression', dt_train, dt_test, re_train_dt),
        ('AdaBoost Regression', ab_train, ab_test, re_train_ab),
    ]

    # Result containers are created up front so that an empty ``test`` list
    # yields empty results instead of failing at the return statement, and so
    # that the returned model lists are always the final ones.
    mae_dict = {name: [] for name, _, _, _ in regressor_specs}
    models_dict = {name: [] for name, _, _, _ in regressor_specs}
    min_person_dict = {name: [] for name, _, _, _ in regressor_specs}

    # Initial fit: one model per regressor for every training person.
    for person in train:
        train_df = pd.DataFrame(person)
        x_train, y_train = train_test_split_dataset_personalized(dataset_name, train_df)
        for name, fit, _, _ in regressor_specs:
            models_dict[name].append(fit(x_train, y_train))

    for person in test:
        test_df = pd.DataFrame(person)
        chunks = chunk(test_df, 4)

        # Per-regressor state carried from one chunk to the next.
        min_positions = {}
        train_pools = {}

        for i in range(len(chunks)):
            x_test, y_test = train_test_split_dataset_personalized(dataset_name, chunks[i])

            for name, _, evaluate, retrain in regressor_specs:
                if i == 0:
                    # First chunk: score against the existing models only.
                    mae, min_person, min_positions[name] = evaluate(
                        x_test, y_test, models_dict[name])
                else:
                    if i == 1:
                        # Each regressor starts re-training from the same
                        # training list (the same object, not a copy).
                        train_pools[name] = train

                    # Re-train after the previous chunk of data is added.
                    (mae, min_person, min_positions[name],
                     models_dict[name], train_pools[name]) = retrain(
                        i, dataset_name, train_pools[name], chunks,
                        min_positions[name], models_dict[name], x_test, y_test)

                mae_dict[name].append(mae)
                min_person_dict[name].append(min_person)

    return mae_dict,models_dict,min_person_dict



Main Function

In [8]:
if __name__== "__main__":
    # Entry point: ask for a dataset name and a cluster count, load and
    # pre-process the matching CSV, cluster it into persons and run the
    # personalized incremental regressors on the resulting split.
    random.seed(2411)
    
    train_person_auc=[]
    train_person_mae=[]
    dataset=input("enter the dataset:").strip()
    num_of_clusters = int(input("enter the number of persons:"))

    # CSV file and optional pre-processing function per supported dataset;
    # thyroid and diabetes are used exactly as read from disk.
    dataset_sources = {
        "heart_disease": ("heart.csv", heart_disease),
        "breast_cancer": ("data.csv", breast_cancer),
        "liver_disease": ("indian_liver_patient.csv", liver_disease),
        "thyroid": ("thyroid_dataset.csv", None),
        "diabetes": ("diabetes.csv", None),
    }
    if dataset not in dataset_sources:
        # Fail here with a clear message rather than later with a NameError
        # on the never-assigned ``df``.
        raise ValueError(
            "unsupported dataset %r; expected one of %s"
            % (dataset, sorted(dataset_sources))
        )

    csv_path, preprocess = dataset_sources[dataset]
    df = pd.read_csv(csv_path)
    if preprocess is not None:
        df = preprocess(df)
        
    test_centers, train_centers,train,test,km  = clustering(df,dataset,num_of_clusters)

    # Despite their names, mae_list and min_person_list are dicts keyed by
    # regressor name; the names are kept because later cells display them.
    mae_list,models,min_person_list = personalized_incremental_regressors(train,test)

enter the dataset:heart_disease
enter the number of persons:5
RangeIndex(start=0, stop=303, step=1)


In [9]:
# First value returned by personalized_incremental_regressors: a dict keyed
# by regressor name (presumably mean-absolute-error results per evaluated
# chunk -- confirm against the *_test / re_train_* helpers).
mae_list

{'Lasso Regression': [{'person1': 0.1969339119435833,
   'person2': 0.436569182335927,
   'person3': 0.6276060691082495,
   'person4': 0.7734556568187153},
  {'person1': 0.6981295349907333,
   'person2': 0.3815787073068186,
   'person3': 0.3476804975198142,
   'person4': 0.20282991257654373,
   'person5': 0.8232246666985719},
  {'person1': 0.7459251221408759,
   'person2': 0.3779641449431201,
   'person3': 0.2949413936209771,
   'person4': 0.20443190298888828,
   'person5': 0.8975349637371128,
   'person6': 0.18715592967802905},
  {'person1': 0.818848763491461,
   'person2': 0.4004822287994389,
   'person3': 0.2956990329121153,
   'person4': 0.1949488863148631,
   'person5': 0.9811101489130676,
   'person6': 0.1849531709500756,
   'person7': 0.4366361101885789}],
 'Linear Regression': [{'person1': 0.358034361299505,
   'person2': 0.3237939846043541,
   'person3': 0.4819506207474626,
   'person4': 0.46746947295039154},
  {'person1': 0.6072367236223215,
   'person2': 0.18480998472108837,

In [10]:
# Third value returned by personalized_incremental_regressors: a dict keyed
# by regressor name with one entry per evaluated chunk (presumably the
# person whose model scored lowest -- confirm against the helpers).
min_person_list

{'Lasso Regression': ['person1', 'person4', 'person6', 'person6'],
 'Linear Regression': ['person2', 'person2', 'person4', 'person7'],
 'Random Forest Regression': ['person2', 'person4', 'person6', 'person6'],
 'Decision Tree Regression': ['person3', 'person4', 'person4', 'person2'],
 'AdaBoost Regression': ['person3', 'person4', 'person6', 'person6']}