In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif,RFE
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV, LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("vrushankts/covid-19")

print("Path to dataset files:", path)
import os

# Specify the directory containing the dataset files
dataset_dir = path  # This is the directory path

# List all files in the directory to identify the correct CSV file
files = os.listdir(dataset_dir)
print("Files in the dataset directory:", files)

# Assuming the file you need is named 'covid19_dataset.csv' (replace with actual filename if different)
csv_file = [f for f in files if f.endswith('.csv')][0]  # Get the first CSV file (adjust if necessary)
csv_path = os.path.join(dataset_dir, csv_file)

# Load the CSV file into a Pandas DataFrame
cols=pd.read_csv(csv_path).columns
data = pd.read_csv(csv_path,skiprows=1,header=None)
data=data.drop(data.columns[-1],axis=1)
data.columns=cols

Downloading from https://www.kaggle.com/api/v1/datasets/download/vrushankts/covid-19?dataset_version_number=1...


100%|██████████| 38.9M/38.9M [00:00<00:00, 67.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/vrushankts/covid-19/versions/1
Files in the dataset directory: ['master_dataset.csv']


  cols=pd.read_csv(csv_path).columns
  data = pd.read_csv(csv_path,skiprows=1,header=None)


**Data Cleaning**

In [None]:
data['height']=pd.to_numeric(data['height'], errors='coerce')
data['weight']=pd.to_numeric(data['weight'], errors='coerce')
height=data['height'].mean()
weight=data['weight'].mean()
age=data['age'].mode()[0]
data['height']=data['height'].fillna(height)
data['weight']=data['weight'].fillna(weight)
data['bmi'] = data['bmi'].fillna(data['weight'] / (data['height'] / 100) ** 2)
data['age']=data['age'].fillna(age)

In [None]:
data=data[['sex','age','bmi','smoking','alcohol','cannabis','amphetamines','cocaine','contacts_count','working',
        'rate_reducing_risk_single','rate_reducing_mask','covid19_symptoms','covid19_contact','asthma','kidney_disease',
        'liver_disease','compromised_immune','heart_disease','lung_disease','diabetes','hiv_positive','hypertension',
        'other_chronic','nursing_home','health_worker','covid19_positive']]

In [None]:
rate_reducing_mask=data['rate_reducing_mask'].mode()[0]
data['rate_reducing_mask']=data['rate_reducing_mask'].fillna(rate_reducing_mask)
working=data['working'].mode()[0]
data['working']=data['working'].fillna(working)
smoking=data['smoking'].mode()[0]
data['smoking']=data['smoking'].fillna(smoking)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rate_reducing_mask']=data['rate_reducing_mask'].fillna(rate_reducing_mask)


In [None]:
data['alcohol']=data['alcohol'].fillna(data['alcohol'].mode()[0])
data['cannabis']=data['cannabis'].fillna(data['cannabis'].mode()[0])
data['amphetamines']=data['amphetamines'].fillna(data['amphetamines'].mode()[0])
data['cocaine']=data['cocaine'].fillna(data['cocaine'].mode()[0])
data['contacts_count']=data['contacts_count'].fillna(data['contacts_count'].mode()[0])

**Data Encoding**

In [None]:
data['age']=data['age'].apply(lambda x: int(sum(map(int, x.split('_'))) / 2))

In [None]:
scaler=MinMaxScaler()
data['bmi']=scaler.fit_transform(data[['bmi']])
data['age']=scaler.fit_transform(data[['age']])
data['alcohol']=scaler.fit_transform(data[['alcohol']])
data['cannabis']=scaler.fit_transform(data[['cannabis']])
data['amphetamines']=scaler.fit_transform(data[['amphetamines']])
data['cocaine']=scaler.fit_transform(data[['cocaine']])
data['contacts_count']=scaler.fit_transform(data[['contacts_count']])
data['rate_reducing_risk_single']=scaler.fit_transform(data[['rate_reducing_risk_single']])
data['rate_reducing_mask']=scaler.fit_transform(data[['rate_reducing_mask']])

In [None]:
# Mapping from Feature Name to Feature Code (based on the image)
feature_mapping = {
    "sex": "SEX",
    "age": "AGE",
    "bmi": "BMI",
    "smoking": "SMK",
    "alcohol": "ALC",
    "cannabis": "CNB",
    "amphetamines": "APT",
    "cocaine": "CCN",
    "contacts_count": "CTC",
    "working": "WKG",
    "rate_reducing_risk_single": "RRR",
    "rate_reducing_mask": "RRM",
    "covid19_symptoms": "CDS",
    "covid19_contact": "CDC",
    "asthma": "AST",
    "kidney_disease": "KDD",
    "liver_disease": "LVD",
    "compromised_immune": "CPI",
    "heart_disease": "HTD",
    "lung_disease": "LGD",
    "diabetes": "DBT",
    "hiv_positive": "HIV",
    "hypertension": "HPT",
    "other_chronic": "OTC",
    "nursing_home": "NSH",
    "health_worker": "HTW",
    "covid19_positive": "COVID-19"
}

# Replace column names in X
data.rename(columns=feature_mapping, inplace=True)

# Print new column names
print("Updated feature names:", data.columns.tolist())


Updated feature names: ['SEX', 'AGE', 'BMI', 'SMK', 'ALC', 'CNB', 'APT', 'CCN', 'CTC', 'WKG', 'RRR', 'RRM', 'CDS', 'CDC', 'AST', 'KDD', 'LVD', 'CPI', 'HTD', 'LGD', 'DBT', 'HIV', 'HPT', 'OTC', 'NSH', 'HTW', 'COVID-19']


In [None]:
data=pd.get_dummies(data,prefix=['SEX','WKG','SMK'])

**Smote**

In [None]:
x=data.drop(['COVID-19'],axis=1)
y=data['COVID-19']

In [None]:
    # Calculate sampling strategy for 1:3 ratio
ratio = 1/3  # Desired ratio of minority to majority class
n_majority = sum(y == 0)
sampling_strategy = int(n_majority * ratio)

# Apply SMOTE
smote = SMOTE(sampling_strategy={1: sampling_strategy}, random_state=42)
x, y = smote.fit_resample(x, y)

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(x, y, test_size=0.30, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for ease of handling feature names
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_val = pd.DataFrame(X_val_scaled, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, columns=X_train.columns)

**Model Evaluation Without Feature Selection**

In [None]:
model=LinearSVC(max_iter=10000)
model.fit(X_train,y_train)

y_pred=model.predict(X_test)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=DecisionTreeClassifier()
model.fit(X_train,y_train)

y_pred=model.predict(X_test)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=GradientBoostingClassifier()
model.fit(X_train,y_train)

y_pred=model.predict(X_test)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)

y_pred=model.predict(X_test)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=AdaBoostClassifier()
model.fit(X_train,y_train)

y_pred=model.predict(X_test)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

0.8671212151177738 0.8419366014204054 0.5767656191286118 0.6845694299362683
0.9830014041888337 0.9640205596801827 0.9681387575895417 0.9660752698782341
0.9569349128809603 0.9540060312845768 0.8696675434606332 0.909886608177454
0.8648418804263988 0.8150278041502781 0.5942289816664359 0.6873312897469918
0.9051381444930087 0.8838165137614679 0.7144750113719518 0.7901747632275422


**MIFS Feature selection, Model Evaluation**

In [None]:
def select_features_mifs(X, y, top_k=15):
    mi_scores = mutual_info_classif(X, y, random_state=42)
    mi_series = pd.Series(mi_scores, index=X.columns)
    mi_series = mi_series.sort_values(ascending=False)
    top_features = mi_series.index[:top_k].tolist()
    print("MIFS Top Features:", top_features)
    return top_features

features_mifs = select_features_mifs(X_train, y_train)
X_train_mifs=X_train[features_mifs]
X_test_mifs=X_test[features_mifs]
X_val_mifs=X_val[features_mifs]

MIFS Top Features: ['BMI', 'CTC', 'AGE', 'ALC', 'RRR', 'RRM', 'CDS', 'CNB', 'CDC', 'SEX_male', 'SEX_female', 'SMK_stopped', 'SMK_never', 'SMK_travel critical', 'WKG_never']


In [None]:
model=LinearSVC(max_iter=10000)
model.fit(X_train_mifs,y_train)

y_pred=model.predict(X_test_mifs)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=DecisionTreeClassifier()
model.fit(X_train_mifs,y_train)

y_pred=model.predict(X_test_mifs)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=GradientBoostingClassifier()
model.fit(X_train_mifs,y_train)

y_pred=model.predict(X_test_mifs)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=LogisticRegression(max_iter=1000)
model.fit(X_train_mifs,y_train)

y_pred=model.predict(X_test_mifs)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=AdaBoostClassifier()
model.fit(X_train_mifs,y_train)

y_pred=model.predict(X_test_mifs)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

0.8646836619662599 0.8398335628681103 0.5668374107548998 0.676844963986303
0.9819037636216206 0.9638442216024842 0.9637679726282063 0.9638060956072863
0.9595554061270098 0.9570384109388143 0.8776180210826098 0.9156092025172805
0.8622016098728319 0.8113150602244354 0.5848149832881752 0.6796919894265027
0.9051381444930087 0.8838165137614679 0.7144750113719518 0.7901747632275422


**RFE Feature Selection,Model Evaluation**

In [None]:
def select_features_rfe(X, y, top_k=15):
    # Using Logistic Regression as estimator
    lr_estimator = LogisticRegression(max_iter=1000, random_state=42)
    rfe = RFE(estimator=lr_estimator, n_features_to_select=top_k)
    rfe.fit(X, y)
    top_features = X.columns[rfe.support_].tolist()
    print("RFE Top Features:", top_features)
    return top_features

features_rfe = select_features_rfe(X_train, y_train)
X_train_rfe=X_train[features_rfe]
X_test_rfe=X_test[features_rfe]
X_val_rfe=X_val[features_rfe]

RFE Top Features: ['RRR', 'CDS', 'SEX_female', 'SEX_male', 'WKG_never', 'WKG_quit0', 'WKG_quit10', 'WKG_quit5', 'WKG_vape', 'WKG_yesheavy', 'WKG_yeslight', 'WKG_yesmedium', 'SMK_never', 'SMK_stopped', 'SMK_travel critical']


In [None]:
model=LinearSVC(max_iter=10000)
model.fit(X_train_rfe,y_train)

y_pred=model.predict(X_test_rfe)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=DecisionTreeClassifier()
model.fit(X_train_rfe,y_train)

y_pred=model.predict(X_test_rfe)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=GradientBoostingClassifier()
model.fit(X_train_rfe,y_train)

y_pred=model.predict(X_test_rfe)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=LogisticRegression(max_iter=1000)
model.fit(X_train_rfe,y_train)

y_pred=model.predict(X_test_rfe)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=AdaBoostClassifier()
model.fit(X_train_rfe,y_train)

y_pred=model.predict(X_test_rfe)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

0.824506061744754 0.858359096313912 0.35692106876569824 0.5041906464770632
0.8555959891620355 0.8299397497296462 0.5312382572236616 0.6478149720239244
0.8543747404228388 0.8393236031633768 0.5163459446630936 0.6393613243703241
0.8342463856970512 0.7828801009396686 0.46630935664418643 0.5844819038175508
0.8413958823645749 0.8345701357466063 0.45596582481261 0.5897324397605771


**RidgeCV Feature Selection,Model Evaluation**

In [None]:
def select_features_ridge(X, y, top_k=15):
    # RidgeClassifierCV will select features based on the absolute value of coefficients
    ridge = RidgeClassifierCV(alphas=np.logspace(-6, 6, 13), scoring='accuracy')
    ridge.fit(X, y)
    coefs = np.abs(ridge.coef_[0])
    feature_ranks = pd.Series(coefs, index=X.columns)
    feature_ranks = feature_ranks.sort_values(ascending=False)
    top_features = feature_ranks.index[:top_k].tolist()
    print("RidgeCV Top Features:", top_features)
    return top_features

features_rdg = select_features_ridge(X_train, y_train)
X_train_rdg=X_train[features_rdg]
X_test_rdg=X_test[features_rdg]
X_val_rdg=X_val[features_rdg]

RidgeCV Top Features: ['AGE', 'BMI', 'ALC', 'CNB', 'APT', 'CCN', 'CTC', 'RRR', 'RRM', 'CDS', 'CDC', 'AST', 'KDD', 'LVD', 'CPI']


In [None]:
model=LinearSVC(max_iter=10000)
model.fit(X_train_rdg,y_train)

y_pred=model.predict(X_test_rdg)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=DecisionTreeClassifier()
model.fit(X_train_rdg,y_train)

y_pred=model.predict(X_test_rdg)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=GradientBoostingClassifier()
model.fit(X_train_rdg,y_train)

y_pred=model.predict(X_test_rdg)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=LogisticRegression(max_iter=1000)
model.fit(X_train_rdg,y_train)

y_pred=model.predict(X_test_rdg)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=AdaBoostClassifier()
model.fit(X_train_rdg,y_train)

y_pred=model.predict(X_test_rdg)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

0.8648369360995194 0.8451524698329668 0.5623875165634951 0.6753672410312437
0.9819284852560173 0.966078454751401 0.9614738049561933 0.9637706299251623
0.9599015090085635 0.9566704675028507 0.8794375333742064 0.9164296607724332
0.8629778691928881 0.8182273968024065 0.5809781856298084 0.6794888105013589
0.9101813579099342 0.8982421634910879 0.7225837074540672 0.8008943641903593


**PCA,Model Evaluation**

In [None]:
pca=PCA(n_components=15,random_state=42)
X_train_pca=pca.fit_transform(X_train)
X_test_pca=pca.transform(X_test)
X_val_pca=pca.transform(X_val)

In [None]:
model=LinearSVC(max_iter=10000)
model.fit(X_train_pca,y_train)

y_pred=model.predict(X_test_pca)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=DecisionTreeClassifier()
model.fit(X_train_pca,y_train)

y_pred=model.predict(X_test_pca)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=GradientBoostingClassifier()
model.fit(X_train_pca,y_train)

y_pred=model.predict(X_test_pca)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=LogisticRegression(max_iter=1000)
model.fit(X_train_pca,y_train)

y_pred=model.predict(X_test_pca)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=AdaBoostClassifier()
model.fit(X_train_pca,y_train)

y_pred=model.predict(X_test_pca)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

0.8454798963669086 0.8105960690964069 0.49836837212981827 0.6172443355786895
0.9332120325138935 0.8529383750833412 0.8855289440895516 0.8689281763667062
0.863180586594941 0.8425873267682361 0.5567312066135316 0.6704615824322393
0.846493483377173 0.7854050892073706 0.5310800387635227 0.6336766840111855
0.8500484544034175 0.7809714238107139 0.5561774420030458 0.6496788800073927


**Union of Feature Subsets**

In [None]:
f=features_mifs
e=features_rfe
w=features_rdg
# Pairwise unions
union_f_e = list(set(f) | set(e))
union_f_w = list(set(f) | set(w))
union_e_w = list(set(e) | set(w))

# Union of all three
union_all = list(set(f) | set(e) | set(w))


**FUE**

In [None]:
X_train_subset = X_train.loc[:, union_f_e]
X_val_subset = X_val.loc[:, union_f_e]
X_test_subset = X_test.loc[:, union_f_e]

In [None]:
model=LinearSVC(max_iter=10000)
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=DecisionTreeClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=GradientBoostingClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=LogisticRegression(max_iter=1000)
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=AdaBoostClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

0.8666070051223227 0.8424721189591078 0.5737001364634219 0.6825813283134302
0.9824624725589858 0.9647870615682708 0.9650732749243518 0.9649301470224736
0.9600646717955817 0.9572517112230402 0.8795364199117932 0.9167499819626679
0.8638826810118071 0.8143235350309779 0.5900757470877914 0.6842962317378042
0.9051381444930087 0.8838165137614679 0.7144750113719518 0.7901747632275422


**FUW**

In [None]:
X_train_subset = X_train.loc[:, union_f_w]
X_val_subset = X_val.loc[:, union_f_w]
X_test_subset = X_test.loc[:, union_f_w]

In [None]:
model=LinearSVC(max_iter=10000)
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=DecisionTreeClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=GradientBoostingClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=LogisticRegression(max_iter=1000)
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=AdaBoostClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

0.8649358226371062 0.8388234608208955 0.5690920238118783 0.6781198816972439
0.9817554338152404 0.9629249214845833 0.9641437414710361 0.9635339460420991
0.9590807507465934 0.9551393821978259 0.8775389118525404 0.9146962419344864
0.8626218776575757 0.8101748461245166 0.5883353440262643 0.6816603842760738
0.9051381444930087 0.8838165137614679 0.7144750113719518 0.7901747632275422


**WUE**

In [None]:
X_train_subset = X_train.loc[:, union_e_w]
X_val_subset = X_val.loc[:, union_e_w]
X_test_subset = X_test.loc[:, union_e_w]

In [None]:
model=LinearSVC(max_iter=10000)
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=DecisionTreeClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=GradientBoostingClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=LogisticRegression(max_iter=1000)
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=AdaBoostClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

0.8667405019480648 0.8407365717089502 0.5760931906730218 0.6836990963501937
0.9819729841979313 0.9627927163684429 0.965191938769456 0.9639908347489432
0.9599855625655124 0.956607749537694 0.8798528568320709 0.9166263173618766
0.8642436168739988 0.8128181522798656 0.5936949943634674 0.6861877821589805
0.9051381444930087 0.8838165137614679 0.7144750113719518 0.7901747632275422


**FUEUW**

In [None]:
X_train_subset = X_train.loc[:, union_all]
X_val_subset = X_val.loc[:, union_all]
X_test_subset = X_test.loc[:, union_all]

In [None]:
model=LinearSVC(max_iter=10000)
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=DecisionTreeClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=GradientBoostingClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=LogisticRegression(max_iter=1000)
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

model=AdaBoostClassifier()
model.fit(X_train_subset,y_train)

y_pred=model.predict(X_test_subset)
print(accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred))

0.8667405019480648 0.8407365717089502 0.5760931906730218 0.6836990963501937
0.9819729841979313 0.9626831817912862 0.96531060261456 0.9639951019118345
0.9599855625655124 0.956607749537694 0.8798528568320709 0.9166263173618766
0.8642436168739988 0.8128181522798656 0.5936949943634674 0.6861877821589805
0.9051381444930087 0.8838165137614679 0.7144750113719518 0.7901747632275422
