In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

file = pd.read_csv('GDSI_OpenDataset_Final.csv', index_col=0)

features = ['report_source', 'age_in_cat', 'bmi_in_cat2', 'covid19_admission_hospital', 'covid19_confirmed_case', 'covid19_diagnosis', 'covid19_has_symptoms', 'covid19_icu_stay', 'covid19_self_isolation', 'covid19_sympt_chills', 'covid19_sympt_dry_cough', 'covid19_sympt_fatigue', 'covid19_sympt_fever', 'covid19_sympt_loss_smell_taste', 'covid19_sympt_nasal_congestion', 'covid19_sympt_pain', 'covid19_sympt_pneumonia', 'covid19_sympt_shortness_breath', 'covid19_sympt_sore_throat', 'covid19_ventilation', 'current_dmt', 'dmt_glucocorticoid', 'edss_in_cat2', 'pregnancy', 'current_or_former_smoker', 'has_comorbidities', 'com_cardiovascular_disease', 'com_chronic_kidney_disease', 'com_chronic_liver_disease', 'com_diabetes', 'com_hypertension', 'com_immunodeficiency', 'com_lung_disease', 'com_malignancy', 'com_neurological_neuromuscular', 'comorbidities_other', 'dmt_type_overall', 'covid19_outcome_levels_2', 'ms_type2', 'sex']

data = file[features]

# Yes/No kolumny - czyszczenie
binary_columns = ['covid19_admission_hospital', 'covid19_confirmed_case', 'covid19_has_symptoms', 'covid19_icu_stay', 'covid19_self_isolation', 'covid19_sympt_chills', 'covid19_sympt_dry_cough', 'covid19_sympt_fatigue', 'covid19_sympt_fever', 'covid19_sympt_loss_smell_taste', 'covid19_sympt_nasal_congestion', 'covid19_sympt_pain', 'covid19_sympt_pneumonia', 'covid19_sympt_shortness_breath', 'covid19_sympt_sore_throat', 'covid19_ventilation', 'current_dmt', 'dmt_glucocorticoid', 'pregnancy', 'current_or_former_smoker', 'has_comorbidities', 'com_cardiovascular_disease', 'com_chronic_kidney_disease', 'com_chronic_liver_disease', 'com_diabetes', 'com_hypertension', 'com_immunodeficiency', 'com_lung_disease', 'com_malignancy', 'com_neurological_neuromuscular', 'comorbidities_other']
data[binary_columns] = data[binary_columns].map(lambda x: 1 if x == 'yes' else 0)

# Pozostale kolumny - czyszczenie
ordinal_mapping = {'age_in_cat': {'0': 0, '1': 1, '2': 2, '3': 3},
                   'covid19_outcome_levels_2': {'0': 0, '1': 1, '2': 2},
                   'report_source': {'clinicians': 1, 'patients': 0},
                   'bmi_in_cat2': {'not_overweight': 0, 'overweight': 1},
                   'covid19_diagnosis': {'not_suspected': 0, 'suspected': 1, 'confirmed': 2},
                   'current_dmt': {'yes': 0, 'no': 1, 'never_treated': 2},
                   'sex': {'male': 0, 'female': 1},
                   'ms_type2': {'relapsing_remitting': 0, 'progressive_MS': 1, 'other': 2},
                   'dmt_type_overall': {'No information on DMT use': 0,
                                        'currently not using any DMT': 1,
                                        'currently on interferon': 2, 
                                        'currently on glatiramer': 3,
                                        'currently on natalizumab': 4, 
                                        'currently on fingolimod': 5,
                                        'currently on dimethyl fumarate': 6,
                                        'currently on teriflunomide': 7,
                                        'currently on alemtuzumab': 8,
                                        'currently on cladribine': 9,
                                        'currently on siponimod': 10,
                                        'currently on rituximab': 11,
                                        'currently on ocrelizumab': 12,
                                        'currently on another drug not listed': 13},
                                        'covid19_outcome_recovered': {'no': 0, 'yes': 1, 'not_applicable':2}
                   }
data.replace(ordinal_mapping, inplace=True)

# Zaminana nieznanych wartosci na zera
data.fillna(0, inplace=True)

# Zbieranie symptomow razem
symptom_columns = [col for col in data.columns if 'covid19_sympt_' in col]
comorbidity_columns = [col for col in data.columns if 'com_' in col]

data['symptom_score'] = data[symptom_columns].apply(lambda row: (row == 1).sum(), axis=1)
data['comorbidity_score'] = data[comorbidity_columns].apply(lambda row: (row == 1).sum(), axis=1)

# Odchylenie standardowe
scaler = StandardScaler()
data[['symptom_score', 'comorbidity_score']] = scaler.fit_transform(data[['symptom_score', 'comorbidity_score']])

# Przetrenowany na osobach bez chorob współistniejących 70%
X_train = data[data['has_comorbidities'] == 0].drop('has_comorbidities', axis=1)  # "normal" class
X_test = data.drop('has_comorbidities', axis=1)  # Entire dataset for prediction

from sklearn.svm import OneClassSVM

# Trenowanie 
model = OneClassSVM(gamma='auto', kernel='rbf', nu=0.1)
model.fit(X_train)
predictions = model.predict(X_test)
data['anomaly'] = predictions


# Wyświetlanie informacji o datasecie
display(data)
print(data.info())

display('Liczba wartości Null w zbiorze: ', data.isnull().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[binary_columns] = data[binary_columns].map(lambda x: 1 if x == 'yes' else 0)
  data.replace(ordinal_mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.replace(ordinal_mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataF

Unnamed: 0_level_0,report_source,age_in_cat,bmi_in_cat2,covid19_admission_hospital,covid19_confirmed_case,covid19_diagnosis,covid19_has_symptoms,covid19_icu_stay,covid19_self_isolation,covid19_sympt_chills,...,com_malignancy,com_neurological_neuromuscular,comorbidities_other,dmt_type_overall,covid19_outcome_levels_2,ms_type2,sex,symptom_score,comorbidity_score,anomaly
secret_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_1005,1,1,0.0,0,1,2,0,0,1,0,...,0,0,0,5.0,0,0,1,-0.571804,-0.341552,1
C_1008,1,1,0.0,0,1,2,1,0,1,0,...,0,0,0,4.0,0,0,1,-0.103778,1.656964,-1
C_1037,1,1,0.0,0,1,2,1,0,1,0,...,0,0,0,5.0,0,0,1,-0.571804,-0.341552,1
C_1039,1,1,0.0,0,0,1,1,0,1,0,...,0,0,0,13.0,0,0,1,-0.571804,-0.341552,1
C_1061,1,1,0.0,1,0,1,1,0,1,0,...,0,0,0,13.0,1,0,1,0.832274,1.656964,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P_916,0,2,0.0,0,0,0,0,0,1,0,...,0,0,0,5.0,0,2,0,-0.571804,1.656964,-1
P_919,0,2,0.0,0,0,0,0,0,0,0,...,0,0,0,1.0,0,1,0,-0.571804,-0.341552,1
P_953,0,2,0.0,0,0,0,0,0,0,0,...,0,0,0,13.0,0,0,0,-0.571804,-0.341552,1
P_954,0,2,0.0,0,0,1,1,0,0,0,...,0,0,0,1.0,0,0,0,1.768325,1.656964,-1


<class 'pandas.core.frame.DataFrame'>
Index: 1141 entries, C_1005 to P_992
Data columns (total 43 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   report_source                   1141 non-null   int64  
 1   age_in_cat                      1141 non-null   int64  
 2   bmi_in_cat2                     1141 non-null   float64
 3   covid19_admission_hospital      1141 non-null   int64  
 4   covid19_confirmed_case          1141 non-null   int64  
 5   covid19_diagnosis               1141 non-null   int64  
 6   covid19_has_symptoms            1141 non-null   int64  
 7   covid19_icu_stay                1141 non-null   int64  
 8   covid19_self_isolation          1141 non-null   int64  
 9   covid19_sympt_chills            1141 non-null   int64  
 10  covid19_sympt_dry_cough         1141 non-null   int64  
 11  covid19_sympt_fatigue           1141 non-null   int64  
 12  covid19_sympt_fever             1

'Liczba wartości Null w zbiorze: '

report_source                     0
age_in_cat                        0
bmi_in_cat2                       0
covid19_admission_hospital        0
covid19_confirmed_case            0
covid19_diagnosis                 0
covid19_has_symptoms              0
covid19_icu_stay                  0
covid19_self_isolation            0
covid19_sympt_chills              0
covid19_sympt_dry_cough           0
covid19_sympt_fatigue             0
covid19_sympt_fever               0
covid19_sympt_loss_smell_taste    0
covid19_sympt_nasal_congestion    0
covid19_sympt_pain                0
covid19_sympt_pneumonia           0
covid19_sympt_shortness_breath    0
covid19_sympt_sore_throat         0
covid19_ventilation               0
current_dmt                       0
dmt_glucocorticoid                0
edss_in_cat2                      0
pregnancy                         0
current_or_former_smoker          0
has_comorbidities                 0
com_cardiovascular_disease        0
com_chronic_kidney_disease  