In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

In [15]:
raw_data_path = "../data/raw/DATA.csv"
df = pd.read_csv(raw_data_path)
df.head()

Unnamed: 0,Patient_ID,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,P0,,False,True,True,True,False,True,False,False,Negative
1,P1,Fever,True,False,True,True,False,False,True,False,Positive
2,P2,Fever,False,True,True,False,False,False,True,False,Positive
3,P3,,True,False,False,False,True,True,True,False,Positive
4,P4,Swollen Lymph Nodes,True,True,True,False,False,True,True,False,Positive


In [16]:
df.info

<bound method DataFrame.info of       Patient_ID     Systemic Illness  Rectal Pain  Sore Throat  \
0             P0                 None        False         True   
1             P1                Fever         True        False   
2             P2                Fever        False         True   
3             P3                 None         True        False   
4             P4  Swollen Lymph Nodes         True         True   
...          ...                  ...          ...          ...   
24995     P24995                 None         True         True   
24996     P24996                Fever        False         True   
24997     P24997                 None         True         True   
24998     P24998  Swollen Lymph Nodes        False         True   
24999     P24999  Swollen Lymph Nodes        False        False   

       Penile Oedema  Oral Lesions  Solitary Lesion  Swollen Tonsils  \
0               True          True            False             True   
1               Tru

In [17]:
df.describe()

Unnamed: 0,Patient_ID,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
count,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000
unique,25000,4,2,2,2,2,2,2,2,2,2
top,P6706,Fever,False,True,True,False,True,True,True,False,Positive
freq,1,6382,12655,12554,12612,12514,12527,12533,12584,12554,15909


In [18]:
df.isna().sum()

Patient_ID                        0
Systemic Illness                  0
Rectal Pain                       0
Sore Throat                       0
Penile Oedema                     0
Oral Lesions                      0
Solitary Lesion                   0
Swollen Tonsils                   0
HIV Infection                     0
Sexually Transmitted Infection    0
MonkeyPox                         0
dtype: int64

In [19]:
df["MonkeyPox"].value_counts()

Positive    15909
Negative     9091
Name: MonkeyPox, dtype: int64

In [20]:
for column in df.columns:
    print(f"{column}: {np.unique(df[column])}")

Patient_ID: ['P0' 'P1' 'P10' ... 'P9997' 'P9998' 'P9999']
Systemic Illness: ['Fever' 'Muscle Aches and Pain' 'None' 'Swollen Lymph Nodes']
Rectal Pain: [False  True]
Sore Throat: [False  True]
Penile Oedema: [False  True]
Oral Lesions: [False  True]
Solitary Lesion: [False  True]
Swollen Tonsils: [False  True]
HIV Infection: [False  True]
Sexually Transmitted Infection: [False  True]
MonkeyPox: ['Negative' 'Positive']


In [21]:
df.columns[1:]

Index(['Systemic Illness', 'Rectal Pain', 'Sore Throat', 'Penile Oedema',
       'Oral Lesions', 'Solitary Lesion', 'Swollen Tonsils', 'HIV Infection',
       'Sexually Transmitted Infection', 'MonkeyPox'],
      dtype='object')

In [24]:
need_proccessing_columns = df.columns[1:]
for column in need_proccessing_columns:
    list_uniq_column = np.unique(df[column])
    df[column].replace(list_uniq_column, [*range(len(list_uniq_column))], inplace=True)

In [25]:
for column in df.columns:
    print(f"{column}: {np.unique(df[column])}")

Patient_ID: ['P0' 'P1' 'P10' ... 'P9997' 'P9998' 'P9999']
Systemic Illness: [0 1 2 3]
Rectal Pain: [0 1]
Sore Throat: [0 1]
Penile Oedema: [0 1]
Oral Lesions: [0 1]
Solitary Lesion: [0 1]
Swollen Tonsils: [0 1]
HIV Infection: [0 1]
Sexually Transmitted Infection: [0 1]
MonkeyPox: [0 1]


In [26]:
df.drop(["Patient_ID"], axis=1, inplace=True)

In [27]:
df.head()

Unnamed: 0,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,2,0,1,1,1,0,1,0,0,0
1,0,1,0,1,1,0,0,1,0,1
2,0,0,1,1,0,0,0,1,0,1
3,2,1,0,0,0,1,1,1,0,1
4,3,1,1,1,0,0,1,1,0,1


In [28]:
X = df.drop(['MonkeyPox'],axis = 1)
y = df['MonkeyPox']

In [29]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

transformer = ColumnTransformer([
    ('One Hot', OneHotEncoder(), column)
    ])

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier()
lgbm = lgb.LGBMClassifier()

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/romansim/Проекты/Технопарк/ML-разработчик/2 семестр/Машинное обучение в продакшен/homeworks/venv/lib/python3.6/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: '/usr/local/opt/libomp/lib/libomp.dylib'\n  Referenced from: '/Users/romansim/Проекты/Технопарк/ML-разработчик/2 семестр/Машинное обучение в продакшен/homeworks/venv/lib/python3.6/site-packages/xgboost/lib/libxgboost.dylib'\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)"]
