# Общая информация и цель проекта
В этом проекте предполагается работа с датасетом из открытого источника. Вам потребуется разработать модель машинного обучения, а также подготовить библиотеку и интерфейс к ней для предсказания на тестовой выборке. В начале проекта вам будет доступен обучающий датасет и тестовый датасет, не содержащий правильных ответов. Метрика на тестовом датасете будет измерена в ходе проверки работы. Также вам будет доступен скрипт, с помощью которого будет измеряться финальная метрика вашей модели.


# Задачи проекта
Предполагается, что в ходе работы над проектом будут решены следующие задачи:
* Исследование датасета (предполагается поиск дополнительной информации для лучшего понимания природы данных)
* Предобработка данных. Будьте внимательны: в данных могут быть бесполезные признаки, сильно скорелированные признаки а также косвенные утечки целевого признака. Некоторые колонки потребуется преобразовать к нужному типу данных.
*	Обучение модели
*	Подготовка предсказания на тестовой выборке. Обратите внимание. Файл предсказаний должен быть в csv формате и содержать 2 столбца: “id” и “prediction”. 
*	Подготовка скриптов и библиотеки для обработки данных и предсказания на тестовой выборке
*	Написание инструмента для тестирования
*	Оформление документации


# Описание исходных данных
Данные пациентов для предсказания риска сердечных приступов
- id  - id
-	Антропометрические параметры (вес, возраст, рост)
-	Привычки (курение, качество сна и т.д)
-	Давление
-	Наличие хронических заболеваний
-	Биохимия крови
-	Таргет - высокий или низкий риск поражения сердца


In [1]:
import pandas as pd

pd.set_option('display.max_columns', None)

In [2]:
test = pd.read_csv(r"C:\Users\evgen\project\hearth_risk\heart_test.csv")
test.head(10)

Unnamed: 0.1,Unnamed: 0,Age,Cholesterol,Heart rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Diet,Previous Heart Problems,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Blood sugar,CK-MB,Troponin,Gender,Systolic blood pressure,Diastolic blood pressure,id
0,0,0.494382,0.264286,0.062328,0.0,1.0,1.0,1.0,1.0,0.361618,2,0.0,0.0,8.0,0.19437,0.587759,0.28349,0.306494,1.0,0.333333,0.227018,0.048229,0.036512,Male,0.283871,0.372093,7746
1,1,0.224719,0.953571,0.082493,1.0,0.0,0.0,1.0,0.0,0.996483,2,1.0,1.0,5.0,0.329888,0.602883,0.467036,0.087013,0.0,0.166667,0.227018,0.048229,0.036512,Female,0.703226,0.44186,4202
2,2,0.629213,0.092857,0.064161,0.0,1.0,1.0,1.0,0.0,0.995561,0,0.0,0.0,10.0,0.780075,0.370436,0.409366,0.205195,7.0,1.0,0.102767,0.002666,0.088455,Male,0.458065,0.77907,6632
3,3,0.460674,0.567857,0.055912,1.0,1.0,1.0,1.0,1.0,0.437277,0,0.0,0.0,10.0,0.785071,0.368242,0.910261,0.163636,0.0,0.666667,0.203557,0.05639,0.271774,Female,0.741935,0.255814,4639
4,4,0.719101,0.485714,0.022915,1.0,0.0,1.0,0.0,1.0,0.51492,0,0.0,0.0,7.0,0.070919,0.729578,0.758924,0.580519,5.0,0.0,0.227018,0.048229,0.036512,Male,0.412903,0.395349,4825
5,5,0.561798,0.792857,0.026581,1.0,1.0,1.0,1.0,0.0,0.77447,2,0.0,1.0,8.0,0.044114,0.813517,0.723416,0.362338,1.0,0.666667,0.227018,0.048229,0.036512,Male,0.16129,0.593023,1076
6,6,0.674157,0.853571,0.019248,1.0,1.0,1.0,0.0,1.0,0.149123,1,0.0,0.0,8.0,0.305998,0.562542,0.696128,0.211688,3.0,0.0,0.227018,0.048229,0.036512,Male,0.535484,0.546512,5020
7,7,0.539326,0.335714,0.036664,1.0,1.0,1.0,0.0,1.0,0.32972,1,1.0,0.0,1.0,0.377016,0.235091,0.484368,0.835065,7.0,0.666667,0.695652,0.002466,0.000874,Male,0.496774,0.662791,4267
8,8,0.449438,0.328571,0.07516,1.0,0.0,1.0,0.0,0.0,0.487405,2,1.0,0.0,10.0,0.089371,0.486634,0.409407,0.819481,2.0,0.166667,0.227018,0.048229,0.036512,Male,0.677419,0.430233,4003
9,9,0.617978,0.728571,0.032081,0.0,1.0,1.0,0.0,0.0,0.211485,2,1.0,1.0,8.0,0.524318,0.083936,0.657387,0.196104,0.0,1.0,0.227018,0.048229,0.036512,Male,0.722581,0.255814,2519


In [3]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 966 entries, 0 to 965
Data columns (total 27 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       966 non-null    int64  
 1   Age                              966 non-null    float64
 2   Cholesterol                      966 non-null    float64
 3   Heart rate                       966 non-null    float64
 4   Diabetes                         935 non-null    float64
 5   Family History                   935 non-null    float64
 6   Smoking                          935 non-null    float64
 7   Obesity                          935 non-null    float64
 8   Alcohol Consumption              935 non-null    float64
 9   Exercise Hours Per Week          966 non-null    float64
 10  Diet                             966 non-null    int64  
 11  Previous Heart Problems          935 non-null    float64
 12  Medication Use        

In [4]:
test.describe()

Unnamed: 0.1,Unnamed: 0,Age,Cholesterol,Heart rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Diet,Previous Heart Problems,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Blood sugar,CK-MB,Troponin,Systolic blood pressure,Diastolic blood pressure,id
count,966.0,966.0,966.0,966.0,935.0,935.0,935.0,935.0,935.0,966.0,966.0,935.0,935.0,935.0,966.0,966.0,966.0,966.0,935.0,966.0,966.0,966.0,966.0,966.0,966.0,966.0
mean,482.5,0.452916,0.488181,0.051644,0.656684,0.481283,0.898396,0.516578,0.583957,0.490706,1.05383,0.494118,0.48877,5.33262,0.506715,0.484408,0.476796,0.486688,3.44385,0.512571,0.224127,0.048855,0.034926,0.447172,0.496822,4769.160455
std,279.00448,0.231068,0.282039,0.035502,0.47507,0.499917,0.302289,0.499993,0.493165,0.284902,0.884642,0.500233,0.500141,2.858466,0.287826,0.273879,0.291987,0.284534,2.302778,0.322292,0.06527,0.079742,0.053855,0.171837,0.173935,2818.815407
min,0.0,0.044944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000336,0.0,0.0,0.0,1.0,0.001187,0.003351,0.002598,0.0,0.0,0.0,0.029644,0.0,0.000194,0.012903,0.116279,0.0
25%,241.25,0.247191,0.24375,0.036664,0.0,0.0,1.0,0.0,0.0,0.243431,0.0,0.0,0.0,3.0,0.258713,0.261446,0.217527,0.249675,1.0,0.166667,0.227018,0.048229,0.036512,0.296774,0.348837,2275.75
50%,482.5,0.460674,0.49978,0.050412,1.0,0.0,1.0,1.0,1.0,0.50211,1.0,0.0,0.0,5.0,0.501327,0.492015,0.471185,0.488961,3.0,0.5,0.227018,0.048229,0.036512,0.445161,0.488372,4769.5
75%,723.75,0.640449,0.710714,0.065995,1.0,1.0,1.0,1.0,1.0,0.734425,2.0,1.0,1.0,8.0,0.759591,0.705371,0.720503,0.718182,5.0,0.833333,0.227018,0.048229,0.036512,0.6,0.639535,7206.0
max,965.0,0.853933,0.996429,1.0,1.0,1.0,1.0,1.0,1.0,0.998938,3.0,1.0,1.0,10.0,0.999033,0.998885,0.999496,0.998701,7.0,1.0,0.750988,1.0,0.970871,0.741935,0.790698,9644.0


In [6]:
test = test.drop('Unnamed: 0')
test.head(5)

KeyError: "['Unnamed: 0'] not found in axis"