In [69]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split 

In [70]:
# Reading raw data

df_raw = pd.read_csv("Data/heart_2022_with_nans.csv")
df_raw.head(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [71]:
# Checking number of nulls value in dataframe
df_raw.isnull().sum()

State                            0
Sex                              0
GeneralHealth                 1198
PhysicalHealthDays           10927
MentalHealthDays              9067
LastCheckupTime               8308
PhysicalActivities            1093
SleepHours                    5453
RemovedTeeth                 11360
HadHeartAttack                3065
HadAngina                     4405
HadStroke                     1557
HadAsthma                     1773
HadSkinCancer                 3143
HadCOPD                       2219
HadDepressiveDisorder         2812
HadKidneyDisease              1926
HadArthritis                  2633
HadDiabetes                   1087
DeafOrHardOfHearing          20647
BlindOrVisionDifficulty      21564
DifficultyConcentrating      24240
DifficultyWalking            24012
DifficultyDressingBathing    23915
DifficultyErrands            25656
SmokerStatus                 35462
ECigaretteUsage              35660
ChestScan                    56046
RaceEthnicityCategor

In [72]:
df_raw.shape

(445132, 40)

In [185]:
# Dropping null values from dataframe
df_nn = df_raw.dropna().reset_index(drop=True)

In [136]:
df_nn.shape

(246022, 40)

In [168]:
df_nn.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,0,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,0,...,1.6,71.67,27.99,0,0,1,1,"Yes, received Tdap",0,No
1,Alabama,1,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,0,...,1.78,95.25,30.13,0,0,1,1,"Yes, received tetanus shot but not sure what type",0,No
2,Alabama,1,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",0,...,1.85,108.86,31.66,1,0,0,1,"No, did not receive any tetanus shot in the pa...",0,Yes
3,Alabama,0,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,0,...,1.7,90.72,31.32,0,0,1,1,"No, did not receive any tetanus shot in the pa...",0,Yes
4,Alabama,0,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,0,...,1.55,79.38,33.07,0,0,1,1,"No, did not receive any tetanus shot in the pa...",0,No


In [75]:
df_nn.isnull().sum()

State                        0
Sex                          0
GeneralHealth                0
PhysicalHealthDays           0
MentalHealthDays             0
LastCheckupTime              0
PhysicalActivities           0
SleepHours                   0
RemovedTeeth                 0
HadHeartAttack               0
HadAngina                    0
HadStroke                    0
HadAsthma                    0
HadSkinCancer                0
HadCOPD                      0
HadDepressiveDisorder        0
HadKidneyDisease             0
HadArthritis                 0
HadDiabetes                  0
DeafOrHardOfHearing          0
BlindOrVisionDifficulty      0
DifficultyConcentrating      0
DifficultyWalking            0
DifficultyDressingBathing    0
DifficultyErrands            0
SmokerStatus                 0
ECigaretteUsage              0
ChestScan                    0
RaceEthnicityCategory        0
AgeCategory                  0
HeightInMeters               0
WeightInKilograms            0
BMI     

In [76]:
df_nn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246022 entries, 342 to 445130
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246022 non-null  object 
 1   Sex                        246022 non-null  object 
 2   GeneralHealth              246022 non-null  object 
 3   PhysicalHealthDays         246022 non-null  float64
 4   MentalHealthDays           246022 non-null  float64
 5   LastCheckupTime            246022 non-null  object 
 6   PhysicalActivities         246022 non-null  object 
 7   SleepHours                 246022 non-null  float64
 8   RemovedTeeth               246022 non-null  object 
 9   HadHeartAttack             246022 non-null  object 
 10  HadAngina                  246022 non-null  object 
 11  HadStroke                  246022 non-null  object 
 12  HadAsthma                  246022 non-null  object 
 13  HadSkinCancer              

In [77]:
df_nn.describe()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI
count,246022.0,246022.0,246022.0,246022.0,246022.0,246022.0
mean,4.119026,4.16714,7.021331,1.70515,83.615179,28.668136
std,8.405844,8.102687,1.440681,0.106654,21.323156,6.513973
min,0.0,0.0,1.0,0.91,28.12,12.02
25%,0.0,0.0,6.0,1.63,68.04,24.27
50%,0.0,0.0,7.0,1.7,81.65,27.46
75%,3.0,4.0,8.0,1.78,95.25,31.89
max,30.0,30.0,24.0,2.41,292.57,97.65


In [78]:
for col in df_nn.columns:
    print(col)
    print(df_nn[col].value_counts()) 

State
Washington              15000
Maryland                 9165
Minnesota                9161
Ohio                     8995
New York                 8923
Texas                    7408
Florida                  7315
Kansas                   6145
Wisconsin                6126
Maine                    6013
Iowa                     5672
Hawaii                   5596
Virginia                 5565
Indiana                  5502
South Carolina           5471
Massachusetts            5465
Arizona                  5462
Utah                     5373
Michigan                 5370
Colorado                 5159
Nebraska                 5107
California               5096
Connecticut              5053
Georgia                  4978
Vermont                  4845
South Dakota             4405
Montana                  4264
Missouri                 4195
New Jersey               3967
New Hampshire            3756
Puerto Rico              3589
Idaho                    3468
Alaska                   3205
Rhod

In [186]:
two_values_columns = []
for col in df_nn.columns:
    print(col)
    if len(df_nn[col].unique())==2:
        two_values_columns.append(col)

State
Sex
GeneralHealth
PhysicalHealthDays
MentalHealthDays
LastCheckupTime
PhysicalActivities
SleepHours
RemovedTeeth
HadHeartAttack
HadAngina
HadStroke
HadAsthma
HadSkinCancer
HadCOPD
HadDepressiveDisorder
HadKidneyDisease
HadArthritis
HadDiabetes
DeafOrHardOfHearing
BlindOrVisionDifficulty
DifficultyConcentrating
DifficultyWalking
DifficultyDressingBathing
DifficultyErrands
SmokerStatus
ECigaretteUsage
ChestScan
RaceEthnicityCategory
AgeCategory
HeightInMeters
WeightInKilograms
BMI
AlcoholDrinkers
HIVTesting
FluVaxLast12
PneumoVaxEver
TetanusLast10Tdap
HighRiskLastYear
CovidPos


In [138]:
for col in two_values_columns:
    print(col)
    print(df_nn[col].unique())

Sex
['Female' 'Male']
PhysicalActivities
['Yes' 'No']
HadHeartAttack
['No' 'Yes']
HadAngina
['No' 'Yes']
HadStroke
['No' 'Yes']
HadAsthma
['No' 'Yes']
HadSkinCancer
['No' 'Yes']
HadCOPD
['No' 'Yes']
HadDepressiveDisorder
['No' 'Yes']
HadKidneyDisease
['No' 'Yes']
HadArthritis
['Yes' 'No']
DeafOrHardOfHearing
['No' 'Yes']
BlindOrVisionDifficulty
['No' 'Yes']
DifficultyConcentrating
['No' 'Yes']
DifficultyWalking
['No' 'Yes']
DifficultyDressingBathing
['No' 'Yes']
DifficultyErrands
['No' 'Yes']
ChestScan
['No' 'Yes']
AlcoholDrinkers
['No' 'Yes']
HIVTesting
['No' 'Yes']
FluVaxLast12
['Yes' 'No']
PneumoVaxEver
['Yes' 'No']
HighRiskLastYear
['No' 'Yes']


In [190]:
# YES_NO_QUESTIONS = {'Yes':1,'No':0}

# df_cd['HadHeartAttack'] = df_cd['HadHeartAttack'].map(YES_NO_QUESTIONS)
for col in two_values_columns[1:]:
    df_nn.loc[df_nn[col]=='Yes',col]= 1
    df_nn.loc[df_nn[col]=='No',col]= 0

In [191]:
df_nn.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,1,9.0,None of them,0,...,1.6,71.67,27.99,0,0,1,1,"Yes, received Tdap",0,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,1,6.0,None of them,0,...,1.78,95.25,30.13,0,0,1,1,"Yes, received tetanus shot but not sure what type",0,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,0,8.0,"6 or more, but not all",0,...,1.85,108.86,31.66,1,0,0,1,"No, did not receive any tetanus shot in the pa...",0,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,1,9.0,None of them,0,...,1.7,90.72,31.32,0,0,1,1,"No, did not receive any tetanus shot in the pa...",0,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,1,5.0,1 to 5,0,...,1.55,79.38,33.07,0,0,1,1,"No, did not receive any tetanus shot in the pa...",0,No


In [192]:
df_nn.loc[df_nn['Sex']=='Male','Sex'] = 1
df_nn.loc[df_nn['Sex']=='Female','Sex']= 0

In [193]:
df_nn.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,0,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,1,9.0,None of them,0,...,1.6,71.67,27.99,0,0,1,1,"Yes, received Tdap",0,No
1,Alabama,1,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,1,6.0,None of them,0,...,1.78,95.25,30.13,0,0,1,1,"Yes, received tetanus shot but not sure what type",0,No
2,Alabama,1,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,0,8.0,"6 or more, but not all",0,...,1.85,108.86,31.66,1,0,0,1,"No, did not receive any tetanus shot in the pa...",0,Yes
3,Alabama,0,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,1,9.0,None of them,0,...,1.7,90.72,31.32,0,0,1,1,"No, did not receive any tetanus shot in the pa...",0,Yes
4,Alabama,0,Good,3.0,15.0,Within past year (anytime less than 12 months ...,1,5.0,1 to 5,0,...,1.55,79.38,33.07,0,0,1,1,"No, did not receive any tetanus shot in the pa...",0,No


In [194]:
df_nn[two_values_columns] = df_nn[two_values_columns].apply(pd.to_numeric)

In [195]:
df_nn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246022 non-null  object 
 1   Sex                        246022 non-null  int64  
 2   GeneralHealth              246022 non-null  object 
 3   PhysicalHealthDays         246022 non-null  float64
 4   MentalHealthDays           246022 non-null  float64
 5   LastCheckupTime            246022 non-null  object 
 6   PhysicalActivities         246022 non-null  int64  
 7   SleepHours                 246022 non-null  float64
 8   RemovedTeeth               246022 non-null  object 
 9   HadHeartAttack             246022 non-null  int64  
 10  HadAngina                  246022 non-null  int64  
 11  HadStroke                  246022 non-null  int64  
 12  HadAsthma                  246022 non-null  int64  
 13  HadSkinCancer              24

In [196]:
# dropping unwanted columns

df_cd = df_nn.drop(['RemovedTeeth','State', 'DifficultyWalking','DifficultyConcentrating','BlindOrVisionDifficulty','DifficultyDressingBathing','ECigaretteUsage','DeafOrHardOfHearing','HadSkinCancer','ChestScan','RaceEthnicityCategory','HIVTesting','PneumoVaxEver','HighRiskLastYear','CovidPos'],axis=1)

In [198]:
df_cd.head()

Unnamed: 0,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,HadStroke,...,HadDiabetes,DifficultyErrands,SmokerStatus,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,FluVaxLast12,TetanusLast10Tdap
0,0,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,1,9.0,0,0,0,...,No,0,Former smoker,Age 65 to 69,1.6,71.67,27.99,0,1,"Yes, received Tdap"
1,1,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,1,6.0,0,0,0,...,Yes,0,Former smoker,Age 70 to 74,1.78,95.25,30.13,0,1,"Yes, received tetanus shot but not sure what type"
2,1,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,0,8.0,0,0,0,...,No,0,Former smoker,Age 75 to 79,1.85,108.86,31.66,1,0,"No, did not receive any tetanus shot in the pa..."
3,0,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,1,9.0,0,0,0,...,No,0,Never smoked,Age 80 or older,1.7,90.72,31.32,0,1,"No, did not receive any tetanus shot in the pa..."
4,0,Good,3.0,15.0,Within past year (anytime less than 12 months ...,1,5.0,0,0,0,...,No,0,Never smoked,Age 80 or older,1.55,79.38,33.07,0,1,"No, did not receive any tetanus shot in the pa..."


In [199]:
df_cd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 25 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Sex                    246022 non-null  int64  
 1   GeneralHealth          246022 non-null  object 
 2   PhysicalHealthDays     246022 non-null  float64
 3   MentalHealthDays       246022 non-null  float64
 4   LastCheckupTime        246022 non-null  object 
 5   PhysicalActivities     246022 non-null  int64  
 6   SleepHours             246022 non-null  float64
 7   HadHeartAttack         246022 non-null  int64  
 8   HadAngina              246022 non-null  int64  
 9   HadStroke              246022 non-null  int64  
 10  HadAsthma              246022 non-null  int64  
 11  HadCOPD                246022 non-null  int64  
 12  HadDepressiveDisorder  246022 non-null  int64  
 13  HadKidneyDisease       246022 non-null  int64  
 14  HadArthritis           246022 non-nu

In [200]:
categorical_columns = df_cd.select_dtypes(include=['object']).columns.tolist()

In [201]:
categorical_columns

['GeneralHealth',
 'LastCheckupTime',
 'HadDiabetes',
 'SmokerStatus',
 'AgeCategory',
 'TetanusLast10Tdap']

In [202]:
from sklearn.preprocessing import OneHotEncoder

In [203]:
encoder = OneHotEncoder(handle_unknown = 'ignore')

In [204]:
one_hot_encoded = encoder.fit_transform(df_cd[categorical_columns]).toarray()

In [205]:
encoder.get_feature_names_out(categorical_columns)

array(['GeneralHealth_Excellent', 'GeneralHealth_Fair',
       'GeneralHealth_Good', 'GeneralHealth_Poor',
       'GeneralHealth_Very good', 'LastCheckupTime_5 or more years ago',
       'LastCheckupTime_Within past 2 years (1 year but less than 2 years ago)',
       'LastCheckupTime_Within past 5 years (2 years but less than 5 years ago)',
       'LastCheckupTime_Within past year (anytime less than 12 months ago)',
       'HadDiabetes_No',
       'HadDiabetes_No, pre-diabetes or borderline diabetes',
       'HadDiabetes_Yes',
       'HadDiabetes_Yes, but only during pregnancy (female)',
       'SmokerStatus_Current smoker - now smokes every day',
       'SmokerStatus_Current smoker - now smokes some days',
       'SmokerStatus_Former smoker', 'SmokerStatus_Never smoked',
       'AgeCategory_Age 18 to 24', 'AgeCategory_Age 25 to 29',
       'AgeCategory_Age 30 to 34', 'AgeCategory_Age 35 to 39',
       'AgeCategory_Age 40 to 44', 'AgeCategory_Age 45 to 49',
       'AgeCategory_Age 50 t

In [206]:
one_hot_df = pd.DataFrame(one_hot_encoded,columns=encoder.get_feature_names_out(categorical_columns))

In [207]:
one_hot_df.shape

(246022, 34)

In [208]:
for i in one_hot_df.columns:
    if i in df_cd.columns:
        print(i)

In [209]:
df_cd.drop(categorical_columns, axis=1).shape

(246022, 19)

In [210]:
df_encoded = pd.concat([df_cd.drop(categorical_columns, axis=1).reset_index(drop = True),one_hot_df], axis=1)

In [211]:
df_encoded.shape

(246022, 53)

In [212]:
df_encoded.head()

Unnamed: 0,Sex,PhysicalHealthDays,MentalHealthDays,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadCOPD,...,AgeCategory_Age 55 to 59,AgeCategory_Age 60 to 64,AgeCategory_Age 65 to 69,AgeCategory_Age 70 to 74,AgeCategory_Age 75 to 79,AgeCategory_Age 80 or older,"TetanusLast10Tdap_No, did not receive any tetanus shot in the past 10 years","TetanusLast10Tdap_Yes, received Tdap","TetanusLast10Tdap_Yes, received tetanus shot but not sure what type","TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap"
0,0,4.0,0.0,1,9.0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,0.0,0.0,1,6.0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0.0,0.0,0,8.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0,5.0,0.0,1,9.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0,3.0,15.0,1,5.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [213]:
df_encoded.to_csv('Data\cleaned_data1.csv',index=False)

In [214]:
df = df_encoded.sample(frac = 0.1)

In [215]:
df.head()

Unnamed: 0,Sex,PhysicalHealthDays,MentalHealthDays,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadCOPD,...,AgeCategory_Age 55 to 59,AgeCategory_Age 60 to 64,AgeCategory_Age 65 to 69,AgeCategory_Age 70 to 74,AgeCategory_Age 75 to 79,AgeCategory_Age 80 or older,"TetanusLast10Tdap_No, did not receive any tetanus shot in the past 10 years","TetanusLast10Tdap_Yes, received Tdap","TetanusLast10Tdap_Yes, received tetanus shot but not sure what type","TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap"
24559,1,14.0,5.0,1,8.0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
208996,0,1.0,0.0,0,8.0,0,0,0,1,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
187878,0,0.0,15.0,1,6.0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
58954,1,5.0,0.0,1,6.0,0,0,0,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
54675,0,2.0,15.0,1,6.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [216]:
X = df.drop(['HadHeartAttack'],axis=1)

In [217]:
X.shape

(24602, 52)

In [218]:
y = df['HadHeartAttack']

In [219]:
X_train, X_test, y_train, y_test = train_test_split(X,y , 
                                   random_state=98,  
                                   test_size=0.20,  
                                   shuffle=True) 

In [220]:
from sklearn.svm import SVC


In [221]:
X_train.head()

Unnamed: 0,Sex,PhysicalHealthDays,MentalHealthDays,PhysicalActivities,SleepHours,HadAngina,HadStroke,HadAsthma,HadCOPD,HadDepressiveDisorder,...,AgeCategory_Age 55 to 59,AgeCategory_Age 60 to 64,AgeCategory_Age 65 to 69,AgeCategory_Age 70 to 74,AgeCategory_Age 75 to 79,AgeCategory_Age 80 or older,"TetanusLast10Tdap_No, did not receive any tetanus shot in the past 10 years","TetanusLast10Tdap_Yes, received Tdap","TetanusLast10Tdap_Yes, received tetanus shot but not sure what type","TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap"
147551,1,4.0,1.0,1,7.0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
73570,1,30.0,15.0,1,5.0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
97371,0,0.0,2.0,1,7.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
214118,0,0.0,0.0,1,7.0,0,0,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
165976,0,14.0,0.0,1,8.0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [222]:
len(X_test)

4921

In [223]:
model = SVC(kernel="rbf", gamma=0.5, C=1.0)

In [224]:
print(model)

SVC(gamma=0.5)


In [225]:
model.fit(X_train,y_train)

In [226]:
model.score(X_test,y_test)

0.9435074171916277

In [227]:
y_pred = model.predict(X_test)

In [228]:
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
metrics.accuracy_score(y_test, y_pred)

0.9435074171916277

In [229]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [230]:
from sklearn.metrics import confusion_matrix

In [231]:
cm = confusion_matrix(y_test,y_pred)

In [232]:
cm

array([[4643,    0],
       [ 278,    0]], dtype=int64)

In [236]:
input = """
1	5	0	5	0	0	0	0	0	1	0	1.88	115.67	32.74	0	1	0	0	1	0	0	0	0	0	1	0	1	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0
"""

lst = input.split()

In [237]:
lstint = [float(x) for x in lst]

In [238]:
 p = model.predict(np.reshape(lstint,(1,-1)))



In [239]:
p

array([0], dtype=int64)