In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [19]:
pip install -U scikit-learn




In [20]:
data = pd.read_csv('training_set_features.csv')
data_2 = pd.read_csv('training_set_labels.csv')
test = pd.read_csv('test_set_features.csv')

In [21]:
resident_id = data['respondent_id']
xyz_concern = data['xyz_concern']

In [22]:
print(data.head())
print(data.info())
print(data.describe())
print(data_2.head())
print(data_2.info())
print(data_2.describe())

   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [23]:
data.fillna(0, inplace=True)

In [24]:
y_xyz = data_2['xyz_vaccine']
y_seasonal = data_2['seasonal_vaccine']

In [25]:
X = data.drop(['age_group', 'education', 'race', 'sex', 'employment_industry', 'employment_occupation'], axis=1)

In [26]:
def convert_to_numeric(income_poverty):
  
  try:
    if isinstance(race.iloc[0], str):
      for i in range(len(income_poverty)):
        try:
          numeric_part = float(col.iloc[i].split()[0].strip("<"))  
          income_poverty.iloc[i] = numeric_part
        except ValueError:
          pass  

    return pd.to_numeric(income_poverty, errors='coerce')  
  except:
    return None  
X['income_poverty'] = convert_to_numeric(X['income_poverty'])

# print(X)


In [27]:

def convert_to_numeric(marital_status):
  
  try:
    if isinstance(marital_status.iloc[0], str):
      for i in range(len(marital_status)):
        try:
          numeric_part = float(col.iloc[i].split()[0].strip("<"))
          marital_status.iloc[i] = numeric_part
        except ValueError:
          pass  

    return pd.to_numeric(marital_status, errors='coerce')  
  except:
    return None 
X['marital_status'] = convert_to_numeric(X['marital_status'])

# print(X)

In [28]:
def convert_to_numeric(rent_or_own):
 
  try:
    if isinstance(rent_or_own.iloc[0], str):
      for i in range(len(rent_or_own)):
        try:
          numeric_part = float(col.iloc[i].split()[0].strip("<"))  
          rent_or_own.iloc[i] = numeric_part
        except ValueError:
          pass  

    return pd.to_numeric(rent_or_own, errors='coerce')  
  except:
    return None  


X['rent_or_own'] = convert_to_numeric(X['rent_or_own'])

In [29]:
def convert_to_numeric(employment_status):
 
  try:
    
    if isinstance(employment_status.iloc[0], str):
      for i in range(len(employment_status)):
        try:
          numeric_part = float(col.iloc[i].split()[0].strip("<"))  
          employment_status.iloc[i] = numeric_part
        except ValueError:
          pass  

    return pd.to_numeric(employment_status, errors='coerce')  
  except:
    return None  


X['employment_status'] = convert_to_numeric(X['employment_status'])
# print(X)

In [30]:
def convert_to_numeric(hhs_geo_region):
  try:
    
    if isinstance(hhs_geo_region.iloc[0], str):
      for i in range(len(hhs_geo_region)):
        try:
          numeric_part = float(col.iloc[i].split()[0].strip("<"))  
          hhs_geo_region.iloc[i] = numeric_part
        except ValueError:
          pass  

    return pd.to_numeric(hhs_geo_region, errors='coerce') 
  except:
    return None  

X['hhs_geo_region'] = convert_to_numeric(X['hhs_geo_region'])

In [31]:
def convert_to_numeric(census_msa):
 
  try:
   
    if isinstance(census_msa.iloc[0], str):
      for i in range(len(census_msa)):
        try:
          numeric_part = float(col.iloc[i].split()[0].strip("<"))  
          census_msa.iloc[i] = numeric_part
        except ValueError:
          pass  

    return pd.to_numeric(census_msa, errors='coerce')  
  except:
    return None  


X['census_msa'] = convert_to_numeric(X['census_msa'])

In [32]:

X_train, X_test, y_xyz_train, y_xyz_test = train_test_split(X, y_xyz, test_size=0.2, random_state=42)
X_train, X_test, y_seasonal_train, y_seasonal_test = train_test_split(X, y_seasonal, test_size=0.2, random_state=42)

In [33]:
print(X)

       respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0                  0          1.0            0.0                        0.0   
1                  1          3.0            2.0                        0.0   
2                  2          1.0            1.0                        0.0   
3                  3          1.0            1.0                        0.0   
4                  4          2.0            1.0                        0.0   
...              ...          ...            ...                        ...   
26702          26702          2.0            0.0                        0.0   
26703          26703          1.0            2.0                        0.0   
26704          26704          2.0            2.0                        0.0   
26705          26705          1.0            1.0                        0.0   
26706          26706          0.0            0.0                        0.0   

       behavioral_avoidance  behavioral_face_mask  

In [34]:
#training of xyz vaccine
model_xyz = RandomForestClassifier(random_state=42)
model_xyz.fit(X_train, y_xyz_train)
#training of seasonal vaccine 
model_seasonal = RandomForestClassifier(random_state=42)
model_seasonal.fit(X_train, y_seasonal_train)
#evaluation
y_xyz_pred = model_xyz.predict(X_test)
y_seasonal_pred = model_seasonal.predict(X_test)
#Xyz and seasonal vaccine prediction
print("XYZ vaccine prediction accuracy: ", accuracy_score(y_xyz_test, y_xyz_pred))
print("Seasonal vaccine prediction accuracy: ", accuracy_score(y_seasonal_test, y_seasonal_pred))
#Classification 
print("XYZ vaccine classification report:\n", classification_report(y_xyz_test, y_xyz_pred))
print("Seasonal vaccine classification report:\n", classification_report(y_seasonal_test, y_seasonal_pred))
#confusion matrix
print("XYZ vaccine confusion matrix:\n", confusion_matrix(y_xyz_test, y_xyz_pred))
print("Seasonal vaccine confusion matrix:\n", confusion_matrix(y_seasonal_test, y_seasonal_pred))

XYZ vaccine prediction accuracy:  0.8457506551853239
Seasonal vaccine prediction accuracy:  0.7635716959940098
XYZ vaccine classification report:
               precision    recall  f1-score   support

           0       0.86      0.95      0.91      4212
           1       0.72      0.44      0.55      1130

    accuracy                           0.85      5342
   macro avg       0.79      0.70      0.73      5342
weighted avg       0.83      0.85      0.83      5342

Seasonal vaccine classification report:
               precision    recall  f1-score   support

           0       0.77      0.79      0.78      2891
           1       0.75      0.73      0.74      2451

    accuracy                           0.76      5342
   macro avg       0.76      0.76      0.76      5342
weighted avg       0.76      0.76      0.76      5342

XYZ vaccine confusion matrix:
 [[4021  191]
 [ 633  497]]
Seasonal vaccine confusion matrix:
 [[2297  594]
 [ 669 1782]]
