In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the vaccination CSV (path is relative to the notebook's working
# directory) into the DataFrame that every later cell operates on.
data = pd.read_csv(filepath_or_buffer='country_vaccinations.csv')

In [None]:
# Data Preprocessing
# Discard rows that are exact repeats of an earlier row, keeping the first
# occurrence. Surviving rows keep their original index labels.
data = data.drop_duplicates(keep='first')

In [None]:
# Handle missing values: fill gaps in numerical columns with that column's mean.
# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 raises a TypeError as soon as the frame contains a
# non-numeric (e.g. text) column, which a raw CSV may well do.
# Non-numeric columns have no entry in `numeric_means`, so their missing
# values are left untouched by fillna.
numeric_means = data.mean(numeric_only=True)
# Reassign instead of inplace=True so re-running the cell is predictable.
data = data.fillna(numeric_means)

In [None]:
# Feature Selection
# Columns to keep for modelling. These are placeholder names: replace them
# with the real column names from the loaded dataset.
selected_features = [
    'feature1',
    'feature2',
    'categorical_feature',
    'target_column',
]

In [None]:
# Verify that every selected column exists in the dataset before subsetting.
# Collecting the absent names (instead of a bare yes/no check) lets the
# message say exactly which entries of selected_features need fixing.
missing_features = [feature for feature in selected_features if feature not in data.columns]
if not missing_features:
    # All requested columns are present: narrow the frame to just those.
    data = data[selected_features]
else:
    # `data` is left unchanged here; later cells that index the selected
    # columns will fail until the names are corrected.
    print("Selected columns do not exist in the dataset. Please verify column names.")
    print("Missing columns:", missing_features)

In [None]:
# Encoding categorical variables (using One-Hot Encoding)
# drop='first' omits the first category's indicator column.
# The encoder is left on its default sparse output and densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so not passing either keeps this
# cell working across versions.
encoder = OneHotEncoder(drop='first')
encoded_categorical_features = encoder.fit_transform(data[['categorical_feature']]).toarray()
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
encoded_categorical_feature_names = encoder.get_feature_names_out(['categorical_feature'])

# Build the indicator frame on data's own index. pd.concat(axis=1) aligns
# rows by index label, and `data` keeps its original labels after
# drop_duplicates(); a fresh 0..n-1 index here would misalign the rows and
# introduce NaN-filled rows wherever the labels differ.
encoded_categorical_frame = pd.DataFrame(
    encoded_categorical_features,
    columns=encoded_categorical_feature_names,
    index=data.index,
)

# Replace the raw categorical column with its indicator columns.
data_encoded = pd.concat(
    [data.drop(columns=['categorical_feature']), encoded_categorical_frame],
    axis=1,
)


In [None]:
# Split data into training, validation, and test sets
# Step 1: hold out 20% of the rows; the remaining 80% is the training set.
train_data, holdout_data = train_test_split(data_encoded, random_state=42, test_size=0.2)
# Step 2: cut the holdout in half, giving validation and test sets of
# roughly 10% of the rows each.
validation_data, test_data = train_test_split(holdout_data, random_state=42, test_size=0.5)

In [None]:
# Scaling numerical features (example: StandardScaler)
# Single source of truth for the columns to scale, so the three splits
# cannot drift apart.
numerical_features = ['feature1', 'feature2']

# Work on explicit copies: the split frames are derived from data_encoded,
# and assigning columns into such derived frames can emit
# SettingWithCopyWarning on some pandas / scikit-learn combinations.
train_data = train_data.copy()
validation_data = validation_data.copy()
test_data = test_data.copy()

scaler = StandardScaler()
# Fit on the training split only; validation and test are transformed with
# the training statistics so no information from them leaks into the scaler.
train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])
validation_data[numerical_features] = scaler.transform(validation_data[numerical_features])
test_data[numerical_features] = scaler.transform(test_data[numerical_features])