In [60]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, ConfusionMatrixDisplay

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from collections import Counter
from xgboost import XGBClassifier as xgb

# Load the raw dataset. The path is relative, so the CSV is looked up in the
# kernel's current working directory.
df = pd.read_csv('covid_19_data.csv')

### Encoding

In [61]:
def one_hot_encode(data, columns):
    """Expand the listed columns into indicator (dummy) columns.

    Delegates to ``pd.get_dummies``; the result is a new DataFrame and the
    frame passed in is not modified.
    """
    encoded = pd.get_dummies(data, columns=columns)
    return encoded

def label_encode(data, columns):
    """Swap every listed column for the integer codes of a fresh sklearn LabelEncoder.

    One encoder is created per column. The frame is modified in place and the
    same object is returned.
    """
    for name in columns:
        data[name] = LabelEncoder().fit_transform(data[name])
    return data

def frequency_encode(data, columns):
    """Frequency encodes specified columns in the dataframe.

    Every value is replaced by its relative frequency: the number of rows
    holding that value divided by the total number of rows in ``data``.
    Missing values (NaN/None) are treated as a category of their own and
    receive the share of missing rows, for every dtype.

    Args:
        data: DataFrame to encode. It is modified in place.
        columns: Iterable of column names to encode.

    Returns:
        The same DataFrame object, with the listed columns replaced by floats.
    """
    total_rows = len(data)
    for col in columns:
        # dropna=False keeps NaN in the lookup table; without it the encoding
        # of missing values would depend on the column's dtype.
        frequencies = data[col].value_counts(dropna=False) / total_rows
        data[col] = data[col].map(frequencies)
    return data

def target_encode(data, target_column, columns):
    """Replace each listed categorical column with the per-category mean of the target.

    For every column, rows are grouped by that column's value, the mean of
    ``target_column`` is taken per group, and each cell is swapped for the mean
    of its group. The frame is modified in place and returned.
    """
    for feature in columns:
        # Lookup table: category value -> average target within that category.
        per_category_mean = data.groupby(feature)[target_column].mean()
        data[feature] = data[feature].map(per_category_mean)
    return data

### Imputation

In [62]:
# df['AL|ATFC|Year'].ffill(inplace=True)
# df['AL|ATFC|Year'].bfill(inplace=True)

def impute_mean(data, columns):
    """Fill missing values in the given columns with each column's mean.

    Args:
        data: DataFrame to impute. It is modified in place.
        columns: List of column names, or a single column name as a string.

    Returns:
        The same DataFrame object with the listed columns imputed.
    """
    # The imputer needs 2-D input; a bare string would select a 1-D Series,
    # so a single name is wrapped into a one-element list.
    if isinstance(columns, str):
        columns = [columns]
    imputer = SimpleImputer(strategy="mean")
    data[columns] = imputer.fit_transform(data[columns])
    return data

def impute_median(data, columns):
    """Fill missing values in the given columns with each column's median.

    Args:
        data: DataFrame to impute. It is modified in place.
        columns: List of column names, or a single column name as a string.

    Returns:
        The same DataFrame object with the listed columns imputed.
    """
    # The imputer needs 2-D input; a bare string would select a 1-D Series,
    # so a single name is wrapped into a one-element list.
    if isinstance(columns, str):
        columns = [columns]
    imputer = SimpleImputer(strategy="median")
    data[columns] = imputer.fit_transform(data[columns])
    return data

def impute_mode(data, columns):
    """Fill missing values in the given columns with each column's most frequent value.

    Args:
        data: DataFrame to impute. It is modified in place.
        columns: List of column names, or a single column name as a string.

    Returns:
        The same DataFrame object with the listed columns imputed.
    """
    # The imputer needs 2-D input; a bare string would select a 1-D Series,
    # so a single name is wrapped into a one-element list.
    if isinstance(columns, str):
        columns = [columns]
    imputer = SimpleImputer(strategy="most_frequent")
    data[columns] = imputer.fit_transform(data[columns])
    return data

def impute_knn(data, columns, n_neighbors=5):
    """Fill missing values in the given columns with sklearn's KNNImputer.

    Args:
        data: DataFrame to impute. It is modified in place.
        columns: List of column names, or a single column name as a string.
            Only these columns are handed to the imputer.
        n_neighbors: Passed through to ``KNNImputer``.

    Returns:
        The same DataFrame object with the listed columns imputed.
    """
    # The imputer needs 2-D input; a bare string would select a 1-D Series,
    # so a single name is wrapped into a one-element list.
    if isinstance(columns, str):
        columns = [columns]
    imputer = KNNImputer(n_neighbors=n_neighbors)
    data[columns] = imputer.fit_transform(data[columns])
    return data

def impute_random_forest(data, columns, random_state=None):
    """Fill missing values of one column with random-forest predictions.

    ``columns`` lists the predictor columns followed by the column to impute:
    every entry but the last is a feature, the last entry is the target. The
    model is trained on the rows where the target is present and used to
    predict only the rows where it is missing; feature columns and observed
    target values are left untouched.

    Args:
        data: DataFrame to impute. It is modified in place.
        columns: Sequence of column names, features first, target last.
        random_state: Forwarded to ``RandomForestRegressor``; set it for
            reproducible imputations.

    Returns:
        The same DataFrame object.

    Raises:
        ValueError: If ``columns`` does not hold at least one feature plus
            the target.
    """
    if len(columns) < 2:
        raise ValueError(
            "columns must list at least one feature column followed by the target column"
        )
    feature_cols = list(columns[:-1])
    target_col = columns[-1]

    missing = data[target_col].isnull()
    # Nothing to fill, or no observed target values to learn from.
    if not missing.any() or missing.all():
        return data

    imputer = RandomForestRegressor(random_state=random_state)
    # Train only on rows with a known target; the regressor cannot fit NaN labels.
    imputer.fit(data.loc[~missing, feature_cols], data.loc[~missing, target_col])
    # Predict from the feature columns only and write back just the gaps.
    data.loc[missing, target_col] = imputer.predict(data.loc[missing, feature_cols])
    return data

def impute_regression(data, columns):
    """Fill missing values in each listed column with linear-regression predictions.

    For every column in ``columns`` a ``LinearRegression`` is fitted on the rows
    where that column is present, using all other columns of ``data`` as
    features, and the missing cells are replaced by the model's predictions.
    Columns with no missing values, or with no observed values at all, are
    skipped.

    NOTE(review): the feature columns are passed to the model as-is, so they
    presumably need to be numeric and free of missing values -- confirm
    against the frames this is called with.

    Args:
        data: DataFrame to impute. It is modified in place.
        columns: Iterable of column names to impute.

    Returns:
        The same DataFrame object.
    """
    for col in columns:
        missing = data[col].isnull()
        # Nothing to predict, or nothing to train on: leave the column alone.
        if not missing.any() or missing.all():
            continue
        features = [c for c in data.columns if c != col]
        model = LinearRegression()
        # Fit on observed rows only; a NaN target cannot be used for training.
        model.fit(data.loc[~missing, features], data.loc[~missing, col])
        data.loc[missing, col] = model.predict(data.loc[missing, features])
    return data


### Get Info

In [63]:
# Summarise the raw frame: overall shape first, then one line per column with
# its dtype, missing-value count/percentage and number of distinct values.
# The per-column facts are also stored in `properties` for later cells.
column_names_list = df.columns.tolist()
rows, cols = df.shape
# "Weighted" size: every cell counted twice.
# NOTE(review): the rationale for the factor 2 is not visible here -- confirm.
size = rows * (cols*2)
properties = {}
print(f'{rows} rows, {cols} columns, {size} size (weighted)\n')

print('Pre-Transformation:\n')
for column in column_names_list:
    series = df[column]

    null_count = int(series.isna().sum())
    # Complete columns keep a plain integer 0 so the report prints "0%".
    null_percentage = round((null_count / rows) * 100, 2) if null_count > 0 else 0

    unique = series.nunique()
    # Named `dtype` rather than `type` so the builtin `type` is not shadowed
    # in the kernel namespace for every later cell.
    dtype = series.dtype

    properties[column] = {
        'type': dtype,
        'null_count': null_count,
        'null_percentage': null_percentage,
        'unique': unique
    }

    print(f'{column}: {dtype}, {null_count} null ({null_percentage}%), {unique} unique')


10671 rows, 8 columns, 170736 size (weighted)

Pre-Transformation:

SNo: int64, 0 null (0%), 10671 unique
ObservationDate: object, 0 null (0%), 70 unique
Province/State: object, 4956 null (46.44%), 293 unique
Country/Region: object, 0 null (0%), 215 unique
Last Update: object, 0 null (0%), 1814 unique
Confirmed: float64, 0 null (0%), 1541 unique
Deaths: float64, 0 null (0%), 330 unique
Recovered: float64, 0 null (0%), 745 unique


### Prep for ML

In [64]:
# Build the modelling frame from a copy of the raw data, one column at a time,
# driven by the per-column facts collected in `properties`.
MAX_NULL_PERCENTAGE = 50      # columns with more missing values than this are dropped
MAX_ONE_HOT_CARDINALITY = 10  # columns with fewer distinct values than this are one-hot encoded

new_df = df.copy()
print('Post-Transformation:\n')
for column in column_names_list:
    column_info = properties[column]

    # Drop columns with lots of missing values. The drop is applied to new_df
    # (not df) so earlier transformations survive, and the remaining steps are
    # skipped because the column no longer exists.
    if column_info['null_percentage'] > MAX_NULL_PERCENTAGE:
        new_df = new_df.drop(columns=column)
        continue

    # Convert all numerical types to float. astype returns a new Series, so
    # the result has to be assigned back.
    if column_info['type'] == 'int64':
        new_df[column] = new_df[column].astype(float)

    # Imputation. The helpers are given a one-element list so that a 2-D
    # selection reaches the sklearn imputer.
    if column_info['null_count'] > 0:
        if column_info['type'] == 'object':
            new_df = impute_mode(new_df, [column])
        else:
            new_df = impute_mean(new_df, [column])

    # Encoding
    if column_info['unique'] < MAX_ONE_HOT_CARDINALITY:
        new_df = one_hot_encode(new_df, [column])

print(new_df.columns)

Post-Transformation:

Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
       'Last Update', 'Confirmed', 'Deaths', 'Recovered'],
      dtype='object')
