In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [4]:
def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None,
              encoder_kwargs=None):
    """Impute missing values and encode text columns of a DataFrame.

    Each column is filled with its median; when the median cannot be
    computed (a ``TypeError``, e.g. for text columns) the most frequent value
    is used instead, and if the column has no mode at all a backward fill
    followed by a forward fill is applied. Columns whose dtype is ``object``
    or a pandas string dtype are then replaced by the output of an encoder
    fitted on that column.

    Parameters
    ----------
    input_dataframe : pandas.DataFrame
        The frame to clean.
    drop_nans : bool, default False
        If True, rows containing any missing value are dropped before the
        imputation step.
    copy : bool, default False
        If True, work on a copy and leave the caller's frame untouched.
        If False, the caller's frame is modified and returned.
    encoder : class or None, default None
        Called as ``encoder(**encoder_kwargs)``; the resulting object must
        provide ``fit(values)`` (returning the fitted encoder) and
        ``transform(values)``. When None, ``LabelEncoder`` is used.
    encoder_kwargs : dict or None, default None
        Keyword arguments forwarded to ``encoder``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (the same object as ``input_dataframe`` unless
        ``copy=True``).
    """
    if copy:
        input_dataframe = input_dataframe.copy()

    if drop_nans:
        input_dataframe.dropna(inplace=True)

    if encoder_kwargs is None:
        encoder_kwargs = {}

    for column in input_dataframe.columns.values:
        series = input_dataframe[column]
        try:
            filled = series.fillna(series.median())
        except TypeError:
            most_frequent = series.mode()
            if len(most_frequent) > 0:
                filled = series.fillna(most_frequent.iloc[0])
            else:
                filled = series.bfill().ffill()

        # Assign the filled column back explicitly. Calling
        # ``frame[column].fillna(..., inplace=True)`` only touches a temporary
        # Series and leaves the frame unchanged under pandas Copy-on-Write.
        input_dataframe[column] = filled

        column_dtype = filled.dtype
        # Text columns are ``object`` in older pandas and a StringDtype when
        # the dedicated string dtype is in use; both need encoding.
        if column_dtype == object or isinstance(column_dtype, pd.StringDtype):
            values = filled.to_numpy()
            if encoder is not None:
                column_encoder = encoder(**encoder_kwargs).fit(values)
            else:
                column_encoder = LabelEncoder().fit(values)

            input_dataframe[column] = column_encoder.transform(values)

    return input_dataframe


In [5]:
def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False,
                 encoder=None, encoder_kwargs=None):
    """Clean a training/testing pair using statistics from the training set.

    Missing values in both frames are filled with the training column's
    median; when the median cannot be computed (a ``TypeError``, e.g. for
    text columns) the training column's most frequent value is used. If the
    training column has no mode, each frame is backward- then forward-filled
    instead. Columns whose training dtype is ``object`` or a pandas string
    dtype are replaced in both frames by the output of an encoder fitted on
    the training column only.

    Parameters
    ----------
    training_dataframe : pandas.DataFrame
        Frame used to compute the fill values and to fit the encoders.
    testing_dataframe : pandas.DataFrame
        Frame cleaned with the values/encoders derived from the training
        frame. Must have the same set of column names.
    drop_nans : bool, default False
        If True, rows containing any missing value are dropped from both
        frames before the imputation step.
    copy : bool, default False
        If True, work on copies and leave the caller's frames untouched.
        If False, the caller's frames are modified and returned.
    encoder : class or None, default None
        Called as ``encoder(**encoder_kwargs)``; the resulting object must
        provide ``fit(values)`` (returning the fitted encoder) and
        ``transform(values)``. When None, ``LabelEncoder`` is used.
    encoder_kwargs : dict or None, default None
        Keyword arguments forwarded to ``encoder``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(training_dataframe, testing_dataframe)`` after cleaning.

    Raises
    ------
    ValueError
        If the two frames do not have the same set of column names.
    """
    if set(training_dataframe.columns.values) != set(testing_dataframe.columns.values):
        raise ValueError('The training and testing DataFrames do not have the same columns. '
                         'Make sure that you are providing the same columns.')

    if copy:
        training_dataframe = training_dataframe.copy()
        testing_dataframe = testing_dataframe.copy()

    if drop_nans:
        training_dataframe.dropna(inplace=True)
        testing_dataframe.dropna(inplace=True)

    if encoder_kwargs is None:
        encoder_kwargs = {}

    for column in training_dataframe.columns.values:
        train_series = training_dataframe[column]
        test_series = testing_dataframe[column]
        try:
            column_median = train_series.median()
            train_filled = train_series.fillna(column_median)
            test_filled = test_series.fillna(column_median)
        except TypeError:
            train_modes = train_series.mode()
            if len(train_modes) > 0:
                column_mode = train_modes.iloc[0]
                train_filled = train_series.fillna(column_mode)
                test_filled = test_series.fillna(column_mode)
            else:
                # No mode exists for the training column; fall back to
                # propagating neighbouring values, as autoclean does.
                train_filled = train_series.bfill().ffill()
                test_filled = test_series.bfill().ffill()

        # Assign the filled columns back explicitly. Calling
        # ``frame[column].fillna(..., inplace=True)`` only touches a temporary
        # Series and leaves the frame unchanged under pandas Copy-on-Write.
        training_dataframe[column] = train_filled
        testing_dataframe[column] = test_filled

        train_dtype = train_filled.dtype
        # Text columns are ``object`` in older pandas and a StringDtype when
        # the dedicated string dtype is in use; both need encoding.
        if train_dtype == object or isinstance(train_dtype, pd.StringDtype):
            train_values = train_filled.to_numpy()
            if encoder is not None:
                column_encoder = encoder(**encoder_kwargs).fit(train_values)
            else:
                column_encoder = LabelEncoder().fit(train_values)

            training_dataframe[column] = column_encoder.transform(train_values)
            testing_dataframe[column] = column_encoder.transform(test_filled.to_numpy())

    return training_dataframe, testing_dataframe

In [6]:
# Load the OWID COVID-19 dataset. A forward slash is used as the separator:
# the original '\o' was an invalid escape sequence and the backslash form only
# resolved on Windows, whereas '/' works on every platform.
df = pd.read_csv('datasets/owid-covid-data.csv')
df.head()


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,


In [7]:
# Count the missing values in each column of df before autoclean is applied.
df.isna().sum()

iso_code                                        0
continent                                   16376
location                                        0
date                                            0
total_cases                                 37815
                                            ...  
population                                      0
excess_mortality_cumulative_absolute       332835
excess_mortality_cumulative                332835
excess_mortality                           332835
excess_mortality_cumulative_per_million    332835
Length: 67, dtype: int64

In [8]:
# autoclean is called with its defaults (copy=False), so it operates on df
# itself rather than on a copy and returns that same frame object.
cleaned_dataframe = autoclean(df)

print(cleaned_dataframe.head())

   iso_code  continent  location  date  total_cases  new_cases  \
0         1          1         0     2      68890.0        0.0   
1         1          1         0     3      68890.0        0.0   
2         1          1         0     4      68890.0        0.0   
3         1          1         0     5      68890.0        0.0   
4         1          1         0     6      68890.0        0.0   

   new_cases_smoothed  total_deaths  new_deaths  new_deaths_smoothed  ...  \
0                26.0        1302.0         0.0                0.143  ...   
1                26.0        1302.0         0.0                0.143  ...   
2                26.0        1302.0         0.0                0.143  ...   
3                26.0        1302.0         0.0                0.143  ...   
4                26.0        1302.0         0.0                0.143  ...   

   male_smokers  handwashing_facilities  hospital_beds_per_thousand  \
0          33.1                  37.746                         0.5  

In [9]:
# Re-count missing values on df: autoclean was given df with copy=False above,
# so this inspects the frame that the cleaning step worked on.
df.isna().sum()

iso_code                                   0
continent                                  0
location                                   0
date                                       0
total_cases                                0
                                          ..
population                                 0
excess_mortality_cumulative_absolute       0
excess_mortality_cumulative                0
excess_mortality                           0
excess_mortality_cumulative_per_million    0
Length: 67, dtype: int64

In [10]:
# Reload a fresh copy of the raw dataset. A forward slash is used as the
# separator: the original '\o' was an invalid escape sequence and the
# backslash form only resolved on Windows, whereas '/' works on every platform.
df1 = pd.read_csv('datasets/owid-covid-data.csv')
df1.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
