In [1]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pt
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# load the raw training data and preview its first record
raw_path = "raw_train_data.csv"
df = pd.read_csv(raw_path)
df.head(n=1)

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed,OxygenTanks,PopulationDensityPerSqKm
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1,8262,867


In [3]:
# normalise every column name to lowercase
df.columns = [name.lower() for name in df.columns]
df.head(n=1)

Unnamed: 0,sno,date,time,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1,8262,867


In [4]:
# report every column whose values include the "-" placeholder,
# which this dataset uses where a count is missing
dash_columns = [name for name in df.columns if "-" in df[name].unique()]
for name in dash_columns:
    print(name)

confirmedindiannational
confirmedforeignnational


In [5]:
# handle missing values
# The two "confirmed...national" columns use "-" where a count is missing.
placeholder_cols = ["confirmedindiannational", "confirmedforeignnational"]

for col in placeholder_cols:
    # Replace '-' with NaN.  Use np.nan: the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    with_nan = df[col].replace("-", np.nan)

    # Convert column to numeric (still raises on any other non-numeric text)
    numeric = pd.to_numeric(with_nan)

    # Fill missing values with the mean of the observed values
    df[col] = numeric.fillna(numeric.mean())

# find unique values to check that no "-" placeholder is left anywhere
for i in df.columns:
    if "-" in df[i].unique():
        print(f"{i}")

In [6]:
# parse the date column, then derive calendar features from it
df["date"] = pd.to_datetime(df["date"])
date_parts = df["date"].dt
df["day"] = date_parts.day
df["month"] = date_parts.month
df["year"] = date_parts.year
df["week"] = date_parts.isocalendar().week  # ISO week number
df["weekday_number"] = date_parts.weekday

# processing time
def process_time(df):
    """Split a row's "H:MM AM/PM" time string into numeric features.

    Meant to be used as ``frame.apply(process_time, axis=1)``, so despite
    its name ``df`` is a single row (a pandas Series) holding a "time" entry.

    Returns a copy of the row with three extra entries:
      - "hour":   int, the hour exactly as written (12-hour clock)
      - "minute": int
      - "am_pm":  0 when the suffix is "AM", 1 otherwise

    Raises IndexError if the string lacks the space or the colon, and
    ValueError if the hour or minute is not an integer.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply().
    row = df.copy()

    total = row["time"].split(" ")
    clock = total[0]
    am_pm = total[1]

    # Store numbers, not text ("00" -> 0), so the columns stay numeric.
    clock_fields = clock.split(":")
    row["hour"] = int(clock_fields[0])
    row["minute"] = int(clock_fields[1])
    row["am_pm"] = 0 if am_pm == "AM" else 1

    return row

# run process_time on each row to add the hour / minute / am_pm columns
df = df.apply(process_time, axis="columns")

df.head(n=1)

Unnamed: 0,sno,date,time,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm,day,month,year,week,weekday_number,hour,minute,am_pm
0,1,2020-01-30,6:00 PM,Kerala,1.0,0.0,0,0,1,8262,867,30,1,2020,5,3,6,0,1


In [7]:
# discard the serial number and the raw date/time columns
# (the date and time features were extracted into their own columns above)
unwanted = ["sno", "date", "time"]
df = df.drop(columns=unwanted)
df.head(n=1)

Unnamed: 0,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm,day,month,year,week,weekday_number,hour,minute,am_pm
0,Kerala,1.0,0.0,0,0,1,8262,867,30,1,2020,5,3,6,0,1


In [8]:
# label-encode the state / union-territory names as integer codes
le = LabelEncoder()
state_col = "state/unionterritory"
le.fit(df[state_col])
df[state_col] = le.transform(df[state_col])
df.head(n=1)

Unnamed: 0,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm,day,month,year,week,weekday_number,hour,minute,am_pm
0,18,1.0,0.0,0,0,1,8262,867,30,1,2020,5,3,6,0,1


In [9]:
# standardization
ss = StandardScaler()
cols = ["confirmedindiannational", "confirmedforeignnational", "cured", "deaths", "oxygentanks", "confirmed"]

# Fit the scaler once on all columns.  StandardScaler standardises each
# column independently, so the values match a per-column fit, but `ss` now
# keeps the mean/scale of every column (not just the last one fitted) and
# can be reused to transform other data or to inverse-transform.
df[cols] = ss.fit_transform(df[cols])

df.head(3)

Unnamed: 0,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm,day,month,year,week,weekday_number,hour,minute,am_pm
0,18,-3.009879,-2.427941,-0.480563,-0.378187,-0.475776,1.121687,867,30,1,2020,5,3,6,0,1
1,18,-3.009879,-2.427941,-0.480563,-0.378187,-0.475776,1.423894,61,31,1,2020,5,4,6,0,1
2,18,-2.74086,-2.427941,-0.480563,-0.378187,-0.475774,-0.412169,316,1,2,2020,5,5,6,0,1


In [10]:
# drop every row whose standardized value in any of these columns
# is not strictly between -3 and 3
cols = ["confirmedindiannational", "confirmedforeignnational", "cured", "deaths", "oxygentanks", "confirmed"]
z_scores = df[cols]
within_bounds = ((z_scores < 3) & (z_scores > -3)).all(axis=1)
df = df[within_bounds]

In [11]:
# write the preprocessed frame to disk without the index column
# (mode "w" overwrites any existing file)
output_path = "preprocessed_train_data.csv"
df.to_csv(output_path, mode="w", index=False)