In [35]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pt
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [36]:
# Load the raw test dataset from disk and preview its first row.
raw_csv_path = "../csv/raw_test_data.csv"
df = pd.read_csv(raw_csv_path)
df.head(1)

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Confirmed,OxygenTanks,PopulationDensityPerSqKm
0,15001,2021-05-17,8:00 AM,Meghalaya,-,-,18478,23332,191,514


In [37]:
# Normalise every column label to lowercase so later cells can refer to
# columns without worrying about the capitalisation used in the raw file.
df.columns = [label.lower() for label in df.columns]
df.head(1)

Unnamed: 0,sno,date,time,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,confirmed,oxygentanks,populationdensitypersqkm
0,15001,2021-05-17,8:00 AM,Meghalaya,-,-,18478,23332,191,514


In [38]:
# Parse the date strings into datetimes, then derive calendar features.
df["date"] = pd.to_datetime(df["date"])

# Plain datetime attributes that map one-to-one onto a new column.
for feature in ("day", "month", "year"):
    df[feature] = getattr(df["date"].dt, feature)

# ISO week number and weekday index (as reported by the .dt accessor).
df["week"] = df["date"].dt.isocalendar().week
df["weekday_number"] = df["date"].dt.weekday

# processing time
def process_time(df):
    """Split a record's ``time`` string into numeric hour / minute / AM-PM fields.

    Meant to be applied row by row (``frame.apply(process_time, axis=1)``), so
    despite its name ``df`` is a single record, not a whole DataFrame.

    Parameters
    ----------
    df : pandas.Series or dict-like
        Record with a ``"time"`` entry formatted like ``"8:00 AM"``
        (clock reading, whitespace, AM/PM marker).

    Returns
    -------
    The same record, modified in place, with three extra entries:
    ``hour`` (int), ``minute`` (int) and ``am_pm`` (0 for AM, 1 otherwise).

    Raises
    ------
    IndexError
        If the time string has no AM/PM marker or no ``:`` separator.
    ValueError
        If the hour or minute part is not an integer.
    """
    # split() without an argument collapses repeated / surrounding whitespace,
    # so "8:00  AM" does not produce an empty token that would be read as PM.
    parts = df["time"].split()
    clock = parts[0]
    meridiem = parts[1]

    clock_fields = clock.split(":")
    # Store numbers rather than strings so the columns are numeric features.
    df["hour"] = int(clock_fields[0])
    df["minute"] = int(clock_fields[1])
    # Compare case-insensitively so "am" is not silently encoded as PM.
    df["am_pm"] = 0 if meridiem.upper() == "AM" else 1

    return df

# Run process_time on each row to add the hour / minute / am_pm columns.
df = df.apply(process_time, axis="columns")

df.head(1)

Unnamed: 0,sno,date,time,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,confirmed,oxygentanks,populationdensitypersqkm,day,month,year,week,weekday_number,hour,minute,am_pm
0,15001,2021-05-17,8:00 AM,Meghalaya,-,-,18478,23332,191,514,17,5,2021,20,0,8,0,0


In [39]:
# Remove the identifier column, the raw date/time columns (already expanded
# into separate features above) and the two nationality count columns.
unwanted_columns = [
    "sno",
    "date",
    "time",
    "confirmedindiannational",
    "confirmedforeignnational",
]
df.drop(columns=unwanted_columns, inplace=True)
df.head(1)

Unnamed: 0,state/unionterritory,cured,confirmed,oxygentanks,populationdensitypersqkm,day,month,year,week,weekday_number,hour,minute,am_pm
0,Meghalaya,18478,23332,191,514,17,5,2021,20,0,8,0,0


In [40]:
# Label-encode the state / union-territory names as integer codes.
# NOTE(review): the encoder is fitted on this file alone, so the codes depend
# on which states appear here — confirm they match the codes used in training.
state_column = "state/unionterritory"
le = LabelEncoder()
df[state_column] = le.fit_transform(df[state_column])
df.head(1)

Unnamed: 0,state/unionterritory,cured,confirmed,oxygentanks,populationdensitypersqkm,day,month,year,week,weekday_number,hour,minute,am_pm
0,28,18478,23332,191,514,17,5,2021,20,0,8,0,0


In [41]:
# Standardisation: rescale the numeric features to zero mean and unit variance.
# NOTE(review): the scaler is fitted on this test file itself; presumably the
# scaler fitted on the training data should be reused here — confirm.
ss = StandardScaler()
cols = ["cured", "oxygentanks", "confirmed", "populationdensitypersqkm"]

# StandardScaler standardises each column independently, so one fit over all
# four columns yields the same scaled values as fitting column by column,
# while leaving `ss` holding the mean/scale of every column (refitting the
# same scaler inside a loop kept only the statistics of the last column).
df[cols] = ss.fit_transform(df[cols])

df.head(3)

Unnamed: 0,state/unionterritory,cured,confirmed,oxygentanks,populationdensitypersqkm,day,month,year,week,weekday_number,hour,minute,am_pm
0,28,-0.68689,-0.683871,-1.64699,0.051243,17,5,2021,20,0,8,0,0
1,29,-0.697378,-0.696178,-1.32839,0.297784,17,5,2021,20,0,8,0,0
2,30,-0.69108,-0.688331,1.713893,0.368225,17,5,2021,20,0,8,0,0


In [42]:
# Write the preprocessed frame to disk, overwriting any existing file and
# leaving the row index out of the CSV.
preprocessed_csv_path = "../csv/preprocessed_test_data.csv"
df.to_csv(preprocessed_csv_path, mode="w", index=False)