In [51]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pt
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [52]:
# Load the raw training data from disk and preview its first record.
df = pd.read_csv(filepath_or_buffer="../csv/raw_train_data.csv")
df.head(n=1)

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed,OxygenTanks,PopulationDensityPerSqKm
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1,8262,867


In [53]:
# Normalise every column label to lowercase so later cells can refer to the
# columns by one consistent spelling. Labels are expected to be strings; a
# non-string label raises AttributeError.
df.columns = [label.lower() for label in df.columns]
df.head(1)

Unnamed: 0,sno,date,time,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1,8262,867


In [54]:
# Add a 'non_cured' feature: the row's confirmed count minus the 'cured' count
# from the most recent strictly-earlier date recorded for the same state.
# NOTE(review): this mixes the current row's 'confirmed' with the previous
# record's 'cured' — confirm that this is the intended definition.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date')

state_col = 'state/unionterritory'

# One record per (state, date), keeping the last one in sorted order; shifting
# 'cured' by one position within each state then yields the value from the
# previous recorded date (NaN for a state's first date).
daily = df.drop_duplicates(subset=[state_col, 'date'], keep='last')
lookup = daily[[state_col, 'date']].assign(
    prev_cured=daily.groupby(state_col)['cured'].shift(1)
)

# Broadcast the per-date lookup back onto every row. A left merge against
# unique right-hand keys preserves df's row order, so positions line up with df.
aligned = df[[state_col, 'date']].merge(
    lookup, on=[state_col, 'date'], how='left', validate='many_to_one'
)
non_cured = df['confirmed'].to_numpy() - aligned['prev_cured'].to_numpy()

# Rows with no earlier record for their state keep the default of 0.
# Assumes 'confirmed' and 'cured' hold whole-number counts with no missing values.
df['non_cured'] = pd.Series(non_cured, index=df.index).fillna(0).astype('int64')

In [55]:
# Break the calendar date into separate numeric feature columns.
df["date"] = pd.to_datetime(df["date"])
when = df["date"].dt
df = df.assign(
    day=when.day,
    month=when.month,
    year=when.year,
    week=when.isocalendar().week,  # ISO-8601 week number
    weekday_number=when.dayofweek,  # Monday=0 ... Sunday=6
)

# processing time
def process_time(df):
    """Split a row's "time" text (e.g. "6:00 PM") into numeric clock features.

    Parameters
    ----------
    df : mapping-like row
        Despite the name, this is a single record (as passed by
        ``DataFrame.apply(..., axis=1)``), not the whole frame. It must
        provide a "time" entry formatted as "<hour>:<minute> <AM|PM>".

    Returns
    -------
    The same row object with three entries added: "hour" and "minute" as
    ints (hour stays on the 12-hour clock) and "am_pm" as 0 for AM and 1
    for anything else.

    Raises
    ------
    IndexError
        If the text lacks the "<clock> <AM|PM>" or "<hour>:<minute>" parts.
    ValueError
        If the hour or minute part is not a whole number.
    """
    # split() without an argument tolerates repeated or surrounding whitespace.
    parts = df["time"].split()
    clock_parts = parts[0].split(":")

    # Store numbers rather than text so the derived columns are numeric features.
    df["hour"] = int(clock_parts[0])
    df["minute"] = int(clock_parts[1])
    # Case-insensitive so that "am" and "AM" are encoded the same way.
    df["am_pm"] = 0 if parts[1].upper() == "AM" else 1

    return df

# Run process_time over every record (row-wise) to attach hour / minute / am_pm.
df = df.apply(func=process_time, axis="columns")

df.head(n=1)

Unnamed: 0,sno,date,time,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm,non_cured,day,month,year,week,weekday_number,hour,minute,am_pm
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1,8262,867,0,30,1,2020,5,3,6,0,1


In [56]:
# Remove the columns that are not carried into the exported dataset.
unwanted = ["sno", "date", "time", "confirmedindiannational", "confirmedforeignnational"]
df = df.drop(columns=unwanted)
df.head(n=1)

Unnamed: 0,state/unionterritory,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm,non_cured,day,month,year,week,weekday_number,hour,minute,am_pm
0,Kerala,0,0,1,8262,867,0,30,1,2020,5,3,6,0,1


In [57]:
# Label-encode the state column: each distinct name becomes an integer code.
state_column = "state/unionterritory"
le = LabelEncoder()
df[state_column] = le.fit_transform(df[state_column])
df.head(n=1)

Unnamed: 0,state/unionterritory,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm,non_cured,day,month,year,week,weekday_number,hour,minute,am_pm
0,18,0,0,1,8262,867,0,30,1,2020,5,3,6,0,1


In [58]:
# Standardization: rescale each listed column to zero mean and unit variance.
ss = StandardScaler()
cols = ["deaths", "cured", "oxygentanks", "confirmed", "populationdensitypersqkm"]

# Fit a single scaler on all columns at once. StandardScaler standardizes each
# column independently, so the values match a per-column fit, but `ss` now
# retains the mean/scale of every column (not just the last one fitted) and can
# be reused to transform other data with the same statistics.
df[cols] = ss.fit_transform(df[cols])

df.head(3)

Unnamed: 0,state/unionterritory,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm,non_cured,day,month,year,week,weekday_number,hour,minute,am_pm
0,18,-0.480563,-0.378187,-0.475776,1.121687,1.805302,0,30,1,2020,5,3,6,0,1
1,18,-0.480563,-0.378187,-0.475776,1.423894,-2.184478,1,31,1,2020,5,4,6,0,1
2,18,-0.480563,-0.378187,-0.475774,-0.412169,-0.922203,2,1,2,2020,5,5,6,0,1


In [59]:
# Keep only the rows whose standardized values all lie strictly inside (-3, 3).
cols = ["deaths", "cured", "oxygentanks", "confirmed", "populationdensitypersqkm"]
within_bounds = df[cols].abs().lt(3).all(axis=1)
df = df[within_bounds]
df.head(n=1)

Unnamed: 0,state/unionterritory,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm,non_cured,day,month,year,week,weekday_number,hour,minute,am_pm
0,18,-0.480563,-0.378187,-0.475776,1.121687,1.805302,0,30,1,2020,5,3,6,0,1


In [60]:
# Write the preprocessed frame to disk without the index; mode "w" overwrites any existing file.
df.to_csv(path_or_buf="../csv/preprocessed_train_data.csv", mode="w", index=False)