In [289]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [290]:
df = pd.read_csv("raw_train_data.csv")
df.columns = [i.lower() for i in df.columns]
df.head(3)

Unnamed: 0,sno,date,time,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1,8262,867
1,2,2020-01-31,6:00 PM,Kerala,1,0,0,0,1,9136,61
2,3,2020-02-01,6:00 PM,Kerala,2,0,0,0,2,3826,316


In [291]:
# checking is null
df.isnull().any()

sno                         False
date                        False
time                        False
state/unionterritory        False
confirmedindiannational     False
confirmedforeignnational    False
cured                       False
deaths                      False
confirmed                   False
oxygentanks                 False
populationdensitypersqkm    False
dtype: bool

In [292]:
# seperate date

df["date"] = pd.to_datetime(df["date"])
df["day"] = df["date"].dt.day
df["month"] = df["date"].dt.month
df["year"] = df["date"].dt.year
df["week"] = df["date"].dt.isocalendar().week
df["weekday_number"] = df["date"].dt.weekday

In [293]:
# seperate time
def hour_conversion(date):
    hour = date.hour
    am_pm = 1 if hour < 12 else 0  # 1 for AM, 0 for PM
    hour = hour % 12 if hour % 12 != 0 else 12  # Convert to 12-hour format

    return hour, am_pm

df["time"] = pd.to_datetime(df["time"], format="%H:%M %p")
df["hour"], df["am_pm"] = zip(*df["time"].apply(hour_conversion))
df["minute"] = df["time"].dt.minute

# Drop the original "time" column
df = df.drop(["time", "date"], axis=1)
df.head(3)

Unnamed: 0,sno,state/unionterritory,confirmedindiannational,confirmedforeignnational,cured,deaths,confirmed,oxygentanks,populationdensitypersqkm,day,month,year,week,weekday_number,hour,am_pm,minute
0,1,Kerala,1,0,0,0,1,8262,867,30,1,2020,5,3,6,1,0
1,2,Kerala,1,0,0,0,1,9136,61,31,1,2020,5,4,6,1,0
2,3,Kerala,2,0,0,0,2,3826,316,1,2,2020,5,5,6,1,0


In [None]:
# encode state
le = LabelEncoder()
df["state"] = le.fit_transform(df["state/unionterritory"])
df.drop("state/unionter")

In [301]:
# finding unique values
for i in df.columns:
    print(f"{i}: {df[i].unique()}")
    print

sno: [    1     2     3 ... 14999 15000 15001]
state/unionterritory: ['Kerala' 'Telengana' 'Delhi' 'Rajasthan' 'Uttar Pradesh' 'Haryana'
 'Ladakh' 'Tamil Nadu' 'Karnataka' 'Maharashtra' 'Punjab'
 'Jammu and Kashmir' 'Andhra Pradesh' 'Uttarakhand' 'Odisha' 'Puducherry'
 'West Bengal' 'Chhattisgarh' 'Chandigarh' 'Gujarat' 'Himachal Pradesh'
 'Madhya Pradesh' 'Bihar' 'Manipur' 'Mizoram'
 'Andaman and Nicobar Islands' 'Goa' 'Unassigned' 'Assam' 'Jharkhand'
 'Arunachal Pradesh' 'Tripura' 'Nagaland' 'Meghalaya'
 'Dadra and Nagar Haveli and Daman and Diu'
 'Cases being reassigned to states' 'Sikkim' 'Daman & Diu' 'Lakshadweep'
 'Telangana']
confirmedindiannational: [  1.           2.           3.           0.           6.
   7.           8.           9.           4.          15.
   5.          17.          10.          11.          19.
  14.          22.          32.          12.          23.
  24.          36.          25.          39.          44.
  18.          16.          26.          49

In [299]:
# handling missing values
# Replace '-' with NaN
df["confirmedindiannational"] = df["confirmedindiannational"].replace("-", np.NaN)
df["confirmedforeignnational"] = df["confirmedforeignnational"].replace("-", np.NaN)

# Convert column to numeric
df["confirmedindiannational"] = pd.to_numeric(df["confirmedindiannational"])
df["confirmedforeignnational"] = pd.to_numeric(df["confirmedforeignnational"])

# Fill missing values with mean
df["confirmedindiannational"] = df["confirmedindiannational"].fillna(df["confirmedindiannational"].mean())
df["confirmedforeignnational"] = df["confirmedforeignnational"].fillna(df["confirmedforeignnational"].mean())

In [304]:
# standarization
ms = MinMaxScaler()
df["confirmedindiannational"] = ms.fit_transform(df[["confirmedindiannational"]])
df["confirmedforeignnational"] = ms.fit_transform(df[["confirmedforeignnational"]])
df["cured"] = ms.fit_transform(df[["cured"]])
df["deaths"] = ms.fit_transform(df[["deaths"]])
df["oxygentanks"] = ms.fit_transform(df[["oxygentanks"]])
df["confirmed"] = ms.fit_transform(df[["confirmed"]])

In [305]:
# exporting csv
df.to_csv("preprocessed_train_data.csv", index=False)