In [None]:
# import basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the state-wise COVID-19 testing details from the Kaggle input directory,
# parsing the 'Date' column as datetimes.
df = pd.read_csv('/kaggle/input/covid19-in-india/StatewiseTestingDetails.csv',parse_dates=['Date'])
df

In [None]:
## Count the missing (NaN) entries in every column
df.isna().sum()

In [None]:
# Inspect the dtype pandas assigned to each column
df.dtypes

## Data Cleaning

In [None]:
## The 'Negative' column holds strings here, so it has to be converted to float.
## Remember: don't try to convert it to int, because NaN values can't be converted to int.

# Converting the 'Negative' column to a numeric type directly raises the following error:
# df['Negative'] = pd.to_numeric(df['Negative'])
# error : ValueError: Unable to parse string " " at position 4838


We hit this error because one row in the dataset contains the string " " in the Negative column.

Let's drop that row.



In [None]:
# Show the rows whose 'Negative' entry is the single-space string
df.loc[df['Negative'].eq(" ")]

In [None]:
# Remove every row whose 'Negative' entry is a blank / whitespace-only string,
# then convert the column to float.
# The rows are selected by content rather than by a hard-coded index label, so
# the cell can be re-run safely (dropping a label that is already gone raises
# KeyError) and keeps working if the source file gains or loses rows.
blank_negative = df['Negative'].astype(str).str.strip().eq("")
df = df.drop(index=df.index[blank_negative])
df['Negative'] = df['Negative'].astype('float')

In [None]:
## Order the rows chronologically and renumber the index from 0
df = df.sort_values(by='Date', ignore_index=True)
df

## Data Visualization 
### I have uploaded a PDF of the visuals, which you can check below; I used Power BI for the visualization.

In [None]:
from IPython.display import IFrame

# Embed the PDF report from the Kaggle input directory in the cell output
IFrame(src='/kaggle/input/corona-report/corona report.pdf', width=900, height=700)

### Due to some issues the PDF above is not visible. I have uploaded a PPT version of the data visualization; you can download it from my input data attachments. :)

In [None]:
%%html
### below is html code to show my power bi report you can view this if you have power bi account
<iframe width="1140" height="541.25" src="https://app.powerbi.com/reportEmbed?reportId=5f6f63fe-bd4a-4563-8330-c6e389d8d83a&autoAuth=true&ctid=4bc9df11-1291-4d7a-9015-aa308945e8dd&config=eyJjbHVzdGVyVXJsIjoiaHR0cHM6Ly93YWJpLWluZGlhLWNlbnRyYWwtYS1wcmltYXJ5LXJlZGlyZWN0LmFuYWx5c2lzLndpbmRvd3MubmV0LyJ9" frameborder="0" allowFullScreen="true"></iframe>

In [None]:
## List the rows where both 'Negative' and 'Positive' are missing
both_missing = df['Negative'].isna() & df['Positive'].isna()
df[both_missing]

There are many NaN values, and we can't simply drop all of them.

I have one idea for filling the NaN values:

positive value = total samples * (average of the positive / total samples ratio)

negative value = total samples * (average of the negative / total samples ratio)

The average ratio is computed separately for each state.


In [None]:
## Average positive and negative ratio for every state.
## states_ratios maps state -> [mean(Positive/TotalSamples), mean(Negative/TotalSamples)].
## A state with no complete rows gets [nan, nan] (the mean of an empty Series).
states_ratios = {}
for state in df['State'].unique():
    # Keep only this state's rows in which all three counts are present.
    complete = df.loc[df['State'] == state].dropna(subset=['Negative','Positive','TotalSamples'])
    # Compute the ratios as standalone Series instead of adding columns to the
    # filtered slice, which may trigger pandas' SettingWithCopyWarning.
    ratio_pos = complete['Positive'] / complete['TotalSamples']
    ratio_neg = complete['Negative'] / complete['TotalSamples']
    states_ratios[state] = [ratio_pos.mean(), ratio_neg.mean()]
    

In [None]:
# Display the per-state [positive ratio, negative ratio] averages
states_ratios

In [None]:
## Work on a copy so the NaN filling below does not modify df
df2 = df.copy()
df2

In [None]:
# Renumber the index from 0 so the positional loop labels used below are valid row labels
df2 = df2.reset_index(drop=True)
df2

In [None]:
# Fill the NaN values in df2 with TotalSamples * (the state's average ratio).
# states_ratios[state] is [positive_ratio, negative_ratio]; mapping each row's
# state to its ratio lets pandas fill all gaps at once instead of looping over
# rows with .loc. Values that are already present are left untouched.
negative_ratio = df2['State'].map({state: ratios[1] for state, ratios in states_ratios.items()})
positive_ratio = df2['State'].map({state: ratios[0] for state, ratios in states_ratios.items()})

df2['Negative'] = df2['Negative'].fillna(df2['TotalSamples'] * negative_ratio)
df2['Positive'] = df2['Positive'].fillna(df2['TotalSamples'] * positive_ratio)

In [None]:
## Even after the fill, some rows still have NaN in 'Negative'; list them
df2[df2['Negative'].isna()]

In [None]:
## For Manipur, both ratios in states_ratios are NaN.
## To fill the NaN values for Manipur we use the ratios of Mizoram,
## because the two states have a similar (not exactly equal) number of cases and are neighbouring states.

In [None]:
# [positive ratio, negative ratio] computed for Mizoram
states_ratios['Mizoram']

In [None]:
# Fill the remaining NaN values for Manipur with fixed ratios.
# The mask is built from df2 itself: the previous version masked df2 with a
# condition computed on df, which is only correct while both frames happen to
# share exactly the same index.
# NOTE(review): 0.017241379310344827 equals 1/58; 0.0001 does not look like a
# negative/total ratio. Confirm both against states_ratios['Mizoram'].
is_manipur = df2['State'] == 'Manipur'

missing_negative = is_manipur & df2['Negative'].isna()
df2.loc[missing_negative, 'Negative'] = df2.loc[missing_negative, 'TotalSamples'] * 0.0001

missing_positive = is_manipur & df2['Positive'].isna()
df2.loc[missing_positive, 'Positive'] = df2.loc[missing_positive, 'TotalSamples'] * 0.017241379310344827

In [None]:
# Confirm how many NaN values remain per column
df2.isna().sum()

## Now all NaN values are filled.
## We did a great job!!

In [None]:
## Counts must be whole numbers, so cast the filled float values (e.g. 234.67) to int
for count_column in ('Negative', 'Positive'):
    df2[count_column] = df2[count_column].astype('int')
df2

In [None]:
# Build a separate frame indexed by date; this makes time-series plots easier
# because the x-axis is taken from the index.
# set_index moves 'Date' out of the columns and returns a new frame. The
# previous df3.drop('Date',axis=1) call discarded its result, so the column
# was never actually removed.
df3 = df2.set_index('Date')
df3

In [None]:
## Demo chart: Positive and Negative counts for Gujarat, plotted against the index.
## Plotting both columns from one frame gives each line a legend entry, so the
## two series can be told apart.
gujarat = df3[df3['State']=='Gujarat']
ax = gujarat[['Positive','Negative']].plot(title="Gujarat: Positive and Negative")
ax.set_ylabel("Count");

In [None]:
# Write the cleaned frame to CleanedCovidData.csv in the working directory, without the index column
df2.to_csv("CleanedCovidData.csv",index=False)

In [None]:
## Read the cleaned CSV back, parsing 'Date' as datetimes
data = pd.read_csv('CleanedCovidData.csv',parse_dates=['Date'])
data

In [None]:
## Count the missing values in each column of the reloaded data
data.isna().sum()

In [None]:
# Number of rows per state
data['State'].value_counts()

In [None]:
## A datetime column can't be fed to the models directly, so split 'Date'
## into three numeric columns (year, month, day).
## The vectorized .dt accessor replaces a per-row apply(lambda ...).
data['year'] = data['Date'].dt.year
data['month'] = data['Date'].dt.month
data['day'] = data['Date'].dt.day
data

In [None]:
## Keep a copy that still has the 'Date' column: it is used later as the
## x-axis when plotting predictions.
data1 = data.copy()
## Remove the date column from the modelling frame
data = data.drop(columns=['Date'])
## Put the columns in an order that is easier to read
column_order = ['year','month','day','State','TotalSamples','Negative','Positive']
data = data[column_order]
data

In [None]:
## LabelEncoder turns the categorical 'State' feature into integers.
from sklearn.preprocessing import LabelEncoder
# train_test_split separates the data into train and test sets
from sklearn.model_selection import train_test_split,cross_val_score
## import some well-known regression models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor,RandomForestRegressor
from xgboost import XGBRegressor

le_state = LabelEncoder()

# Fit the encoder on the untouched state names kept in data1 (same rows, same
# order as data). Encoding data['State'] from itself would, on a re-run of this
# cell, refit the encoder on already-encoded integers and break later
# le_state.transform(<state name>) calls.
data['State'] = le_state.fit_transform(data1['State'])

# A fixed random_state makes the split, and every score computed from it, reproducible.
x_train,x_test,y_train,y_test = train_test_split(
    data.drop(['Negative','Positive'],axis=1),
    data[['Negative','Positive']],
    stratify=data['State'],
    random_state=42,
)
x_train.shape , x_test.shape,y_train.shape

In [None]:
## Pairwise correlation between the columns, drawn as an annotated heatmap
correlations = data.corr()

plt.figure(figsize=(10,8))
sns.heatmap(correlations, annot=True)

In [None]:
## Fit a linear regression on both targets and report its score on the test split
lrmd = LinearRegression()
lrmd = lrmd.fit(x_train, y_train)
lrmd.score(x_test, y_test)

In [None]:
# Fit a decision tree regressor on both targets and report its score on the test split.
# random_state is fixed so the score is reproducible across runs.
dcmd = DecisionTreeRegressor(random_state=42)
dcmd.fit(x_train,y_train)
dcmd.score(x_test,y_test)

In [None]:
# If the decision tree model scores well, it is also worth trying ExtraTreesRegressor.
# random_state is fixed so the randomized ensemble gives the same score on every run.
exmd = ExtraTreesRegressor(random_state=42)
exmd.fit(x_train,y_train)
exmd.score(x_test,y_test)

In [None]:
# Fit an XGBoost regressor for the 'Positive' target.
## The model is fitted separately per target because fitting it on both target
## columns at once raised an error; the target is therefore passed as a single column.
xgmd_pos = XGBRegressor(n_estimators=40)
xgmd_pos = xgmd_pos.fit(x_train, y_train.loc[:, 'Positive'])
xgmd_pos.score(x_test, y_test.loc[:, 'Positive'])

In [None]:
# Fit a second XGBoost regressor, with default settings, for the 'Negative' target
xgmd_neg = XGBRegressor()
xgmd_neg = xgmd_neg.fit(x_train, y_train.loc[:, 'Negative'])
xgmd_neg.score(x_test, y_test.loc[:, 'Negative'])
# The XGBoost model scores very well here

In [None]:
def getStateData(s):
    """Return the rows of the module-level ``data1`` frame whose 'State' equals ``s``."""
    state_mask = data1['State'] == s
    return data1[state_mask]

def predictForState(d,s):
    """Predict the 'Positive' values for the rows of ``d`` with the fitted ``xgmd_pos`` model.

    Parameters
    ----------
    d : DataFrame containing the columns 'year', 'month', 'day' and 'TotalSamples'.
    s : state name; it is converted to its integer code with ``le_state``.

    Returns
    -------
    The output of ``xgmd_pos.predict`` for the feature columns of ``d``.
    ``d`` itself is not modified.
    """
    state_code = le_state.transform([s])[0]
    # assign() replaces the whole 'State' column, so it becomes an integer
    # column. Writing the code through .loc[:, 'State'] sets values in place on
    # recent pandas and leaves the column with object dtype, which XGBoost rejects.
    features = d.assign(State=state_code)
    return xgmd_pos.predict(features[['year','month','day','State','TotalSamples']])



def plotPredictionsForState(s):
    """Plot the actual and the predicted 'Positive' values against 'Date' for state ``s``."""
    state_rows = getStateData(s)
    dates = state_rows['Date']

    plt.figure(figsize=(12,8))
    plt.plot(dates, state_rows['Positive'], label="Actual")
    plt.plot(dates, predictForState(state_rows, s), label="Predicted")
    plt.legend()
    plt.title(f"ACtual and Predicted plot for {s}")
    plt.show()

In [None]:
# Draw the actual-vs-predicted chart for a few states, one figure each
for state_name in ('Gujarat', 'Kerala', 'Maharashtra'):
    plotPredictionsForState(state_name)

In [None]:
# Horizontal bar chart of the importances the 'Positive' XGBoost model assigns to each feature
importances = xgmd_pos.feature_importances_
feature_names = x_train.columns

plt.figure(figsize=(10,8))
sns.barplot(x=importances, y=feature_names)
plt.title("Feature importances for XGBoostRegressor model")

In [None]:
import joblib

# Persist both fitted XGBoost models to the working directory
joblib.dump(value=xgmd_pos, filename="XGBoost_Positive.pkl")
joblib.dump(value=xgmd_neg, filename="XGBoost_Negative.pkl")

For the year column the feature importance is 0, because its value is 2020 for the whole dataset.
We could delete that column for now, but in the future, when we work with data from 2021, we will have to take the year column into account.

I hope this is helpful for you, and if you learned something new from it, please upvote :)

Thank You :)