In [None]:
pip install dataprep

In [None]:
pip install plotly

In [None]:
'''
Import the Libraries
'''


# DATA MANIPULATION
import pandas as pd 
import numpy as np 

#DATA VISUALISATION
import matplotlib.pyplot as plt 

# Locations of the competition CSV (comma-separated values) files
train_csv_path = "../input/covid19-global-forecasting-week-5/train.csv"
test_csv_path = "../input/covid19-global-forecasting-week-5/test.csv"

# Load each file into its own DataFrame
covid_data_train = pd.read_csv(train_csv_path)
covid_data_test = pd.read_csv(test_csv_path)

In [None]:
# Preview the training data
covid_data_train.head() # Shows the first five rows
                        # NaN stands for "Not a Number" (how pandas displays a missing value)

In [None]:
# Dimensions of the training frame
covid_data_train.shape # (number of rows, number of columns)

In [None]:
# Pie chart of how many columns hold each data type
dtype_counts = covid_data_train.dtypes.value_counts()
# One explode offset per slice, so the chart still renders when the number of distinct dtypes is not exactly three
dtype_counts.plot.pie(explode=[0.1] * len(dtype_counts), autopct='%1.1f%%', shadow=True)
plt.title('Data type');

In [None]:
# Count the missing entries in every column of the training data
covid_data_train.isna().sum()

In [None]:
# Preview the first five rows of the test data
covid_data_test.head()

In [None]:
# Rows of the training data whose Country_Region is India
covid_data_train.loc[covid_data_train['Country_Region'].eq('India')]

In [None]:
# Rows of the training data whose Country_Region is China
covid_data_train.loc[covid_data_train['Country_Region'].eq('China')]

In [None]:
# How often each (non-missing) County value occurs among India's rows
covid_data_train.loc[covid_data_train['Country_Region'].eq('India'), 'County'].value_counts()

In [None]:
# Count the missing entries in every column of the test data
covid_data_test.isna().sum()

In [None]:
# Remove, in place, every column that contains at least one missing value
# (per the original note these are the County and Province columns)
for frame in (covid_data_train, covid_data_test):
    frame.dropna(axis='columns', inplace=True)

In [None]:
import plotly.express as px # Data visualisation using plotly

# Donut chart of each country's share of the summed TargetValue.
# No plt.figure() call is needed: plotly builds its own figure, and a
# matplotlib figure created here would only be rendered as an empty canvas.
fig = px.pie(covid_data_train, names='Country_Region', values='TargetValue', color_discrete_sequence=px.colors.sequential.RdBu, hole=0.4)
fig.update_traces(textposition='inside')  # keep the labels inside their slices
fig.show()

In [None]:
# Compare confirmed cases and fatalities: one bar per Target category,
# with the bar height computed from TargetValue by seaborn's default estimator
import seaborn as sns
sns.barplot(data=covid_data_train, x='Target', y='TargetValue')
plt.show()

In [None]:
# Share of the summed TargetValue contributed by each Target category
fig = (
    px.pie(covid_data_train, names='Target', values='TargetValue')
    .update_traces(textposition='inside')
    .update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
)
fig.show()

In [None]:
# Treemap of each country's share of the confirmed cases
confirmed = covid_data_train.loc[covid_data_train['Target'].eq('ConfirmedCases')]
fig = px.treemap(confirmed, path=['Country_Region'], values='TargetValue', width=900, height=600)
fig.update_traces(textposition='middle center', textfont_size=15)
# Centre the title horizontally and anchor it near the top of the figure
fig.update_layout(
    title=dict(
        text='Total Share of Worldwide COVID19 Confirmed Cases',
        x=0.5,
        y=0.92,
        xanchor='center',
        yanchor='top',
    )
)
fig.show()

In [None]:
# Treemap of each country's share of the fatalities
dead = covid_data_train.loc[covid_data_train['Target'].eq('Fatalities')]
fig = px.treemap(dead, path=['Country_Region'], values='TargetValue', width=900, height=600)
fig.update_traces(textposition='middle center', textfont_size=15)
# Centre the title horizontally and anchor it near the top of the figure
fig.update_layout(
    title=dict(
        text='Total Share of Worldwide COVID19 Fatalities',
        x=0.5,
        y=0.92,
        xanchor='center',
        yanchor='top',
    )
)
fig.show()

In [None]:
# Visualise the top 10 most populous countries in the world and the covid cases
# Sum only the numeric columns per country; the text columns (e.g. Date, Target)
# have no meaningful sum and would otherwise be concatenated or trigger warnings.
grouped_data = covid_data_train.groupby('Country_Region').sum(numeric_only=True)

# Summed TargetValue of the 10 countries with the largest summed Population.
# NOTE(review): Population is added up over every row of a country (all dates and
# targets), so the ranking uses that total rather than a single head-count - confirm
# this is the intended ordering.
top_10_pop_countries = grouped_data.nlargest(10, 'Population')['TargetValue']
top_10_pop_countries

In [None]:
# Bar chart of the summed TargetValue (confirmed cases and deaths together) for the ten selected countries
fig = px.bar(
    x=top_10_pop_countries.index,
    y=top_10_pop_countries.values,
    labels=dict(x='Countries', y='Number of Covid-19 Cases'),
    title='Top 10 Most Populous Countries V/S Number of Covid Cases',
)
fig.show()

In [None]:
# Print a summary of the training frame (column names, non-null counts, dtypes, memory usage)
covid_data_train.info() # Information about your data

In [None]:
# Parse the Date column of both frames into pandas datetime values
for frame in (covid_data_train, covid_data_test):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [None]:
# Visualise the worldwide covid growth w.r.t time

# 1) Group the data by date, summing only the numeric columns
#    (the text columns cannot be meaningfully added up)
date_grouped_data = covid_data_train.groupby('Date').sum(numeric_only=True)

# 2) Plot the date grouped data on a line chart
fig = px.line(x=date_grouped_data.index, y=date_grouped_data['TargetValue'], title='Growth of number of COVID-19 cases over Time', labels=dict(x='Date', y='Number of Covid-19 Cases'))
fig.show()

In [None]:
# Rows of the training data whose Country_Region is India
covid_data_train.loc[covid_data_train['Country_Region'].eq('India')]

In [None]:
# Visualising the growth of covid w.r.t country over time.
# The frame holds several rows per country and date (at least one per Target
# category), so the values are summed to one point per country and date first;
# plotting the raw rows would make every line jump back and forth on each date.
country_daily_totals = covid_data_train.groupby(['Country_Region', 'Date'], as_index=False)['TargetValue'].sum()
fig = px.line(country_daily_totals, x='Date', y='TargetValue', color='Country_Region')
fig.show()

In [None]:
# Names of the ten selected countries as a plain Python list
top_10_populous_countries = top_10_pop_countries.index.tolist()
top_10_populous_countries

In [None]:
# Rows of the training data that belong to one of ten hand-listed countries
top_10_most_pop_countries = covid_data_train[
    covid_data_train['Country_Region'].isin(
        ['China', 'India', 'US', 'Indonesia', 'Brazil', 'Pakistan', 'Nigeria', 'Bangladesh', 'Russia', 'Japan']
    )
]

In [None]:
# Visualise the growth of Covid-19 numbers in top 10 most populous countries.
# Each country has several rows per date (at least one per Target category), so
# the values are summed to one point per country and date before plotting;
# otherwise every line would jump back and forth on each date.
top_10_daily_totals = top_10_most_pop_countries.groupby(['Country_Region', 'Date'], as_index=False)['TargetValue'].sum()
fig = px.line(top_10_daily_totals, x='Date', y='TargetValue', color='Country_Region')
fig.show()

# **DATA PROCESSING**

In [None]:
# Remove the Id column from the training frame (the frame is modified in place)
covid_data_train.drop(columns=['Id'], inplace=True)

In [None]:
# Preview the first five rows of the training data
covid_data_train.head()

In [None]:
# Inspect the Date column of the training data
covid_data_train['Date']

In [None]:
'''2020-01-23'''

# The example date above written as a single integer
2020123

# The same integer spelled out as year*1000 + month*100 + day*1
2020*1000 + 1*100 + 23*1

In [None]:
# Convert the Date time variables into integers
def to_integer(x):
  '''Encode a date-like value as the integer YYYYMMDD.

  Args:
    x: any object exposing integer ``year``, ``month`` and ``day``
       attributes (e.g. ``datetime.date`` or ``pandas.Timestamp``).

  Returns:
    int: ``10000*year + 100*month + day``; 2020-01-23 becomes 20200123.
  '''
  # The year is weighted by 10000 (not 1000) because month*100 + day can
  # reach 1231; a smaller weight lets late-year dates spill into the next
  # year's range, so two different dates could share one integer.
  return 10000*x.year + 100*x.month + x.day

# Encode every training date as an integer by passing each value to to_integer
covid_data_train['Date'] = covid_data_train['Date'].apply(to_integer)

In [None]:
# Encode every test-set date as an integer by passing each value to to_integer
covid_data_test['Date'] = covid_data_test['Date'].apply(to_integer)

In [None]:
# Display the test set to inspect its current contents
covid_data_test

In [None]:
# Label Encoding the categorical columns
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column
le1 = LabelEncoder()
le2 = LabelEncoder()

# For each column: learn the integer codes from the training values, then
# encode the training and the test column with that same mapping
for encoder, column in ((le1, 'Country_Region'), (le2, 'Target')):
    covid_data_train[column] = encoder.fit_transform(covid_data_train[column])
    covid_data_test[column] = encoder.transform(covid_data_test[column])

In [None]:
# Separate the label (TargetValue) from the remaining feature columns
y = covid_data_train['TargetValue']
X = covid_data_train.drop(columns='TargetValue')

In [None]:
# Display the feature frame
X

In [None]:
# Take the ForecastId column out of the test set (in place) and keep its values in test_id
test_id = covid_data_test.pop('ForecastId')

In [None]:
# Display the feature frame
X

In [None]:
# Stack the training features on top of the test set, remembering how many
# rows belong to the training part so the two can be separated again
train_len = len(X)
combined_data = pd.concat([X, covid_data_test])

In [None]:
# Standardize the data using StandardScaler
from sklearn.preprocessing import StandardScaler

# StandardScaler object
ss = StandardScaler()

# Learn every column's mean and standard deviation in a single fit, using the
# training rows only (the first train_len rows), so no statistics of the test
# set leak into the preprocessing. StandardScaler already treats each column
# independently; fitting once also leaves `ss` holding the statistics of all
# columns instead of only the last one.
ss.fit(combined_data.iloc[:train_len])

# Scale training and test rows with those training statistics
combined_data = pd.DataFrame(
    ss.transform(combined_data),
    columns=combined_data.columns,
    index=combined_data.index,
)

In [None]:
# Preview the first five rows of the standardized combined frame
combined_data.head()

In [None]:
# The first train_len rows are the training features; everything after them is the test set
X, testing_data = combined_data.iloc[:train_len], combined_data.iloc[train_len:]

In [None]:
# Split X and y into training and testing set
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# (rows, columns) of the training part of the split
X_train.shape

# **Machine Learning Modelling**

In [None]:
# Machine Learning Modeling
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor # Import the random forest

# Model Object: a fixed random_state makes repeated runs build the same forest,
# and n_jobs=-1 trains the trees on all available CPU cores
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# Fit the model
rf.fit(X_train, y_train)