In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training and testing splits from CSV files in the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Report (rows, columns) for each split, one line per split
print(
    f'Shape of the train data: {train_df.shape}',
    f'Shape of the test data: {test_df.shape}',
    sep='\n',
)

In [None]:
# Getting an overview of the data
train_df.head()

In [None]:
# Viewing the random rows of the data
train_df.sample(10)

In [None]:
train_df.Country_Region.value_counts()

In [None]:
# Statistical summary of the data
train_df.describe()

In [None]:
# Getting the information about the data
train_df.info()

# <font color = 'pink'><b>Data Preprocessing</b></font>

In [None]:
# Parse the 'Date' column of both splits into pandas datetimes
train_df = train_df.assign(Date=pd.to_datetime(train_df['Date']))
test_df = test_df.assign(Date=pd.to_datetime(test_df['Date']))

In [None]:
train_df.info()

In [None]:
# Checking missing values in the dataset
train_df.isnull().sum()

In [None]:
train_df.describe()

In [None]:
train_df['Country_Region'].value_counts()

In [None]:
train_df[train_df['Province_State'] == 'New York']

### Filling missing values

### Train data

In [None]:
# Filling missing values of Province_State with Country_Region.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas 2.x and, with copy-on-write enabled, leaves train_df
# unchanged.
train_df['Province_State'] = train_df['Province_State'].fillna(train_df['Country_Region'])

# Filling missing values of County with the (already filled) Province_State
train_df['County'] = train_df['County'].fillna(train_df['Province_State'])

# If there are still missing values in any column, drop those rows
train_df.dropna(inplace=True)

In [None]:
# Filling missing values of Province_State with Country_Region.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas 2.x and, with copy-on-write enabled, leaves test_df
# unchanged.
test_df['Province_State'] = test_df['Province_State'].fillna(test_df['Country_Region'])

# Filling missing values of County with the (already filled) Province_State
test_df['County'] = test_df['County'].fillna(test_df['Province_State'])

# If there are still missing values in any column, drop those rows.
# NOTE(review): a testing row removed here gets no prediction later on —
# confirm that is acceptable for the intended output.
test_df.dropna(inplace=True)

In [None]:
train_df.isnull().sum()

In [None]:
train_df.sample(10)

In [None]:
# train_df.iloc[66680:66700]
# train_df.iloc[22980:23000]
# train_df.iloc[44780:44800]

In [None]:
# Extracting the total count of confirmed cases by country
country_with_confirmed_cases = train_df[train_df['Target'] == 'ConfirmedCases'].groupby('Country_Region')['TargetValue'].sum().sort_values(ascending=False)[:10]

In [None]:
country_with_confirmed_cases

In [None]:
# Bar chart of the ten countries with the most confirmed cases
plt.style.use('ggplot')
confirmed_ax = country_with_confirmed_cases.plot.bar(figsize=(10, 5))
confirmed_ax.set_title('Top 10 Countries with Confirmed Cases')
confirmed_ax.set_xlabel('Country')
confirmed_ax.set_ylabel('Confirmed Cases')

In [None]:
# Countries with Fatalities
country_with_fatality_cases = train_df[train_df['Target'] == 'Fatalities'].groupby('Country_Region')['TargetValue'].sum().sort_values(ascending=False)[:10]

In [None]:
country_with_fatality_cases

In [None]:
# Bar chart of the ten countries with the most fatalities
fatality_ax = country_with_fatality_cases.plot.bar(figsize=(10, 5))
fatality_ax.set_title('Top 10 Countries with Fatality Cases')
fatality_ax.set_xlabel('Country')
fatality_ax.set_ylabel('Fatality Cases')

In [None]:
# plt.style.available

In [None]:
# Confirmed cases minus fatalities per country, keeping the ten largest values.
# The difference is taken over the full per-country totals and only then cut
# to ten rows. Subtracting the two separately truncated top-10 Series would
# align on the index and turn every country missing from either list into NaN.
# NOTE(review): confirmed minus fatalities also counts people who are still
# ill, so it is an upper bound on recoveries rather than an exact figure.
confirmed_totals_all = (
    train_df[train_df['Target'] == 'ConfirmedCases']
    .groupby('Country_Region')['TargetValue']
    .sum()
)
fatality_totals_all = (
    train_df[train_df['Target'] == 'Fatalities']
    .groupby('Country_Region')['TargetValue']
    .sum()
)
# Both operands are named 'TargetValue', so the resulting column keeps that name
recovered_cases_by_country = pd.DataFrame(
    (confirmed_totals_all - fatality_totals_all).sort_values(ascending=False)[:10]
)

In [None]:
type(recovered_cases_by_country) # Confirming the type of the object

In [None]:
# Dropping the null or missing values
recovered_cases_by_country.dropna(inplace=True)

In [None]:
# Sorting the values in descending order
recovered_cases_by_country.sort_values(by='TargetValue', ascending=False, inplace=True)

In [None]:
recovered_cases_by_country

In [None]:
# Bar chart of the countries ranked by the confirmed-minus-fatalities value
recovered_ax = recovered_cases_by_country.plot.bar(figsize=(12, 6))
recovered_ax.set_title('Top 10 Countries with Recovered Cases')
recovered_ax.set_xlabel('Country')
plt.xticks(rotation=45)
recovered_ax.set_ylabel('Recovered Cases')

# <font color = 'pink'><b>Feature Engineering</b></font>

In [None]:
train_df.head()

In [None]:
# Adding new features in training data
train_df['day'] = train_df['Date'].dt.day
train_df['month'] = train_df['Date'].dt.month
train_df['dayofweek'] = train_df['Date'].dt.dayofweek
train_df['dayofyear'] = train_df['Date'].dt.dayofyear
train_df['quarter'] = train_df['Date'].dt.quarter

# Adding same features in the testing data also
test_df['day'] = test_df['Date'].dt.day
test_df['month'] = test_df['Date'].dt.month
test_df['dayofweek'] = test_df['Date'].dt.dayofweek
test_df['dayofyear'] = test_df['Date'].dt.dayofyear
test_df['quarter'] = test_df['Date'].dt.quarter


In [None]:
train_df

In [None]:
# Encoding the Target and Country_Region columns as integer codes

from sklearn.preprocessing import LabelEncoder

# One encoder per column, so each fitted mapping stays available afterwards
# instead of being overwritten by the next fit.
target_encoder = LabelEncoder()
country_encoder = LabelEncoder()

# Fit every encoder once, on the labels of BOTH splits. Fitting separately on
# train and on test only yields matching codes when the two splits happen to
# contain exactly the same set of labels.
target_encoder.fit(pd.concat([train_df['Target'], test_df['Target']]))
country_encoder.fit(pd.concat([train_df['Country_Region'], test_df['Country_Region']]))

# Encoding the Target column in training and testing data
train_df['Target'] = target_encoder.transform(train_df['Target'])
test_df['Target'] = target_encoder.transform(test_df['Target'])

# Encoding the Country_Region column in training and testing data
train_df['Country_Region'] = country_encoder.transform(train_df['Country_Region'])
test_df['Country_Region'] = country_encoder.transform(test_df['Country_Region'])

# Keep the original `le` name bound, in case a cell outside this view uses it
le = country_encoder

In [None]:
def to_integer(dt_time):
    """Encode a date-like object as the integer YYYYMMDD.

    Args:
        dt_time: any object exposing ``year``, ``month`` and ``day``.

    Returns:
        ``year * 10000 + month * 100 + day``.
    """
    year_part = 10000 * dt_time.year
    month_part = 100 * dt_time.month
    return year_part + month_part + dt_time.day
# Run 'Date' through pd.to_datetime once more for both splits (the column was
# already converted in the preprocessing section)
train_df['Date'], test_df['Date'] = map(pd.to_datetime, (train_df['Date'], test_df['Date']))

In [None]:
def _as_yyyymmdd(dates):
    """Render datetime values as 'YYYYMMDD' text and cast that text to int."""
    return dates.dt.strftime("%Y%m%d").astype(int)


# Replace the datetime 'Date' column with its integer form in both splits
test_df['Date'] = _as_yyyymmdd(test_df['Date'])
train_df['Date'] = _as_yyyymmdd(train_df['Date'])

In [None]:
# Show the training frame as it stands after the encoding and date steps
train_df

In [None]:
# Column dtypes and non-null counts of the training frame
train_df.info()

In [None]:
# Show the testing frame after the same steps
test_df

In [None]:
# Column dtypes and non-null counts of the testing frame
test_df.info()

In [None]:
# Features are every column except the ones listed here; the label is TargetValue.
# NOTE(review): 'Target' is excluded, so the model cannot tell which kind of
# value (confirmed cases or fatalities) a row's TargetValue refers to —
# confirm this is intended.
NON_FEATURE_COLUMNS = ['Id', 'TargetValue', 'County', 'Province_State', 'Target']
X = train_df.drop(columns=NON_FEATURE_COLUMNS)
y = train_df['TargetValue']

In [None]:
# Splitting the dataset into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Checking the shape of the training and testing set
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Random forest with 100 trees and a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Standardise the features first, then pass them to the forest
pipeline_steps = [
    ('scaler2', StandardScaler()),
    ('RandomForestRegressor: ', model),
]
pipeline = Pipeline(pipeline_steps)

# Fit on the training part, then predict the held-out part
prediction = pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
prediction

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, prediction)

In [None]:
# Shapes of both frames at this point
train_df.shape, test_df.shape

In [None]:
# First rows of the training frame
train_df.head()

In [None]:
# First rows of the testing frame
test_df.head()

In [None]:
# Build the testing feature matrix by removing the id, location and Target columns
Xtest = test_df.drop(columns=['ForecastId', 'County', 'Province_State', 'Target'])

In [None]:
# Predict with the fitted pipeline rather than the bare forest: the forest was
# trained on StandardScaler output, so the raw testing features have to pass
# through the same scaler before they reach it.
ypred = pipeline.predict(Xtest)

In [None]:
# Show the predictions made for the testing features
ypred

# <font color = 'Yellow'><b>Model 2</b></font>

In [None]:
# First rows of the training frame, before selecting features for the second model
train_df.head()

In [None]:
# Column labels currently present in the training frame
train_df.columns

In [None]:
X2 = train_df.drop(['Id', 'TargetValue', 'County', 'Province_State', 'Target', 'Country_Region'], axis=1)
y2 = train_df['TargetValue']

In [None]:
X2.info()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Feature matrix without the id, label, location, Target and Country_Region
# columns; the label is TargetValue
X2 = train_df.drop(columns=['Id', 'TargetValue', 'County', 'Province_State', 'Target', 'Country_Region'])
y2 = train_df['TargetValue']

# 70/30 split with a fixed seed
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=42,
)

# Random forest with 100 trees and a fixed seed
model2 = RandomForestRegressor(n_estimators=100, random_state=42)

# Standardise the features, then fit the forest on the training part
steps2 = [('scaler2', StandardScaler()), ('RandomForestRegressor: ', model2)]
pipeline2 = Pipeline(steps2)
pipeline2.fit(X_train2, y_train2)

In [None]:
ypred2 = pipeline2.predict(X_test2)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test2, ypred2)

In [None]:
# Saving the model (currently disabled)
# NOTE(review): `model2` is only the forest step, which was fitted on
# StandardScaler output; pickling `pipeline2` instead would keep the scaler
# together with it — confirm which object should be saved before enabling.
# import pickle

# with open('covid_model.pkl', 'wb') as model_file:
#     pickle.dump(model2, model_file)