In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 
from sklearn import metrics
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Directory holding the competition CSV files; the read_csv calls below build
# their file paths from it.
PATH ='/kaggle/input/covid19-global-forecasting-week-3'

In [None]:
# Training data: read train.csv from the directory named by PATH and preview
# the first rows.
train = pd.read_csv(f'{PATH}/train.csv')
train.head()

In [None]:
train.info()

### Visualizing Training Data

In [None]:
# Total confirmed cases per country.
# NOTE(review): ConfirmedCases is treated as a running (cumulative) count, which
# is how the top-20 aggregation in this notebook handles it (max over dates per
# province) -- confirm against the dataset description. Summing it over every
# date would add the same cases once per day, so the latest (peak) value per
# province is taken first and those peaks are summed per country.
total_countries_covid = (
    train.assign(Province_State=train['Province_State'].fillna('NA'))
    .groupby(['Country_Region', 'Province_State'])['ConfirmedCases'].max()
    .groupby(level='Country_Region').sum()
)


In [None]:
# Per-country confirmed-case totals, largest first:
#   1. sum ConfirmedCases per (country, province, date),
#   2. keep the maximum over dates for each (country, province),
#   3. sum those maxima per country and sort descending.
# Missing values are replaced by the string 'NA' first so that rows without a
# province are not dropped by the groupby.
df = (
    train.fillna('NA')
    .groupby(['Country_Region', 'Province_State', 'Date'])['ConfirmedCases'].sum()
    .groupby(['Country_Region', 'Province_State']).max()
    .sort_values()
    .groupby(['Country_Region']).sum()
    .sort_values(ascending=False)
)

# Keep the 20 largest countries as a one-column DataFrame for plotting.
top20_countries = df.head(20).to_frame()
top20_countries

In [None]:
# Bar chart of the 20 countries with the most confirmed cases.
# The axes are created explicitly and passed to pandas: DataFrame.plot opens a
# figure of its own when no `ax` is given, so a preceding plt.figure(figsize=...)
# only produces an empty figure and its size never reaches the chart.
fig, ax = plt.subplots(figsize=(10, 10))

top20_countries.plot(kind='bar', ax=ax)
# Label size raised from 5pt so the axis titles stay legible on a 10x10 figure.
ax.set_xlabel('Country', fontsize=12)
ax.set_ylabel('No of Confirmed Cases', fontsize=12)
ax.set_title('Confirmed Cases By Country')
plt.show()

In [None]:
import seaborn as sns
#ax = sns.barplot('Country_Region'.index, y="ConfirmedCases", data= top20_countries,palette="Blues_d")

In [None]:
total_countries_covid.head()

In [None]:
# Test data: read test.csv from the directory named by PATH and preview the
# first rows.
test = pd.read_csv(f'{PATH}/test.csv')
test.head()

In [None]:
test.info()

### Splitting training data into input and output

In [None]:
# Positional split of the training frame (all rows kept):
#   X -> the first 4 columns, used as input values
#   y -> every remaining column, used as output values
X, y = train.iloc[:, :4], train.iloc[:, 4:]
print(X.shape, y.shape, sep='\n')

In [None]:
X.head()

In [None]:
y.head()

In [None]:
# Copy of the input columns with missing values replaced by the string 'NA'.
# NOTE(review): the result is bound to lowercase `x` (a different name from `X`)
# and is not referenced again in the visible part of the notebook.
x = X.fillna('NA')

In [None]:
# Parse the Date columns of both data sets into datetime64 values.
# X_train is an alias of `train` (no copy is made), so the conversion below is
# also visible through `train`.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0,
# where strict format inference is the default and the flag has no effect.
X_train = train
X_train['Date'] = pd.to_datetime(X_train['Date'])
test['Date'] = pd.to_datetime(test['Date'])


In [None]:
X_train.info()

In [None]:

# Sentinel written into Province_State where the original value is missing.
EMPTY_VAL = "EMPTY_VAL"


def fillState(Province_State, Country_Region):
    """Return the province name, falling back to the country name.

    Parameters
    ----------
    Province_State : value of the row's Province_State field; equal to
        EMPTY_VAL when the row has no province.
    Country_Region : value of the row's Country_Region field.

    Returns
    -------
    Country_Region when Province_State equals EMPTY_VAL, otherwise
    Province_State unchanged.
    """
    return Country_Region if Province_State == EMPTY_VAL else Province_State

In [None]:
# Build the model-ready training frame from a copy, so X_train keeps its
# original values.
X_train_new = X_train.copy()

# Rows without a province get the country name as their province.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# a column selection is a chained operation that does not update the parent
# frame once pandas copy-on-write is active.
X_train_new['Province_State'] = X_train_new['Province_State'].fillna(EMPTY_VAL)
X_train_new['Province_State'] = X_train_new[['Province_State', 'Country_Region']].apply(
    lambda row: fillState(row['Province_State'], row['Country_Region']), axis=1)

# Encode the date as the integer MMDD (e.g. 22 January -> 122).
# Plain column assignment replaces the whole column; writing the strings through
# .loc[:, 'Date'] would try to store them inside the existing datetime64 column.
X_train_new['Date'] = X_train_new['Date'].dt.strftime("%m%d").astype(int)

X_train_new.head()

In [None]:
# Preprocessing: replace the country and province strings of the training frame
# with integer codes.
# The same encoder object `l` is refit for each column, so after this cell it
# holds only the Province_State mapping.
l = preprocessing.LabelEncoder()

X_train_new['Country_Region'] = l.fit_transform(X_train_new['Country_Region'])
X_train_new['Province_State'] = l.fit_transform(X_train_new.Province_State)

X_train_new.head()


In [None]:
# Preprocessing of the test data, mirroring the steps applied to the training
# frame. Works on a copy so `test` keeps its original values.
X_Test = test.copy()

# Rows without a province get the country name as their province.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# a column selection is a chained operation that does not update the parent
# frame once pandas copy-on-write is active.
X_Test['Province_State'] = X_Test['Province_State'].fillna(EMPTY_VAL)
X_Test['Province_State'] = X_Test[['Province_State', 'Country_Region']].apply(
    lambda row: fillState(row['Province_State'], row['Country_Region']), axis=1)

# Encode the date as the integer MMDD (e.g. 22 January -> 122).
# Plain column assignment replaces the whole column; writing the strings through
# .loc[:, 'Date'] would try to store them inside the existing datetime64 column.
X_Test['Date'] = X_Test['Date'].dt.strftime("%m%d").astype(int)

X_Test.head()

In [None]:
X_Test.info()

In [None]:
# Encode the test labels with mappings learned from the TRAINING labels.
# Refitting an encoder on the test columns would only produce the same integer
# codes as the training frame if both label sets happened to be identical; the
# per-region modelling loop matches train and test rows on these codes, so any
# difference would silently pair the wrong regions. With fit-on-train /
# transform-on-test, a label that is absent from the training data raises a
# ValueError instead.
# X_train still holds the raw strings; missing provinces fall back to the
# country name, matching the fill applied during preprocessing.
country_encoder = preprocessing.LabelEncoder().fit(X_train['Country_Region'])
province_encoder = preprocessing.LabelEncoder().fit(
    X_train['Province_State'].fillna(X_train['Country_Region']))

X_Test['Country_Region'] = country_encoder.transform(X_Test['Country_Region'])
X_Test['Province_State'] = province_encoder.transform(X_Test['Province_State'])

X_Test.head()


In [None]:
# Inspect the training rows of one country (Afghanistan).
# NOTE: a notebook cell renders only its last expression, so the head() call on
# the next line produces no visible output.
X_train.head()
X_train.loc[X_train.Country_Region == 'Afghanistan', :]

In [None]:
# Distinct Country_Region values of the preprocessed training frame; the
# modelling loop iterates over them.
country =  X_train_new.Country_Region.unique()

In [None]:
X_train_new.info()

### Model 1: XGBoost Regression (one model per region)

In [None]:
# Per-region forecasting: for every (country, province) pair in the training
# frame, fit one XGBoost regressor on ConfirmedCases and one on Fatalities, then
# predict the rows of the test frame that carry the same country/province codes.
# The result is `pred`, with columns ForecastId, ConfirmedCases and Fatalities.

# NOTE(review): `sm` is not referenced anywhere in the visible notebook; the
# import is kept only in case a cell outside this view relies on it.
import statsmodels.api as sm

N_ESTIMATORS = 900  # number of boosting rounds used by every per-region model
FEATURE_COLS = ['Province_State', 'Country_Region', 'Date']


def _fit_predict(features, target, features_to_score):
    """Fit a fresh XGBRegressor and return its predictions.

    Parameters
    ----------
    features : training feature frame for one region.
    target : training target values aligned with `features`.
    features_to_score : feature frame to predict, same columns as `features`.

    Returns
    -------
    The array returned by the fitted model's predict().
    """
    model = XGBRegressor(n_estimators=N_ESTIMATORS)
    model.fit(features, target)
    return model.predict(features_to_score)


# One small frame per region is collected and concatenated once after the loop.
# Growing `pred` with pd.concat on every iteration copies all previous rows each
# time, and seeding it with an empty DataFrame can change the dtype of
# ForecastId in the result.
region_preds = []

for i in country:
    country_rows = X_train_new.loc[X_train_new.Country_Region == i]
    for j in country_rows.Province_State.unique():
        region_train = country_rows.loc[country_rows.Province_State == j]
        Y_train1 = region_train['ConfirmedCases']
        Y_train2 = region_train['Fatalities']

        # .copy() gives an independent frame, so the column assignments below do
        # not write into a slice of X_train_new.
        X_train_1 = region_train[FEATURE_COLS].copy()
        # Within one region both label columns hold a single value, so refitting
        # the encoder maps them to the constant 0; Date is the only feature
        # that varies.
        X_train_1['Country_Region'] = l.fit_transform(X_train_1['Country_Region'])
        X_train_1['Province_State'] = l.fit_transform(X_train_1['Province_State'])

        region_test = X_Test.loc[(X_Test.Country_Region == i) & (X_Test.Province_State == j)]
        if region_test.empty:
            # No test rows carry this code pair: there is nothing to forecast.
            continue

        X_test_Id = region_test['ForecastId']
        X_test_1 = region_test[FEATURE_COLS].copy()
        X_test_1['Country_Region'] = l.fit_transform(X_test_1['Country_Region'])
        X_test_1['Province_State'] = l.fit_transform(X_test_1['Province_State'])

        region_preds.append(pd.DataFrame({
            'ForecastId': X_test_Id,
            'ConfirmedCases': _fit_predict(X_train_1, Y_train1, X_test_1),
            'Fatalities': _fit_predict(X_train_1, Y_train2, X_test_1),
        }))

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no region produced predictions.
pred = (
    pd.concat(region_preds, axis=0)
    if region_preds
    else pd.DataFrame(columns=['ForecastId', 'ConfirmedCases', 'Fatalities'])
)

In [None]:
X_train_1.info()

In [None]:
# Turn the raw regressor outputs into whole, non-negative counts.
# A regressor's output is not bounded below, so a prediction can fall slightly
# under zero; such values are clipped to 0 before rounding because a negative
# case or fatality count is not a valid forecast.
pred['ConfirmedCases'] = pred['ConfirmedCases'].clip(lower=0).round()
pred['Fatalities'] = pred['Fatalities'].clip(lower=0).round()

pred.head()

In [None]:
pred.info()

In [None]:
# Write the predictions to submission.csv; index=False leaves the DataFrame
# index out of the file.
pred.to_csv('submission.csv', index=False)