### This project is intended for Beginner Machine Learning/Data Analysis
- EE257 Machine Learning for Electrical Engineers - San Jose State University

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
# `dirname` stays bound to the last directory visited once the walk finishes.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

In [None]:
# Locate the dataset folder explicitly rather than relying on `dirname`, which
# is merely whichever directory the os.walk loop happened to visit last.
data_dir = next(
    (folder for folder, _, files in os.walk('/kaggle/input') if 'train.csv' in files),
    None,
)
if data_dir is None:
    raise FileNotFoundError("train.csv not found under /kaggle/input")

# Load the sample submission, the test set and the training set.
dfSubmission = pd.read_csv(os.path.join(data_dir, 'submission.csv'))
dfTest = pd.read_csv(os.path.join(data_dir, 'test.csv'))
dfTrain = pd.read_csv(os.path.join(data_dir, 'train.csv'))
dfSubmission.head()

In [None]:
# Preview the first rows of the test set.
dfTest.head()

In [None]:
# Dimensions of the test set as (rows, columns).
dfTest.shape

In [None]:
# Preview the first rows of the training set.
dfTrain.head()

In [None]:
# Pairwise correlation between the numeric columns of the training set.
# Non-numeric columns are filtered out first because recent pandas versions
# raise on them instead of silently skipping them.
dfTrain.select_dtypes(include='number').corr()

In [None]:
# Print a concise summary (columns, dtypes, non-null counts) of the training set.
dfTrain.info()

In [None]:
# Dimensions of the training set as (rows, columns).
dfTrain.shape

In [None]:
# Descriptive statistics of the training set's columns.
dfTrain.describe()

In [None]:
# Keep a separate reference to the target column of the training set.
TargetValue = dfTrain["TargetValue"]

## Dataset Cleaning - Dataset Concatenate

In [None]:
# Convert the Date columns to datetime64 with one vectorised call per frame.
# The format is inferred by pandas: a format string passed positionally through
# Series.apply lands in apply's own `convert_dtype` parameter and never reaches
# pd.to_datetime, so it must not be supplied that way.
dfTrain["Date"] = pd.to_datetime(dfTrain["Date"])
dfTest["Date"] = pd.to_datetime(dfTest["Date"])

In [None]:
'''confirmedTotal = dfTrain.groupby(['Date']).agg({'TargetValue':['sum']})
fatalTotal = dfTrain.groupby(['Date']).agg({'TargetValue':['sum']})
totalDate = confirmedTotal.join(fatalTotal)'''

In [None]:
#print(totalDate)

In [None]:
#confirmedCases = totalDate.iloc[0:100,0]
#print(confirmedCases)
#fatalities = totalDate.iloc[0:100,1]
#print(fatalities)

In [None]:
#cases = totalDate.iloc[0:100,1:2]
#print(cases)

In [None]:
#totalDate.to_csv(path+'/ConfirmFatal.csv',mode='a')

## Dataset Visualization
### Comparing Italy, China, and United States Total Confirmed Cases vs Fatalities

In [None]:
# Training rows whose Country_Region is Italy.
Italy = dfTrain.loc[dfTrain['Country_Region'].eq('Italy')]
Italy

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import style
Italy.set_index('Date', inplace=True)
Italy.groupby('Target')['TargetValue'].plot(legend=True)

In [None]:
# Training rows whose Country_Region is China.
China = dfTrain.loc[dfTrain['Country_Region'].eq('China')]
China

In [None]:
# Index by Date so time is on the x-axis. Rebinding (instead of inplace=True)
# avoids mutating a filtered slice of dfTrain, which raises
# SettingWithCopyWarning.
China = China.set_index('Date')
# Plot TargetValue over time, one line per Target category.
# NOTE(review): if the frame holds several rows per Date (e.g. sub-regions),
# each line will zig-zag across them - confirm whether aggregation is wanted.
China.groupby('Target')['TargetValue'].plot(legend=True)

In [None]:
# Training rows whose Country_Region is US.
US = dfTrain.loc[dfTrain['Country_Region'].eq('US')]
US

In [None]:
# Index by Date so time is on the x-axis. Rebinding (instead of inplace=True)
# avoids mutating a filtered slice of dfTrain, which raises
# SettingWithCopyWarning.
US = US.set_index('Date')
# Plot TargetValue over time, one line per Target category.
# NOTE(review): if the frame holds several rows per Date (e.g. sub-regions),
# each line will zig-zag across them - confirm whether aggregation is wanted.
US.groupby('Target')['TargetValue'].plot(legend=True)

In [None]:
# Training rows whose Province_State is California.
CA = dfTrain.loc[dfTrain['Province_State'].eq('California')]
CA

In [None]:
# Index by Date so time is on the x-axis. Rebinding (instead of inplace=True)
# avoids mutating a filtered slice of dfTrain, which raises
# SettingWithCopyWarning.
CA = CA.set_index('Date')
# Plot TargetValue over time, one line per Target category.
# NOTE(review): if the frame holds several rows per Date (e.g. sub-regions),
# each line will zig-zag across them - confirm whether aggregation is wanted.
CA.groupby('Target')['TargetValue'].plot(legend=True)

In [None]:
# Training rows whose Province_State is New York.
NY = dfTrain.loc[dfTrain['Province_State'].eq('New York')]
NY

In [None]:
# Index by Date so time is on the x-axis. Rebinding (instead of inplace=True)
# avoids mutating a filtered slice of dfTrain, which raises
# SettingWithCopyWarning.
NY = NY.set_index('Date')
# Plot TargetValue over time, one line per Target category.
# NOTE(review): if the frame holds several rows per Date (e.g. sub-regions),
# each line will zig-zag across them - confirm whether aggregation is wanted.
NY.groupby('Target')['TargetValue'].plot(legend=True)

In [None]:
# Training rows whose Province_State is Texas.
TX = dfTrain.loc[dfTrain['Province_State'].eq('Texas')]
TX

In [None]:
# Index by Date so time is on the x-axis. Rebinding (instead of inplace=True)
# avoids mutating a filtered slice of dfTrain, which raises
# SettingWithCopyWarning.
TX = TX.set_index('Date')
# Plot TargetValue over time, one line per Target category.
# NOTE(review): if the frame holds several rows per Date (e.g. sub-regions),
# each line will zig-zag across them - confirm whether aggregation is wanted.
TX.groupby('Target')['TargetValue'].plot(legend=True)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import style
dfTrain.hist(bins = 20, figsize = (10,10)) 
plt.show()

### Observations:
- The data may be skewed because important information is missing: the total number of COVID19 tests conducted, as well as the number of people who tested positive or negative in each country. Testing kits are highly scarce at the moment - the Bay Area is only testing people who show COVID19 symptoms such as fever - although companies such as Cepheid and Abbott are creating kits that can give results within a few hours.
- It is also worth noting that New York will now start to report probable COVID19 deaths in addition to confirmed COVID19 deaths so the fatality rate may be exponentially higher within the next few days.
--https://nypost.com/2020/04/07/scores-of-probable-coronavirus-deaths-are-not-being-counted-by-the-city/
- It was interesting to see that California shows lower growth in infected cases than New York, even though California's first confirmed case came approximately 10 days before New York's. One possible reason is New York's denser population compared to California.

## Data Extraction

In [None]:
# Remove the County, Province_State, Country_Region and Target columns from
# both frames, then preview what is left of the training set.
dropped_columns = ['County', 'Province_State', 'Country_Region', 'Target']
dfTrain = dfTrain.drop(columns=dropped_columns)
dfTest = dfTest.drop(columns=dropped_columns)
dfTrain.head()

In [None]:
# Record the date span of the test set before the column is re-encoded.
test_date_min = dfTest['Date'].min()
test_date_max = dfTest['Date'].max()

# Encode dates as YYYYMMDD integers. Both frames get the same integer encoding
# so the Date feature has the same dtype at fit time and at predict time
# (leaving the test column as strings would make the two frames disagree).
dfTrain['Date'] = pd.to_datetime(dfTrain['Date']).dt.strftime("%Y%m%d").astype(int)
dfTest['Date'] = pd.to_datetime(dfTest['Date']).dt.strftime("%Y%m%d").astype(int)

In [None]:
# Remove the ForecastId column from the test set and name its row index 'Id'.
dfTest = dfTest.drop(columns=['ForecastId'])
dfTest.index.name = 'Id'
dfTest.head()

In [None]:
from sklearn.model_selection import train_test_split
# Features: every training column except the target and the row Id.
X = dfTrain.drop(columns=['TargetValue', 'Id'])
# Target: the TargetValue column.
y = dfTrain["TargetValue"]
# Hold out 25% of the rows for evaluation; the seed is fixed so the split is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=50,
)

## 1. Prediction - Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Baseline random forest: 100 trees, using all available cores.
RFR = RandomForestRegressor(n_estimators=100, n_jobs=-1)
RFR.fit(X_train, y_train)
# For a regressor, .score() returns the R^2 coefficient of determination.
accuracy_train, accuracy_test = (
    RFR.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')

In [None]:
# Trying out Boosting performance
# Edit: Since RandomForest has better performance than this, we will be using
#       Random Forest to fit the test data
Boost = XGBRegressor(n_estimators=1000,random_state=0,n_jobs=-1)
Boost.fit(X_train,y_train)
# Score each split exactly once (a bare, unassigned Boost.score call would only
# repeat the test-set evaluation and throw the result away).
accuracy_train = Boost.score(X_train, y_train)
accuracy_test = Boost.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')

In [None]:
# Random forest with each tree capped at 1000 leaf nodes.
RFR2 = RandomForestRegressor(
    n_jobs=-1,
    n_estimators=100,
    min_samples_split=2,
    max_leaf_nodes=1000,
).fit(X_train, y_train)
# .score() on a regressor is R^2, reported here for both splits.
accuracy_train = RFR2.score(X_train, y_train)
accuracy_test = RFR2.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')

In [None]:
# Random forest with min_samples_split=2 and no cap on leaf nodes.
RFR2 = RandomForestRegressor(
    n_jobs=-1,
    n_estimators=100,
    min_samples_split=2,
).fit(X_train, y_train)
# .score() on a regressor is R^2, reported here for both splits.
accuracy_train = RFR2.score(X_train, y_train)
accuracy_test = RFR2.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')

In [None]:
# Random forest requiring at least 4 samples to split a node, with each tree
# capped at 1000 leaf nodes.
RFR2 = RandomForestRegressor(
    n_jobs=-1,
    n_estimators=100,
    min_samples_split=4,
    max_leaf_nodes=1000,
).fit(X_train, y_train)
# .score() on a regressor is R^2, reported here for both splits.
accuracy_train = RFR2.score(X_train, y_train)
accuracy_test = RFR2.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')

In [None]:
# Random forest with min_samples_split=4, at most 1000 leaves per tree, and
# out-of-bag scoring enabled. oob_score must be a single flag, not a list of
# candidate values: a list is not a valid value and is rejected by
# scikit-learn's parameter validation.
RFR2 = RandomForestRegressor(n_jobs=-1,n_estimators=100,
                           min_samples_split = 4,max_leaf_nodes = 1000,
                            oob_score = True)
RFR2.fit(X_train,y_train)
# .score() on a regressor is R^2, reported here for both splits.
accuracy_train = RFR2.score(X_train, y_train)
accuracy_test = RFR2.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')

In [None]:
# Author's observation: raising min_samples_split and max_leaf_nodes gave
# roughly a 0.2% gain in training performance, at the cost of longer
# computing time.
RFR2 = RandomForestRegressor(
    n_jobs=-1,
    n_estimators=100,
    min_samples_split=2,
    max_leaf_nodes=1000,
    oob_score=True,
).fit(X_train, y_train)
# .score() on a regressor is R^2, reported here for both splits.
accuracy_train = RFR2.score(X_train, y_train)
accuracy_test = RFR2.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')

In [None]:
# Final model used for the test-set prediction: 100 trees, each capped at
# 1000 leaf nodes.
RFR = RandomForestRegressor(n_jobs=-1,n_estimators=100,
                           min_samples_split = 2,max_leaf_nodes = 1000)
RFR.fit(X_train,y_train)
# Score the model that was just fitted (RFR), not the earlier RFR2 experiment,
# so the printed R^2 values describe the model that will make the predictions.
accuracy_train = RFR.score(X_train, y_train)
accuracy_test = RFR.score(X_test, y_test)
print(f'train accuracy: {accuracy_train:.3f}')
print(f'test accuracy : {accuracy_test:.3f}')

In [None]:
# Predict on the prepared test frame.
prediction = RFR.predict(dfTest)

# Truncate every prediction to a plain Python int.
pred_list = list(map(int, prediction))

# Pair each prediction with the test frame's index value as its Id.
output = pd.DataFrame({'Id': dfTest.index, 'TargetValue': pred_list})
print(output)

In [None]:
# Per-Id 5%, 50% and 95% quantiles of the predicted TargetValue.
# NOTE(review): if `output` holds a single row per Id, all three quantiles
# equal that row's prediction - confirm this is the intended submission.
per_id = output.groupby(['Id'])['TargetValue']
a=per_id.quantile(q=0.05).reset_index()
b=per_id.quantile(q=0.5).reset_index()
c=per_id.quantile(q=0.95).reset_index()

a.columns=['Id','q0.05']
b.columns=['Id','q0.5']
c.columns=['Id','q0.95']
# Put the three quantile columns side by side. `axis` is passed by keyword
# because pd.concat does not accept it positionally in current pandas.
a=pd.concat([a,b['q0.5'],c['q0.95']],axis=1)
# Shift every Id up by one.
# NOTE(review): presumably maps a 0-based row index onto 1-based ForecastIds -
# verify against the sample submission.
a['Id'] = a['Id'] + 1
a

In [None]:
# Reshape the wide quantile table to one row per (Id, quantile).
melted = pd.melt(a, id_vars=['Id'], value_vars=['q0.05','q0.5','q0.95'])
# Strip the leading "q" so the label is just the quantile number.
quantile_label = melted['variable'].str.replace("q","", regex=False)
# Build the two submission columns: "<Id>_<quantile>" and its value.
sub = pd.DataFrame({
    'ForecastId_Quantile': melted['Id'].astype(str) + '_' + quantile_label,
    'TargetValue': melted['value'],
}).reset_index(drop=True)
sub.to_csv("submission.csv",index=False)
sub