In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the dataset location can be checked before loading.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Dataset import: read the CSV from the read-only Kaggle input directory.
DATA_PATH = '../input/covid19-data-set-of-turkey/tr-covid19.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows to confirm the columns loaded as expected.
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV — confirm).
# Reassigning instead of using inplace=True, and ignoring an already-missing
# column, keeps this cell re-runnable: a second execution no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Dataset backup: keep an independent snapshot of the frame as it is at this
# point, before columns are renamed and re-typed below.
df_copy = df.copy(deep=True)
df_copy.head()

In [None]:
# Short column names used throughout the rest of the notebook, keyed by the
# original CSV headers. Headers that are not present are simply left untouched
# by DataFrame.rename.
COLUMN_NAMES = {
    "Total number of test": "totaltest",
    "Total number of causes": "totalcauses",
    "Total number of deaths": "totaldeaths",
    "Number of critically ill patients": "totalcritically",
    "Total Number of recoveries": "totalrecoveries",
    "number of cases": "dailycases",
    "number of test": "dailytest",
    "number of deaths": "dailydeaths",
    "Number of recoveries": "dailyrecoveries",
    "Date": 'date',
    "number of sick": 'dailysick',
}
df = df.rename(columns=COLUMN_NAMES)

In [None]:
# Preview the first rows to verify the renamed columns.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Preview the first rows again; 'date' has not been parsed to datetime yet at this point.
df.head()

In [None]:
from datetime import datetime,date


# Parse the day.month.year strings into real timestamps so the column can be
# used on time axes and for resampling.
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')

# Show the first rows with the converted column highlighted.
date_highlight = {'background-color': 'red', 'color': 'white'}
df.head().style.set_properties(subset=['date'], **date_highlight)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# One stacked panel per non-date column. The number of rows is derived from the
# frame instead of being hard-coded to 10, so the loop can never index past the
# axes array if the column count differs; the height scales at 3 inches per
# panel (30 inches for 10 columns, as before).
feature_columns = df.drop('date', axis=1).columns
f, ax = plt.subplots(nrows=len(feature_columns), ncols=1,
                     figsize=(20, 3 * len(feature_columns)))
# plt.subplots returns a bare Axes (not an array) when there is a single panel.
ax = np.atleast_1d(ax)

for i, column in enumerate(feature_columns):
    # Forward-fill gaps for display only; df itself is not modified.
    # Series.ffill() replaces the deprecated fillna(method='ffill').
    sns.lineplot(x=df['date'], y=df[column].ffill(), ax=ax[i], color='dodgerblue')
    ax[i].set_title('Feature: {}'.format(column), fontsize=14)
    ax[i].set_ylabel(ylabel=column, fontsize=14)

    ax[i].set_xlim([date(2020, 3, 11), date(2021, 5, 26)])

In [None]:
import plotly.express as px
# Date window (inclusive bounds, as date strings) passed as range_x to the
# plotly charts in the following cells.
startDate, endDate = '2020-11-25', '2021-5-26'

In [None]:
# Interactive line chart of 'dailycases', with the x-axis limited to the
# startDate..endDate window.
fig = px.line(
    data_frame=df,
    x='date',
    y='dailycases',
    range_x=[startDate, endDate],
)
fig.show()

In [None]:
# Interactive line chart of 'dailytest', with the x-axis limited to the
# startDate..endDate window.
fig = px.line(
    data_frame=df,
    x='date',
    y='dailytest',
    range_x=[startDate, endDate],
)
fig.show()

In [None]:
# Interactive line chart of 'dailydeaths', with the x-axis limited to the
# startDate..endDate window.
fig = px.line(
    data_frame=df,
    x='date',
    y='dailydeaths',
    range_x=[startDate, endDate],
)
fig.show()

In [None]:
# Interactive line chart of 'dailytest' over the startDate..endDate window.
# NOTE(review): this plots the same column and window as an earlier cell in
# this notebook — possibly a different series was intended here; confirm.
fig = px.line(
    data_frame=df,
    x='date',
    y='dailytest',
    range_x=[startDate, endDate],
)
fig.show()

In [None]:
# Line chart of 'dailysick' with a draggable range slider under the x-axis and
# quick-select buttons (1 month, 6 months, year-to-date, 1 year, all data).
fig = px.line(df, x='date', y='dailysick', title='Time Series with Range Slider and Selectors')

range_buttons = [
    {"count": 1, "label": "1m", "step": "month", "stepmode": "backward"},
    {"count": 6, "label": "6m", "step": "month", "stepmode": "backward"},
    {"count": 1, "label": "YTD", "step": "year", "stepmode": "todate"},
    {"count": 1, "label": "1y", "step": "year", "stepmode": "backward"},
    {"step": "all"},
]
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector={"buttons": range_buttons},
)
fig.show()

In [None]:
# Plot every series except 'totaltest' against the date, with one tick per
# month whose label is centred on the month ("period" tick label mode).
#
# 'date' is excluded from the y columns as well: it is already the x-axis, and
# leaving it in y mixes a datetime column with numeric ones in plotly express'
# wide-form input (and would otherwise draw date against itself).
value_columns = df.drop(columns=["date", "totaltest"]).columns
fig = px.line(df, x="date", y=value_columns,
              hover_data={"date": "|%B %d, %Y"},
              title='custom tick labels with ticklabelmode="period"')
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y",
    ticklabelmode="period")
fig.show()

In [None]:
import plotly.graph_objects as go

# Tick label format per zoom level: each entry is (dtick range, format string).
# The range bounds are tick intervals in milliseconds or "M<n>" month steps;
# None leaves that side of the range open.
zoom_formats = [
    ((None, 1000), "%H:%M:%S.%L ms"),
    ((1000, 60000), "%H:%M:%S s"),
    ((60000, 3600000), "%H:%M m"),
    ((3600000, 86400000), "%H:%M h"),
    ((86400000, 604800000), "%e. %b d"),
    ((604800000, "M1"), "%e. %b w"),
    (("M1", "M12"), "%b '%y M"),
    (("M12", None), "%Y Y"),
]

# Scatter trace of 'dailysick' over time, with a range slider and
# zoom-dependent tick labels.
fig = go.Figure(data=go.Scatter(x=df['date'], y=df['dailysick']))

fig.update_xaxes(
    rangeslider_visible=True,
    tickformatstops=[
        dict(dtickrange=list(bounds), value=fmt) for bounds, fmt in zoom_formats
    ],
)

fig.show()

In [None]:
# Put the rows in chronological order (the original index labels are kept).
df = df.sort_values('date')

# Check time intervals: gap between each row's date and the previous row's
# date; the first row has no predecessor and gets NaT.
df['delta'] = df['date'].diff()

df[['date', 'delta']].head()

In [None]:
# Total time spanned by the row-to-row gaps, and the number of non-missing gaps.
# If every gap is exactly one day, the sum in days equals the count.
df['delta'].sum(), df['delta'].count()

In [None]:
# The helper column has served its purpose; remove it, then count the missing
# values per column.
df = df.drop(columns='delta')
df.isna().sum()

In [None]:
# Treat 0 in 'totaltest' as missing and compare the series before and after.
# NOTE: this mutates df['totaltest'] in place; later cells see the NaN version.
plt.figure(figsize=(15,7))
old_totaltest = df['totaltest'].copy()
df['totaltest'] = df['totaltest'].replace(0, np.nan)

# Orange: the series as it was before the replacement. Blue: the modified
# series, with the new NaNs swapped for +inf for plotting only.
sns.lineplot(x=df['date'], y=old_totaltest, color='darkorange', label='original')
sns.lineplot(x=df['date'], y=df['totaltest'].fillna(np.inf), color='dodgerblue', label='modified')
plt.title('Total Test Graphic')
# The x-axis shows dates, not the test count.
plt.xlabel('date')
plt.ylabel('totaltest')
plt.xlim([date(2020, 3, 27), date(2021, 5,26)])
plt.show()

In [None]:
# Treat 0 in 'totalcritically' as missing and compare the series before and after.
# NOTE: this mutates df['totalcritically'] in place; later cells see the NaN version.
plt.figure(figsize=(15,7))
old_totalcritically = df['totalcritically'].copy()
df['totalcritically'] = df['totalcritically'].replace(0, np.nan)

sns.lineplot(x=df['date'], y=old_totalcritically, color='darkorange', label='original')
# The 'modified' line must show the column cleaned in this cell; the earlier
# version plotted df['totaltest'] here, comparing two unrelated series.
sns.lineplot(x=df['date'], y=df['totalcritically'].fillna(np.inf), color='dodgerblue', label='modified')
plt.title('Total Critically Graphic')
# The x-axis shows dates, not the patient count.
plt.xlabel('date')
plt.ylabel('totalcritically')
plt.xlim([date(2020, 3, 27), date(2021,5,26)])
plt.show()

In [None]:
# Missing Values: one heatmap row per column and one cell per record;
# cells are shaded where the value is NaN.

f, ax = plt.subplots(nrows=1, ncols=1, figsize=(16,5))

# Draw on the axes created above explicitly rather than relying on "current axes".
sns.heatmap(df.T.isna(), cmap='Blues', ax=ax)
ax.set_title('Missing Values', fontsize=16)

# Enlarge the y tick labels (the column names). tick_params replaces the
# per-tick `tick.label` attribute, which newer matplotlib releases no longer provide.
ax.tick_params(axis='y', labelsize=14)
plt.show()

In [None]:
# 3x2 grid: raw daily series (row 0), 7-day resample (row 1) and monthly
# resample (row 2). The left column shows 'dailytest' aggregated with sum,
# the right column shows 'dailycases' aggregated with mean.
fig, ax = plt.subplots(ncols=2, nrows=3, sharex=True, figsize=(16,12))

# (grid column, source column, resample aggregation, titles for rows 0..2)
panels = [
    (0, 'dailytest', 'sum', ('dailytest', 'Weekly Daily Test', 'Monthly Daily Test')),
    (1, 'dailycases', 'mean', ('Daily Cases', 'Weekly Daily Cases', 'Monthly Daily Cases')),
]

for grid_col, column, how, titles in panels:
    # x/y are passed by keyword: recent seaborn releases reject positional
    # x/y, and every other cell in this notebook already uses keywords.
    sns.lineplot(x=df['date'], y=df[column], color='dodgerblue', ax=ax[0, grid_col])

    for grid_row, rule in enumerate(('7D', 'M'), start=1):
        resampled_df = (
            df[['date', column]]
            .resample(rule, on='date')
            .agg(how)
            .reset_index(drop=False)
        )
        sns.lineplot(x=resampled_df['date'], y=resampled_df[column],
                     color='dodgerblue', ax=ax[grid_row, grid_col])

    for grid_row, title in enumerate(titles):
        ax[grid_row, grid_col].set_title(title, fontsize=14)
        ax[grid_row, grid_col].set_xlim([date(2021, 1, 1), date(2021,5,26)])

plt.show()

In [None]:
# Window length, in rows, for the rolling mean / std drawn below.
# NOTE(review): the value 52 was chosen with "52 weeks per year" in mind, but
# .rolling(52) spans 52 rows — presumably 52 days if the frame has one row per
# day; confirm the intended horizon.
rolling_window = 52
f, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))

# (source column, panel title, y-axis label) for the top and bottom panels.
panel_specs = (
    ('dailytest',
     'Daily Test: Non-stationary \nnon-constant mean & non-constant variance',
     'Daily Test'),
    ('dailycases',
     'Daily Cases: Non-stationary \nvariance is time-dependent (seasonality)',
     'Daily Cases'),
)

for axis, (column, title, ylabel) in zip(ax, panel_specs):
    windowed = df[column].rolling(rolling_window)

    # Raw series, then its rolling mean and rolling standard deviation.
    sns.lineplot(x=df['date'], y=df[column], ax=axis, color='dodgerblue')
    sns.lineplot(x=df['date'], y=windowed.mean(), ax=axis, color='red', label='rolling mean')
    sns.lineplot(x=df['date'], y=windowed.std(), ax=axis, color='orange', label='rolling std')

    axis.set_title(title, fontsize=14)
    axis.set_ylabel(ylabel=ylabel, fontsize=14)
    axis.set_xlim([date(2021, 1, 1), date(2021, 5, 26)])

plt.tight_layout()
plt.show()

In [None]:
# Wide, short strip chart of 'dailycases', limited to 2020-11-25 .. 2021-05-26.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 3))

window_start, window_end = date(2020, 11, 25), date(2021, 5, 26)
sns.lineplot(x=df['date'], y=df['dailycases'], color='red', ax=ax)
ax.set_xlim([window_start, window_end])
plt.show()

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Number of train/validation folds produced by the time-ordered splitter.
N_SPLITS = 3
folds = TimeSeriesSplit(n_splits=N_SPLITS)

# The split is demonstrated on the date column (X) and the daily case counts (y).
X = df['date']
y = df['dailycases']

In [None]:
# One row of panels per fold.
#   left  : the whole training range for the fold, followed by its validation range
#   right : only the last len(validation) training rows, followed by the validation range
f, ax = plt.subplots(nrows=N_SPLITS, ncols=2, figsize=(16, 9))

for i, (train_index, valid_index) in enumerate(folds.split(X)):
    # folds.split yields *positional* indices, so select with .iloc. Plain
    # X[train_index] does a label lookup, which picks the wrong rows whenever
    # the index is not 0..n-1 in row order (df was sorted by date without
    # resetting its index).
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    sns.lineplot(
        x=X_train,
        y=y_train,
        ax=ax[i, 0],
        color='dodgerblue',
        label='train'
    )

    # Tail of the training range with the same length as the validation range.
    tail_start = len(X_train) - len(X_valid)
    sns.lineplot(
        x=X_train.iloc[tail_start:],
        y=y_train.iloc[tail_start:],
        ax=ax[i, 1],
        color='dodgerblue',
        label='train'
    )

    for j in range(2):
        sns.lineplot(x=X_valid, y=y_valid, ax=ax[i, j], color='darkorange', label='validation')
    ax[i, 0].set_title(f"Rolling Window with Adjusting Training Size (Split {i+1})", fontsize=16)
    ax[i, 1].set_title(f"Rolling Window with Constant Training Size (Split {i+1})", fontsize=16)

# Same x-range on every panel so the folds can be compared visually.
for i in range(N_SPLITS):
    ax[i, 0].set_xlim([date(2020, 11, 25), date(2021, 5,17)])
    ax[i, 1].set_xlim([date(2020, 11, 25), date(2021, 5,17)])

plt.tight_layout()
plt.show()

In [None]:
from random import Random, randrange
from pandas import Series
from matplotlib import pyplot
from statsmodels.tsa.seasonal import seasonal_decompose

# Synthetic demo series: an upward ramp 1..99 plus integer noise in 0..9.
# A dedicated, fixed-seed generator makes the figure identical on every
# Restart & Run All without touching the global random state.
rng = Random(42)
series = [i + rng.randrange(10) for i in range(1, 100)]

# Additive decomposition into trend / seasonal / residual components.
# NOTE(review): this runs on the synthetic series, not on df, and uses
# period=1 — confirm whether decomposing a df column with a real period was intended.
result = seasonal_decompose(series, model='additive', period=1)
result.plot()
pyplot.show()

In [None]:
# Preview the first rows of the frame in its current state.
df.head()

In [None]:
import seaborn as sns
# Distribution plot of 'totaltest'; the trailing ';' suppresses the returned object's repr.
sns.displot(df['totaltest']);

In [None]:
# Scatter plot: 'dailycases' (y, capped at 100000) against the column named in `var` (x).
var = 'dailytest'
data = df[['dailycases', var]].copy()
data.plot.scatter(x=var, y='dailycases', ylim=(0, 100000));

In [None]:
# Correlation matrix of the numeric columns.
# The datetime 'date' column is excluded explicitly: recent pandas releases no
# longer drop non-numeric columns silently in DataFrame.corr(), and
# select_dtypes works the same way on old and new versions.
corrmat = df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
# Colours saturate at 0.8; draw on the axes created above.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [None]:
# Pairwise scatterplots (with per-variable distributions on the diagonal)
# for the daily series.
sns.set()
cols = ['dailytest', 'dailycases', 'dailysick', 'dailydeaths', 'dailyrecoveries']
# `height` is the current name of the per-facet size argument; the old `size`
# keyword is no longer accepted by seaborn.
sns.pairplot(df[cols], height=2.5)
plt.show();

In [None]:
# Missing data per column: absolute count ('Total') and fraction of rows
# ('Percent', a 0..1 ratio), each sorted with the most incomplete column first.
null_mask = df.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)

missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(9)

In [None]:
#histogram and normal probability plot
from scipy import stats
# Distribution of 'dailysick' ...
sns.displot(df['dailysick']);

# ... and, on a fresh figure, its probability plot against the normal
# distribution (points close to the fitted line indicate near-normal data).
fig = plt.figure()
res = stats.probplot(df['dailysick'], dist='norm', plot=plt)

...