In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Core numerical / tabular libraries (already imported in the first cell; repeated here).
import numpy as np
import pandas as pd
# Plotly's offline helpers: iplot() is what later cells call to render figures.
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
# Enable plotly's notebook mode before any figure is drawn.
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# word cloud library
from wordcloud import WordCloud

# matplotlib
import matplotlib.pyplot as plt
import os
# Show what is available directly under the input directory.
print(os.listdir("../input"))

In [None]:
# Location of the variant dataset inside the Kaggle input directory.
DATA_PATH = '../input/omicron-covid19-variant-daily-cases/covid-variants.csv'

# Read the CSV into the frame every later cell works on, then print a
# summary of its columns, dtypes and non-null counts.
df = pd.read_csv(DATA_PATH)
df.info()

## Tracking the progression of the new omicron COVID-19 variant
The data
* location - this is the country for which the variants information is provided;
* date - date for the data entry;
* variant - this is the variant corresponding to this data entry;
* num_sequences - the number of sequences processed (for the country, variant and date);
* perc_sequences - percentage of sequences from the total number of sequences (for the country, variant and date);
* num_sequences_total - total number of sequences (for the country, variant and date);


In [None]:
# Preview the first ten rows to check the columns described above.
df.head(10)

# Encoding categorical features

Features that contain non numerical data like characters need to be encoded to represent data statistically. This step is performed during the data preprocessing stage.

1. Label Encoding Features

    * Labelling each data point in a dataframe feature as a number or letter, either manually using dictionaries / lists / arrays or using the sklearn encoder.
    * Ex: Dictionary('A':1,'B':4,'Teddy': 300, 'Valentine':598)

2. One hot encoding

   *  A one hot encoding is a representation of categorical variables as binary vectors. This first requires that the categorical values be mapped to integer values. Then, each integer value is represented as a binary vector that is all zero values except the index of the integer, which is marked with a 1.

   * In this example, we will assume the case where you have an output sequence of the following 3 labels:

            "cold"
            "warm"
            "hot"

    * An example sequence of 10 time steps may be:

        [cold, cold, warm, cold, hot, hot, warm, cold, warm, hot]

        This would first require an integer encoding, such as 1, 2, 3. This would be followed by a one hot encoding of integers to a binary vector with 3 values, such as [1, 0, 0].

        The sequence provides at least one example of every possible value in the sequence. Therefore we can use automatic methods to define the mapping of labels to integers and integers to binary vectors.

Ref: https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/
   



In [None]:
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

# Replace each country name with an integer code and keep it in a new column.
le = LabelEncoder()
df['loc_n'] = le.fit_transform(df['location'])

# Count how many rows carry each location code.
# The column is named through the `x` keyword: recent seaborn releases accept
# only `data` positionally, so a bare positional Series is no longer read as x.
sns.set(style = 'darkgrid')
sns.countplot(data = df, x = 'loc_n')

In [None]:

# Use a dedicated encoder for the variant names. Re-fitting `le` here would
# overwrite the location classes it learned, so `loc_n` could no longer be
# mapped back to country names with `le.inverse_transform`.
variant_le = LabelEncoder()
df['variant_n'] = variant_le.fit_transform(df['variant'])


# Feature engineering

In exploratory data analysis, we often want to analyze data by category. In SQL, the GROUP BY statement groups rows that have the same category values into summary rows. In Pandas, SQL’s GROUP BY operation is performed using the similarly named groupby() method. Pandas’ groupby() allows us to split data into separate groups and perform computations on each group for better analysis.

Read more about feature engineering here: [Grouping data using pandas](https://towardsdatascience.com/all-pandas-groupby-you-should-know-for-grouping-data-and-performing-operations-2a8ec1327b5)

In [None]:
# Split the rows into one group per variant.
gk = df.groupby('variant')

# Show the first entry of every column for each variant.
# first() does not take a row count: its first positional parameter is
# `numeric_only`, so the former `first(5)` silently dropped the text columns
# instead of returning five rows. Use gk.head(5) for five rows per group.
gk.first()

In [None]:
# Finding values contained in Delta group
# get_group returns every row of `df` whose variant equals 'Delta'.
gk.get_group('Delta')

In [None]:
# group by more than one category:
# one group per (variant, num_sequences_total) combination.
gkk = df.groupby(['variant','num_sequences_total'])

# First entry of every column within each group. first() is called without
# arguments because its first positional parameter is `numeric_only`, not a
# number of rows (the former `first(5)` was read as numeric_only=5).
gkk.first()

groupby() is a very powerful function with a lot of variations. It makes the task of splitting the dataframe over some criteria really easy and efficient.

# What is the count of variants in different locations ?

You call .groupby() and pass the name of the column you want to group on, which is "location". Then, you use ["variant"] to specify the column on which you want to perform the actual aggregation.



In [None]:
# Group the rows by location, then count the non-null `variant` entries
# in each group (this is a row count, not a count of distinct variants).
per_location = df.groupby('location')
v_by_seq = per_location['variant'].count()

# Preview the first ten locations.
v_by_seq.head(10)

In [None]:
# nlargest() is applied within each variant group and keeps the largest
# perc_sequences values of that group (called without an argument, so the
# pandas default number of rows per group applies).
# head(5) then shows only the first five rows of the combined result.
df.groupby('variant')['perc_sequences'].nlargest().head(5)

You can pass multiple parameters into groupby, as shown below.

# Distribution of different Variants 


In [None]:
# Distribution of different Variants
# Sum the numeric columns per variant; sort=False keeps the variants in order
# of first appearance. numeric_only=True keeps the text columns (location,
# date) out of the aggregation: pandas 2.x would otherwise concatenate those
# strings instead of dropping them.
# NOTE(review): the sums of the label-encoded columns (loc_n, variant_n) carry
# no meaning; read only the sequence counts.
df.groupby('variant',sort=False).sum(numeric_only=True)

In [None]:
# Count the non-null `variant` entries for every (location, perc_sequences) pair.
df.groupby(['location','perc_sequences'])['variant'].count()

# Line Chart

In [None]:


import plotly.graph_objs as g

# Restrict the chart to the first 200 rows of the frame.
l = df.iloc[:200, :]

# Single trace: variant name on x, its share of sequences on y,
# with the variant name repeated as hover text.
s = g.Scatter(
    x=l['variant'],
    y=l['perc_sequences'],
    text=l['variant'],
    name='Variants',
    mode='lines+markers',
    fillcolor='red',
)

# Assemble the figure as a plain dict and render it inline.
data = [s]
layout = {'title': 'Percentage of Variants'}
fig = {'data': data, 'layout': layout}
iplot(fig)

Observation:

1. The percentages of Delta and other variants show a sharp increase.
2. The S pelican variant cases would be third highest virus variant, which would spread faster.


# Bar Charts

Virus spread across the years

In [None]:
import matplotlib.pyplot as plt

# Parse the date strings once and derive the calendar parts as new columns
# (later cells filter on `year`).
parsed_dates = pd.to_datetime(df['date'])
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month
df['day'] = parsed_dates.dt.day

print('Years in dataset',np.unique(df['year']))

# Total number of sequences recorded in each year, drawn as a bar chart.
# Plotting df['year'] directly would only draw the year of each row against
# its row index, which says nothing about how many cases a year had.
sns.set()
sequences_per_year = df.groupby('year')['num_sequences'].sum()
ax = sequences_per_year.plot(kind='bar')
ax.set_xlabel('Year')
ax.set_ylabel('Number of sequences')
ax.set_title('Virus spread across the years')

Observation:

* The number of cases increases very fast across equal time periods during 2020, 2021. 
* In 2021 and 2022, the virus spreads rapidly only during certain time periods. The impact of the virus is not the same in all months.

In [None]:
# Consider data from 2021 (capped at the first 100 rows of that year).
# The filter now matches the comment and the variable name; it previously
# selected 2020.
df2021 = df[df.year==2021].iloc[:100,:]

# Both traces read x AND y from the same 100-row slice, so each bar lines up
# with its variant label (y used to come from the full, unfiltered frame).
b1 = g.Bar(x = df2021.variant, y = df2021.perc_sequences,
           name = 'Variant Percentage Sequences',
           marker = dict(color = 'rgba(255, 174, 255, 0.5)',
                         line=dict(color='rgb(0,0,0)',width=1.5)))
b2 = g.Bar(x = df2021.variant, y = df2021.num_sequences_total,
           name = 'Variant total sequences',
           marker = dict(color = 'rgba(255, 255, 128, 0.5)',
                         line=dict(color='rgb(0,0,0)',width=1.5)))

# Draw the two series side by side for every variant.
data = [b1,b2]
layout = g.Layout(barmode = 'group')
fig = g.Figure(data = data, layout = layout)
iplot(fig)

 
<span style="color:blue;font-size:2em;">*Let's create some 3D plots that allow us to view data in multi-dimensional space.*</span>

<font size = "4"><br>🪄🛑 🪄 🛑 🪄🛑  *1, 2, 3, ... Ta-da! Exploring Plotly* 🪄🛑 🪄🛑 🪄🛑 </font>

<img src = 'https://media.giphy.com/media/26FPAn6hPp6Fqx7qw/giphy.gif'>

# 3D Plots

Location wise cases spread

In [None]:
import plotly.express as px

# Location wise cases spread
# One marker per row of df: location on x, share of sequences on y and the
# total number of sequences on z, coloured by location.
fig = px.scatter_3d(df, x='location', y='perc_sequences', z = 'num_sequences_total' , 
                   color = 'location')
fig.show()

In [None]:
# One marker per row of df: variant name on x, share of sequences on y and the
# label-encoded variant on z; colour and marker symbol both follow the variant.
fig = px.scatter_3d(df, x='variant', y='perc_sequences', z = 'variant_n' , 
                   color = 'variant',symbol = 'variant')
fig.show()

# 3D Surface plots

Virus spread

In [None]:
# Per-variant sums of the numeric columns, in order of first appearance.
# numeric_only=True keeps the text columns (location, date) out of the result:
# pandas 2.x would otherwise concatenate those strings, and the surface plot
# built from v_data.values needs a purely numeric matrix.
v_data = df.groupby('variant',sort=False).sum(numeric_only=True)

v_data.head(10)

In [None]:
# Use the aggregated table as the z grid of a surface (rows = variants,
# columns = summed features) and project coloured z contours onto the floor.
surface = g.Surface(
    z=v_data.values,
    contours=dict(
        z=dict(
            show=True,
            usecolormap=True,
            highlightcolor="limegreen",
            project=dict(z=True),
        )
    ),
)

# Fixed-size canvas with explicit margins around the plot.
fig = g.Figure(
    data=[surface],
    layout=dict(
        title='Variants 3D surface plot',
        autosize=True,
        width=600,
        height=800,
        margin=dict(l=65, r=50, b=65, t=90),
    ),
)
fig.show()

# 3D line plots

Spread of virus from 2020 to 2022

In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# Dedicated legacy generator with a fixed seed of 0, so every fresh run of the
# notebook draws the same noise.
rs = np.random.RandomState(0)

def brownian_motion(T = 1, N = 100, mu = 0.1, sigma = 0.01, S0 = 20):
    """Simulate one geometric Brownian motion path.

    Parameters
    ----------
    T : total time span covered by the path.
    N : number of samples on the path.
    mu : drift coefficient.
    sigma : volatility coefficient.
    S0 : scale applied to the exponentiated path.

    Returns
    -------
    numpy.ndarray of length N holding the simulated path. Each call consumes
    N draws from the module-level generator `rs`.
    """
    step = float(T) / N
    times = np.linspace(0, T, N)
    noise = rs.standard_normal(size=N)
    # Running sum of the Gaussian draws, scaled by sqrt(step): standard Brownian motion.
    walk = np.cumsum(noise) * np.sqrt(step)
    drift = (mu - 0.5 * sigma**2) * times
    log_path = drift + sigma * walk
    # Exponentiate to obtain the geometric Brownian motion.
    return S0 * np.exp(log_path)

# Daily date axis spanning the earliest to the latest date in `l`
# (the 200-row slice taken in the line-chart cell).
dates = pd.date_range(l.date.min(), l.date.max())
# Length of that span in years, and one sample per day.
T = (dates.max()-dates.min()).days / 365
N = dates.size
start_price = 100
# NOTE(review): y and z are two independent simulated Brownian paths; only the
# date axis comes from the dataset, so this figure does not show case or
# sequence numbers from df — confirm this is intended.
y = brownian_motion(T, N, sigma=0.1, S0=start_price)
z = brownian_motion(T, N, sigma=0.1, S0=start_price)

# 3D line with markers: dates on x, the two simulated paths on y and z;
# markers are coloured by their z value.
fig = go.Figure(data=go.Scatter3d(
    x=dates, y=y, z=z,
    marker=dict(
        size=4,
        color=z,
        colorscale='Viridis',
    ),
    line=dict(
        color='darkblue',
        width=2
    )
))

# Fixed canvas size, explicit camera position and a manually set aspect ratio.
fig.update_layout(
    width=800,
    height=700,
    autosize=False,
    scene=dict(
        camera=dict(
            up=dict(
                x=0,
                y=0,
                z=1
            ),
            eye=dict(
                x=0,
                y=1.0707,
                z=1,
            )
        ),
        aspectratio = dict( x=1, y=1, z=0.7 ),
        aspectmode = 'manual'
    ),
)

fig.show()



<img src="https://media.giphy.com/media/kfR5iyQgmq7PoiFTAf/giphy.gif">

# Upvote

<img src='https://media.giphy.com/media/wKzqKQt1Xhyv069mmY/giphy.gif'>

Hey Kaggler! If you liked my notebook, please upvote.

