In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date, timedelta
import warnings

from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression

from scipy import stats

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Select positive tests, calculate the percentage positive, remove outliers, subtract 14 days from each date, and plot the data

In [None]:
# Load Time.csv and derive the fraction of tests that were confirmed positive.
df_confirmed = pd.read_csv('/kaggle/input/coronavirusdataset/Time.csv')
df_confirmed['percentage_positive'] = df_confirmed['confirmed'] / df_confirmed['test']
df_confirmed['date'] = pd.to_datetime(df_confirmed['date'])
# Days elapsed since the earliest date in the file, as a float.
df_confirmed['date_delta'] = (df_confirmed['date'] - df_confirmed['date'].min()) / np.timedelta64(1, 'D')

# Remove outliers. The explicit copy makes the filtered frame independent, so the
# column assignments below (and in later cells) write to it reliably.
df_confirmed = df_confirmed[df_confirmed['percentage_positive'] < 0.05].copy()

# Each date moved back by 14 days. Computed on the whole column at once: the
# previous row loop wrote through chained indexing (df[col][idx] = ...), which
# does not update the frame under pandas Copy-on-Write, and it left the column
# as object dtype instead of datetime64.
df_confirmed['date_14'] = df_confirmed['date'] - timedelta(days=14)

In [None]:
# Scatter plus fitted regression line: positive fraction against days since the first date.
plt.title("percentage confirmed")
sns.regplot(data=df_confirmed, x="date_delta", y="percentage_positive")

In [None]:
# Scatter plus fitted regression line: value of the `test` column against days since the first date.
plt.title("number of tests")
sns.regplot(data=df_confirmed, x="date_delta", y="test")

### Normalize the data (square-root transform) and test for normality

In [None]:
# Histogram + KDE of the untransformed positive fraction.
sns.displot(data=df_confirmed, x="percentage_positive", bins=25, kde=True)

# Square-root transform, stored as a new column, and its distribution.
df_confirmed['percentage_positive_sqrt'] = np.sqrt(df_confirmed['percentage_positive'])
sns.displot(data=df_confirmed, x="percentage_positive_sqrt", bins=25, kde=True)

# null hypothesis: x comes from a normal distribution
# A 1-D Series is passed (not a one-column DataFrame) so the statistic and the
# p-value come back as scalars rather than length-1 arrays, and each test keeps
# its own statistic instead of the second call overwriting the first.
k2, p = stats.normaltest(df_confirmed['percentage_positive'])
k2_sqrt, p2 = stats.normaltest(df_confirmed['percentage_positive_sqrt'])
print("normaltest p-value (percentage_positive):", p)
print("normaltest p-value (percentage_positive_sqrt):", p2)

### select weather data and calculate means for each day, then add week numbers to each row

In [None]:
# Load Weather.csv and average every weather measurement per date.
df_weather = pd.read_csv('/kaggle/input/coronavirusdataset/Weather.csv')

# Measurements to average; each becomes a `mean_<name>` column below.
# NOTE(review): most_wind_direction looks like an angle, for which an arithmetic
# mean may not be meaningful - confirm before interpreting that column.
weather_columns = [
    'avg_temp',
    'min_temp',
    'max_temp',
    'precipitation',
    'max_wind_speed',
    'most_wind_direction',
    'avg_relative_humidity',
]

# One groupby for all measurements keeps every mean attached to its own date,
# instead of seven separate groupbys whose values were copied in by position.
df_weeks_mean = (
    df_weather.groupby('date')[weather_columns]
    .mean()
    .add_prefix('mean_')
    .reset_index()
)

df_weeks_mean['date'] = pd.to_datetime(df_weeks_mean['date'])
# Days elapsed since the earliest weather date, as a float.
df_weeks_mean['day'] = (df_weeks_mean['date'] - df_weeks_mean['date'].min()) / np.timedelta64(1, 'D')
# Week number = completed 7-day spans since the earliest date. Computed on the
# whole column: the previous row loop assigned through chained indexing, which
# does not update the frame under pandas Copy-on-Write.
df_weeks_mean['week'] = df_weeks_mean['day'] // 7

### merge percentage positive dataframe and weather dataframe

In [None]:
# Keep only the columns needed downstream, then join the per-date weather means
# on the shared `date` column (default inner join).
merge_columns = ['date', 'date_14', 'percentage_positive_sqrt']
df_confirmed_merge = df_confirmed[merge_columns]
df_total = pd.merge(df_confirmed_merge, df_weeks_mean, on='date')

### plot distributions

In [None]:
# Mean average temperature against the 14-day-shifted date; marker size and
# colour both encode the square-root positive fraction.
sns.scatterplot(
    data=df_total,
    x="date_14",
    y="mean_avg_temp",
    hue="percentage_positive_sqrt",
    size="percentage_positive_sqrt",
    sizes=(20, 200),
)
plt.title("average temperature over time vs percentage positive")

In [None]:
# Every remaining variable plotted against the 14-day-shifted date.
plot_df = df_total.drop(columns=['week', 'date'])
sns.pairplot(plot_df, hue="percentage_positive_sqrt", x_vars="date_14")

In [None]:
# Pairwise relationships between all remaining variables.
plot_df = df_total.drop(['week', 'date'], axis=1)
sns.pairplot(plot_df, hue="percentage_positive_sqrt")
# pairplot draws a grid of subplots; plt.title would label only the current
# subplot, so the caption is set on the figure instead. y > 1 lifts it clear of
# the top row of panels.
plt.suptitle("pairplot of weather", y=1.02)

In [None]:
# One regression line per week: square-root positive fraction against mean average temperature.
sns.lmplot(data=df_total, x="mean_avg_temp", y="percentage_positive_sqrt", hue="week")
plt.title("percentage positive compared to mean average temperature per week")

### create functions for linear regression and t-test

In [None]:
def create_linear_regression_df(column, data=None):
    """Fit one linear regression per week and pair each week with the week two weeks later.

    Rows are grouped by ``week``. Within each group the rows are numbered
    1..n in their existing order and ``percentage_positive_sqrt`` is regressed
    on that number. The fitted slope is stored next to the weekly mean of
    ``column``.

    Parameters
    ----------
    column : str
        Name of the column whose weekly mean is reported alongside each slope.
    data : pandas.DataFrame, optional
        Frame containing ``week``, ``percentage_positive_sqrt`` and ``column``.
        Defaults to the notebook-level ``df_total`` so existing calls keep working.

    Returns
    -------
    list of pandas.DataFrame
        ``[lin_reg_df, lin_reg_14_df, lin_reg_sub_df]``:

        * ``lin_reg_df`` - every week except the last two, ``type == "normal"``.
        * ``lin_reg_14_df`` - every week except the first two,
          ``type == "14_days_later"``.
        * ``lin_reg_sub_df`` - row ``i`` holds ``column`` from ``lin_reg_df`` row
          ``i`` and ``slope`` = ``lin_reg_14_df`` slope minus ``lin_reg_df`` slope.

        All three are empty when fewer than three weeks are present.
    """
    if data is None:
        data = df_total

    # Collect plain records and build the frames once at the end.
    # DataFrame.append, used previously, was removed in pandas 2.0.
    weekly_rows = []
    for _, week_frame in data.groupby('week'):
        # Position within the week (1..n) as the single regression feature.
        day_in_week = np.arange(1, len(week_frame) + 1).reshape(-1, 1)
        reg = LinearRegression().fit(day_in_week, week_frame[['percentage_positive_sqrt']])
        weekly_rows.append({
            column: float(week_frame[column].mean()),
            # coef_ has shape (1, 1) because the target is passed as a 2-D frame.
            'slope': float(reg.coef_[0][0]),
        })

    # Both frames get a fresh 0..k-1 index, so row i of the first (week w)
    # lines up with row i of the second (week w + 2).
    lin_reg_df = pd.DataFrame(weekly_rows[:-2], columns=[column, 'slope']).assign(type="normal")
    lin_reg_14_df = pd.DataFrame(weekly_rows[2:], columns=[column, 'slope']).assign(type="14_days_later")

    lin_reg_sub_df = pd.DataFrame({
        column: lin_reg_df[column],
        'slope': lin_reg_14_df['slope'] - lin_reg_df['slope'],
    })

    return [lin_reg_df, lin_reg_14_df, lin_reg_sub_df]

In [None]:
def paired_t_test(lin_reg_df, lin_reg_14_df, alpha=0.05):
    """Run a paired (related-samples) t-test and describe the outcome as text.

    Parameters
    ----------
    lin_reg_df, lin_reg_14_df : array-like of numbers
        The two paired samples; they must have the same length.
    alpha : float, optional
        Significance level. The null hypothesis is reported as rejected only
        when the p-value is less than or equal to ``alpha``. Defaults to 0.05.

    Returns
    -------
    str
        ``"p-value: <p>, null hypothesis can be rejected"`` or
        ``"p-value: <p>, null hypothesis can not be rejected"``.
    """
    # Coerce to float arrays so object-dtype columns are accepted as well.
    first = np.asarray(lin_reg_df, dtype=float)
    second = np.asarray(lin_reg_14_df, dtype=float)

    p = stats.ttest_rel(first, second)[1]

    # `p <= alpha` is False for NaN, so an undefined test (e.g. identical
    # samples) is never reported as a rejection.
    if p <= alpha:
        verdict = ", null hypothesis can be rejected"
    else:
        verdict = ", null hypothesis can not be rejected"
    return "p-value: " + str(p) + verdict

### No difference in slopes: each week vs. two weeks later

In [None]:
# Paired t-test on the weekly slopes versus the slopes two weeks later.
lin_reg_df = create_linear_regression_df("mean_avg_temp")
slopes_now, slopes_later = (frame['slope'] for frame in lin_reg_df[:2])
print(paired_t_test(slopes_now, slopes_later))

### Effect of weather on regression of percentage positive after two weeks

In [None]:
# Overall regression of the square-root positive fraction on the day number.
sns.lmplot(data=df_total, x="day", y="percentage_positive_sqrt")
plt.title("Percentage positive")

In [None]:
# Third returned frame: weekly mean of the variable against the slope difference
# (slope two weeks later minus slope of the week itself).
lin_reg_df = create_linear_regression_df("mean_avg_temp")
slope_change = lin_reg_df[2]

sns.lmplot(data=slope_change, x="mean_avg_temp", y="slope")
plt.title("average temperature")

In [None]:
# Third returned frame: weekly mean of the variable against the slope difference
# (slope two weeks later minus slope of the week itself).
lin_reg_df = create_linear_regression_df("mean_min_temp")
slope_change = lin_reg_df[2]

sns.lmplot(data=slope_change, x="mean_min_temp", y="slope")
plt.title("minimum temperature")

In [None]:
# Third returned frame: weekly mean of the variable against the slope difference
# (slope two weeks later minus slope of the week itself).
lin_reg_df = create_linear_regression_df("mean_max_temp")
slope_change = lin_reg_df[2]

sns.lmplot(data=slope_change, x="mean_max_temp", y="slope")
plt.title("maximum temperature")

In [None]:
# Third returned frame: weekly mean of the variable against the slope difference
# (slope two weeks later minus slope of the week itself).
lin_reg_df = create_linear_regression_df("mean_precipitation")
slope_change = lin_reg_df[2]

sns.lmplot(data=slope_change, x="mean_precipitation", y="slope")
plt.title("precipitation")

In [None]:
# Third returned frame: weekly mean of the variable against the slope difference
# (slope two weeks later minus slope of the week itself).
lin_reg_df = create_linear_regression_df("mean_max_wind_speed")
slope_change = lin_reg_df[2]

sns.lmplot(data=slope_change, x="mean_max_wind_speed", y="slope")
plt.title("maximum wind speed")

In [None]:
# Third returned frame: weekly mean of the variable against the slope difference
# (slope two weeks later minus slope of the week itself).
lin_reg_df = create_linear_regression_df("mean_most_wind_direction")
slope_change = lin_reg_df[2]

sns.lmplot(data=slope_change, x="mean_most_wind_direction", y="slope")
plt.title("most wind direction")

In [None]:
# Third returned frame: weekly mean of the variable against the slope difference
# (slope two weeks later minus slope of the week itself).
lin_reg_df = create_linear_regression_df("mean_avg_relative_humidity")
slope_change = lin_reg_df[2]

sns.lmplot(data=slope_change, x="mean_avg_relative_humidity", y="slope")
plt.title("mean average relative humidity")

### k-means

In [None]:
# Cluster the numeric features with k-means and draw the clusters, their
# centroids and the feature directions in the plane of the first two
# principal components.
df_total_kmeans = df_total.drop(['date', 'week', 'day', 'date_14'], axis=1)
print(df_total.isnull().sum())

# Normalizer rescales each row (sample) to unit norm.
# NOTE(review): per-feature scaling (e.g. StandardScaler) may have been
# intended here - confirm before interpreting the clusters.
T = preprocessing.Normalizer().fit_transform(df_total_kmeans)
n_clusters = 3
# A fixed seed and an explicit n_init make the clustering reproducible under
# "Restart & Run All" and independent of library-version defaults.
kmean_model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
kmean_model.fit(T)
centroids, labels = kmean_model.cluster_centers_, kmean_model.labels_

pca_model = PCA(n_components=2)
pca_model.fit(T)  # fit the model on the normalized features
T = pca_model.transform(T)  # project the normalized samples
centroid_pca = pca_model.transform(centroids)  # project the k-means centroids

# One colour per cluster, taken from a fixed palette so that changing
# n_clusters (up to the palette size) needs no further edits.
palette = ['blue', 'red', 'green', 'orange', 'black', 'brown']
if n_clusters > len(palette):
    raise ValueError("n_clusters exceeds the number of available colors")
colors = palette[:n_clusters]
features_colors = [colors[label] for label in labels]

plt.scatter(T[:, 0], T[:, 1],
            c=features_colors, marker='o',
            alpha=0.4
        )

plt.scatter(centroid_pca[:, 0], centroid_pca[:, 1],
            marker='x', s=100,
            linewidths=3, c=colors
        )

# Scale each component's loadings by the largest projected value on that axis
# so the arrows are drawn on the same scale as the scatter.
xvector = pca_model.components_[0] * max(T[:, 0])
yvector = pca_model.components_[1] * max(T[:, 1])
columns = df_total_kmeans.columns

for feature_name, dx, dy in zip(columns, xvector, yvector):
    # plot arrows
    plt.arrow(0, 0, dx, dy,
                color='b', width=0.0005,
                head_width=0.002, alpha=0.75
            )
    # plot name of features
    plt.text(dx, dy, feature_name, color='b', alpha=0.75, size="x-small")

plt.show()