In [None]:
"""
Compare the correlation coefficient (and linear-regression R² score) between
Pressure and Pressure+24h with and without interpolating zero readings.
"""

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Getting data: load the pressure readings from the Parquet file into a
# DataFrame and preview the first rows.
# NOTE(review): `Pressure+24h` is presumably the reading 24 hours after the
# row's timestamp -- confirm against the code that produced the file.
data = pd.read_parquet("files/weather-predict.parquet")
data.head()

Unnamed: 0_level_0,Pressure,Pressure+24h
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-04-01 00:00:00,1015.13,1015.68
2006-04-01 01:00:00,1015.63,1015.41
2006-04-01 02:00:00,1015.94,1014.98
2006-04-01 03:00:00,1016.41,1015.18
2006-04-01 04:00:00,1016.51,1014.7


In [5]:
# (rows, columns) of the raw frame, used as the reference size before cleaning.
data.shape

(96418, 2)

In [6]:
# Preparing data: inspect the column dtypes before any cleaning is applied.
data.dtypes

Pressure        float64
Pressure+24h    float64
dtype: object

In [8]:
# Count missing (NaN) values per column.
data.isna().sum()

Pressure         0
Pressure+24h    38
dtype: int64

In [9]:
# Count readings that are exactly 0 in each column; later cells treat these
# zeros as missing values (they are replaced with NaN before interpolating).
(data == 0).sum()

Pressure        1288
Pressure+24h    1288
dtype: int64

In [10]:
# Correlation matrix of the raw data, before any cleaning (zero readings are
# still included at this point).
data.corr()

Unnamed: 0,Pressure,Pressure+24h
Pressure,1.0,0.419074
Pressure+24h,0.419074,1.0


In [11]:
# Keep only rows with a usable target: discard rows whose `Pressure+24h` is
# exactly zero, then discard any row that still contains a NaN.
# Zeros in the `Pressure` column are deliberately left untouched here.
dataset = data[data["Pressure+24h"].ne(0)].dropna()

In [14]:
# Checking lost data: row count before cleaning, row count after cleaning,
# and the number of rows removed by the filter + dropna step.
len(data), len(dataset), len(data)-len(dataset)

(96418, 95092, 1326)

In [15]:
# Correlation after removing rows with a zero or missing `Pressure+24h`.
# Zeros in `Pressure` itself were not filtered out, so they still influence
# this coefficient.
dataset.corr()

Unnamed: 0,Pressure,Pressure+24h
Pressure,1.0,0.083047
Pressure+24h,0.083047,1.0


In [16]:
# Treat the remaining zero readings as missing (0 -> NaN), fill them with
# pandas' default interpolation, and recompute the correlation for comparison.
dataset.replace(0, np.nan).interpolate().corr()

Unnamed: 0,Pressure,Pressure+24h
Pressure,1.0,0.79447
Pressure+24h,0.79447,1.0


In [17]:
# linear regression
def regression_score(X, y, test_size=.2, random_state=42):
    """Fit a linear regression on a train split and return R² on the test split.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix, e.g. a one-column DataFrame such as ``df[["Pressure"]]``.
    y : array-like of shape (n_samples,)
        Target values aligned row-by-row with ``X``.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split``; share (or absolute number) of
        samples held out for scoring.
    random_state : int or None, default=42
        Forwarded to ``train_test_split`` so the split is reproducible between
        calls, which keeps scores of different datasets comparable.

    Returns
    -------
    float
        ``r2_score`` of the model's predictions on the held-out split.
    """
    # train_test_split shuffles rows by default, so the hold-out set is a
    # random sample rather than the chronologically last part of the data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    return r2_score(y_test, model.predict(X_test))

In [18]:
# Baseline R²: predict `Pressure+24h` from `Pressure` on the cleaned but
# non-interpolated data.
regression_score(dataset[["Pressure"]], dataset["Pressure+24h"])

0.008080860028906622

In [19]:
# Replace zero readings with NaN and fill them by interpolation (pandas
# default method), keeping the result under its own name so `dataset` stays
# unchanged.
dataset_interpolated = dataset.replace(0, np.nan).interpolate()

# R² of the same regression on the interpolated data, for comparison with the
# non-interpolated score.
regression_score(dataset_interpolated[["Pressure"]], dataset_interpolated["Pressure+24h"])

0.6269601274081953