In [2]:
import pandas as pd;
from pathlib import Path;
import matplotlib.pyplot as plt;
import numpy as np;

**Função 01 - Leitura do Dataset**

In [3]:
def read_dataset(folder, file, date_col=None):
    '''
    Read a CSV file into a DataFrame, optionally indexed by a parsed date column.

    folder: directory that contains the file (str or pathlib.Path);
    file: name of the CSV file inside `folder`;
    date_col: name of the column to parse as dates and use as the index;
        when None, the file is read without an index column and without
        date parsing;
    returns: the loaded pandas DataFrame.
    '''
    # Path() accepts both str and Path, so plain-string folders also work.
    path = Path(folder) / file
    if date_col is None:
        # [None] does not name a column, so skip the date handling entirely.
        return pd.read_csv(path)
    return pd.read_csv(path, index_col=date_col, parse_dates=[date_col])

**Função 02 - Plot_dfs**

In [4]:
def plot_dfs(df1, df2, col, title=None, xlabel=None, ylabel=None):
    '''
    Plot df1[col] on the top subplot and, below it, one subplot for every
    column of df2 located at or after `col`.

    df1: DataFrame whose `col` column is drawn on the first subplot
        (titled 'Dataset Original');
    df2: DataFrame containing `col`; that column is renamed to 'missing'
        and it, plus every column after it, gets its own subplot;
    col: column name present in both DataFrames;
    title: optional figure-level title;
    xlabel, ylabel: optional axis labels applied to the first subplot;
    returns: None, the figure is displayed with plt.show().
    '''
    df_missing = df2.rename(columns={col: 'missing'})

    # Label-based slice: 'missing' and every column to its right.
    columns = df_missing.loc[:, 'missing':].columns.tolist()
    fig, ax = plt.subplots(len(columns) + 1, 1, sharex=True, figsize=(10, 12))
    plt.subplots_adjust(hspace=0.25)
    if title is not None:
        # suptitle is a method; assigning to it would silently drop the title.
        fig.suptitle(title)

    df1[col].plot(ax=ax[0])
    ax[0].set_title('Dataset Original')
    ax[0].set_xlabel(xlabel)
    ax[0].set_ylabel(ylabel)

    # Subplot 0 holds the original series, so the df2 columns start at 1.
    for i, colname in enumerate(columns, start=1):
        df_missing[colname].plot(ax=ax[i])
        ax[i].set_title(colname.upper())
    plt.show()

**Função 03 - rmse_score**

In [6]:
def rmse_score(df1, df2, col=None):
    '''
    Compute the RMSE between df1[col] and each column of df2 that comes
    after `col`.

    df1: original DataFrame, without missing data;
    df2: DataFrame containing `col` (with the missing data) followed by
        the columns to be scored against df1[col];
    col: name of the column that contains the missing data; it must be
        present in both DataFrames;
    returns: list with one RMSE score per compared column, in column order.
        Each score is also printed.
    '''
    df_missing = df2.rename(columns={col: 'missing'})
    # Label-based slice: 'missing' and every column to its right.
    columns = df_missing.loc[:, 'missing':].columns.tolist()
    scores = []
    # Skip the first entry: it is the 'missing' column itself.
    for comp_col in columns[1:]:
        # RMSE = sqrt(mean(error ** 2)); the square must be applied to the
        # errors before averaging, not to the final result.
        rmse = np.sqrt(np.mean((df1[col] - df_missing[comp_col]) ** 2))
        scores.append(rmse)
        print(f'Cálculo do RMSE para {comp_col} : {rmse}')
    return scores
        

**Iniciando a verificação dos dados.**

In [10]:
# Raw string: the Windows backslashes would otherwise be read as (invalid)
# escape sequences. Change DATA_DIR to point at your local copy of the data.
DATA_DIR = Path(r'C:\git\cesar-school\series_temporais\Time-Series-Analysis-with-Python-Cookbook-main\datasets\Ch7')
co2 = DATA_DIR / 'co2_missing.csv'
ecom = DATA_DIR / 'clicks_missing_multiple.csv'

# Use the date column of each file as a parsed index.
co2_df = pd.read_csv(co2, index_col='year', parse_dates=True)
ecom_df = pd.read_csv(ecom, index_col='date', parse_dates=True)
ecom_df.head()

Unnamed: 0_level_0,price,location,clicks
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-04-01,43.155647,2.0,18784
2008-04-02,43.079056,1.0,24738
NaT,43.842609,,15209
NaT,,1.0,14018
NaT,43.941176,1.0,11974


In [11]:
# Count the missing (NaN) values in each column of co2_df.
co2_df.isna().sum()

co2    25
dtype: int64

In [12]:
# Count the missing (NaN) values in each column of ecom_df.
ecom_df.isnull().sum()

price        1
location     1
clicks      14
dtype: int64

In [13]:
# Total number of missing values across all columns of ecom_df.
ecom_df.isnull().sum().sum()

16

In [14]:
# Show the rows at positions 190-194 of co2_df (integer slice, end excluded).
co2_df[190:195]

Unnamed: 0_level_0,co2
year,Unnamed: 1_level_1
1985-01-01,
1986-01-01,
1987-01-01,
1988-01-01,4.2953
1989-01-01,4.2782


In [15]:
# Summarize ecom_df: index range, column dtypes and non-null counts.
ecom_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 135 entries, 2008-04-01 to 2008-08-13
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     134 non-null    float64
 1   location  134 non-null    float64
 2   clicks    121 non-null    object 
dtypes: float64(2), object(1)
memory usage: 4.2+ KB


In [16]:
# Summarize co2_df: index range, column dtypes and non-null counts.
co2_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 226 entries, 1750-01-01 to 2020-01-01
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   co2     201 non-null    float64
dtypes: float64(1)
memory usage: 3.5 KB


In [18]:
# Descriptive statistics for every column. `datetime_is_numeric` is no longer
# passed: pandas 2.0 removed that argument and always treats datetimes as numeric.
co2_df.describe(include='all')

Unnamed: 0,co2
count,201.0
mean,1.590015
std,1.644182
min,0.0
25%,0.0764
50%,0.9351
75%,2.8076
max,4.9079


*Convertendo os valores 0 e ? para o tipo NaN.*

In [19]:
# Treat the placeholder values (0 in co2_df, '?' in ecom_df) as missing data.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
co2_df = co2_df.replace(0, np.nan)
ecom_df = (
    ecom_df
    .replace('?', np.nan)
    # 'clicks' was read as object because of the '?' entries; make it numeric.
    .astype({'clicks': 'float'})
)

In [20]:
# Recount the missing values per column now that zeros were replaced with NaN.
co2_df.isnull().sum()

co2    35
dtype: int64

In [21]:
# Dimensions of co2_df as (rows, columns).
co2_df.shape

(226, 1)

In [22]:
# Re-inspect the dtypes and non-null counts of co2_df.
co2_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 226 entries, 1750-01-01 to 2020-01-01
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   co2     191 non-null    float64
dtypes: float64(1)
memory usage: 3.5 KB


In [23]:
# Show the rows at positions 190-194 of co2_df (integer slice, end excluded).
co2_df[190:195]


Unnamed: 0_level_0,co2
year,Unnamed: 1_level_1
1985-01-01,
1986-01-01,
1987-01-01,
1988-01-01,4.2953
1989-01-01,4.2782


In [24]:
# Show the rows at positions 132-138 of co2_df.
co2_df.iloc[132:139]

Unnamed: 0_level_0,co2
year,Unnamed: 1_level_1
1927-01-01,
1928-01-01,
1929-01-01,
1930-01-01,
1931-01-01,
1932-01-01,
1933-01-01,


In [25]:
# Total number of missing values in co2_df.
co2_df.isnull().sum().sum()

35