## Kernel: `Fund-d4`

## Day 4 - Tutorial 2: Time Series Analysis 

For this exercise we will use a synthetic CCS dataset consisting of monthly CO2 emissions measured at a well location.

In [0]:
import pandas as pd
import matplotlib.pyplot as plt

import dataiku
from dataiku import pandasutils as pdu

## Step 1: Data Preparation

In [0]:
# Import the co2 dataset
# The Dataiku dataset named "co2" is read into 'co2_data', the dataframe that the rest of this notebook works on.
mydataset = dataiku.Dataset("co2")
co2_data = mydataset.get_dataframe()


# Make sure that the column 'date' is treated as a date




# Set the column 'date' as index



In [0]:
# Explore the data 'co2_data'



In [0]:
# Create a line plot of the co2 data ('co2_data') per month 

# Draw every column of 'co2_data' against its index and label the axes
emissions_ax = co2_data.plot(figsize=(12, 4))
emissions_ax.set_xlabel("Date")
emissions_ax.set_ylabel("CO2 Emissions")
plt.show()

## Step 2: Create Lag Variables



In [0]:
# Create three lag variables (lags 1, 2 and 3 of 'co2') using the .shift() function

# Shifting works by row position, so the rows are put in date order first
co2_data.sort_values(by='date', inplace=True)
for lag in (1, 2, 3):
    # 'co2_lag_k' holds the 'co2' value found k rows earlier
    co2_data['co2_lag_{}'.format(lag)] = co2_data['co2'].shift(lag)

co2_data

# Note that .shift(n) moves the 'co2' column down by n rows, so the first n rows of each lag column are NaN

In [0]:
# We can define a function to create multiple lag variables 

def reformat(co2_data, lag_first=1, lag_last=3, l_=None):
    """Return a copy of ``co2_data`` with lagged versions of selected columns.

    For every column name ``c`` in ``l_`` and every lag ``i`` from
    ``lag_first`` to ``lag_last`` (both inclusive), a column named
    ``"<c>_lag_<i>"`` containing ``co2_data[c].shift(i)`` is added to the
    copy. A column that already carries that name is overwritten. The input
    dataframe itself is left untouched.

    Parameters
    ----------
    co2_data : pandas.DataFrame
        Source dataframe. ``shift`` works by row position, so the rows should
        already be in the intended (e.g. chronological) order.
    lag_first : int, default 1
        Smallest lag to generate.
    lag_last : int, default 3
        Largest lag to generate. If it is smaller than ``lag_first`` no
        column is added.
    l_ : str or iterable of column names, optional
        Column(s) to lag. ``None`` (the default) or an empty iterable adds
        nothing. A single column name may be given as a plain string.

    Returns
    -------
    pandas.DataFrame
        The copy of ``co2_data`` with the lag columns added.

    Raises
    ------
    KeyError
        If a lag has to be generated for a column that is not in ``co2_data``.
    """
    if l_ is None:
        # None replaces the former mutable ``[]`` default; both mean "no columns".
        columns = []
    elif isinstance(l_, str):
        # A bare string would otherwise be iterated character by character.
        columns = [l_]
    else:
        columns = list(l_)

    ddf = co2_data.copy()
    for c in columns:
        for i in range(lag_first, lag_last + 1):
            ddf["{}_lag_{}".format(c, i)] = ddf[c].shift(i)
    return ddf


In [0]:
# Build 'co2_data_reformat': a copy of 'co2_data' carrying lags 1 to 24 of the 'co2' column

co2_data_reformat = reformat(co2_data, lag_first=1, lag_last=24, l_=["co2"])


In [0]:
# Display the new dataframe 'co2_data_reformat'



In [0]:
# Remove the rows containing null values from the 'co2_data_reformat' dataset using the .dropna() function



# Display the edited dataframe 'co2_data_reformat'



## Step 3: Generate ML Models

In [0]:
# Split 'co2_data_reformat' into the target (the 'co2' column) and the features (every other column)

target = co2_data_reformat["co2"]

features = co2_data_reformat.drop(columns=["co2"])

In [0]:
# Define the number of test months 'Ntest'

Ntest = 12

# Guard the split: with Ntest == 0 the slice [:-0] is empty, so the train set
# would silently be empty; with Ntest >= len(features) nothing is left to train on.
assert 0 < Ntest < len(features), "Ntest must be between 1 and len(features) - 1"

# Define the test and train sets
# Rows are split by position: the last 'Ntest' rows are held out for testing,
# everything before them is used for training.

features_train, target_train = features.iloc[:-Ntest], target.iloc[:-Ntest]

features_test, target_test = features.iloc[-Ntest:], target.iloc[-Ntest:]

In [0]:
import xgboost as xgb

# Fit an XGBoost regressor with default settings on the training set

xg = xgb.XGBRegressor().fit(features_train, target_train)


# Score the 'xg' model on the held-out test set (the value is displayed below the cell)

xg.score(features_test, target_test)

In [0]:
from sklearn.linear_model import LinearRegression

# Train a linear regression model


# Evaluate the 'lr' model



In [0]:
from sklearn.ensemble import RandomForestRegressor

# Train a random forest model




# Evaluate the 'rf' model



## Step 4: Compare the ML models

In [0]:
# Prepare the data for visualization: add each model's one-step predictions to 'co2_data'

# Mark the rows of 'co2_data' that are present in 'features' (presumably the rows
# with a complete lag history once the null rows were dropped). Deriving the mask
# from 'features' itself keeps it in step with the lag range and null filtering
# actually used to build the training data, instead of repeating them here.
valids = co2_data.index.isin(features.index)
assert valids.sum() == len(features), "rows of 'features' do not map one-to-one onto rows of 'co2_data'"

one_step_models = {'LR_1step': lr, 'RF_1step': rf, 'XGB_1step': xg}
for column_name, model in one_step_models.items():
    # Rows outside the mask get no prediction and stay NaN in the new column
    co2_data.loc[valids, column_name] = model.predict(features)


In [0]:
# Visualize the predictions from each model on the train and test sets

fig, ax = plt.subplots(figsize=(12, 4))
co2_data[['co2', 'LR_1step', 'RF_1step', 'XGB_1step']].plot(ax=ax)
# Label the figure so it can be read on its own, like the Step 1 plot
ax.set_title("Observed CO2 vs. one-step predictions (train and test)")
ax.set_xlabel("Date")
ax.set_ylabel("CO2 Emissions")

plt.show()

In [0]:
# Visualize the predictions of each model on the test set

fig, ax = plt.subplots(figsize=(12, 4))
# Observed series over the whole period, for context
co2_data[['co2']].plot(ax=ax)

# Model predictions for the last 'Ntest' rows only (the test period)
co2_data.iloc[-Ntest:][['LR_1step', 'RF_1step', 'XGB_1step']].plot(ax=ax)
# Label the figure so it can be read on its own, like the Step 1 plot
ax.set_title("Observed CO2 vs. one-step predictions (test set)")
ax.set_xlabel("Date")
ax.set_ylabel("CO2 Emissions")
plt.show()
