## Index

## Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [488]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.arima_model import ARIMA

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import statsmodels.api as sm

from datetime import datetime, timedelta

from pymongo import MongoClient

## Reading data from MongoDB

In [3]:
client = MongoClient("mongodb://localhost:27017")

In [4]:
# using the find method without a filter to read all records

results1 = client['DAP_Project_DB']['co2_tb'].find()

results2 = client['DAP_Project_DB']['nitrous_oxide_tb'].find()

results3 = client['DAP_Project_DB']['methane_tb'].find()

In [5]:
type(results1)

pymongo.cursor.Cursor

In [6]:
# storing results in dataframe as results is pymongo.cursor.Cursor

df_co2 = pd.DataFrame(results1)
df_n2o = pd.DataFrame(results2)
df_methane = pd.DataFrame(results3)

# removing the 1st column
df_co2 = df_co2.iloc[:, 1:]
df_n2o = df_n2o.iloc[:, 1:]
df_methane = df_methane.iloc[:, 1:]


In [7]:
df_co2.shape, df_n2o.shape, df_methane.shape

((3961, 5), (256, 5), (466, 5))

In [8]:
# CO2 
df_co2.head()

Unnamed: 0,year,month,day,cycle,trend
0,2013,2,6,396.08,394.59
1,2013,2,7,396.1,394.59
2,2013,2,8,396.12,394.6
3,2013,2,9,396.14,394.61
4,2013,2,10,396.16,394.62


In [9]:
# N2O
df_n2o.head()

Unnamed: 0,date,average,trend,averageUnc,trendUnc
0,2002.5,316.85,316.88,0.14,0.13
1,2002.6,316.83,316.92,0.14,0.13
2,2002.7,316.82,316.95,0.14,0.14
3,2002.8,316.82,316.99,0.14,0.14
4,2002.9,316.87,317.03,0.14,0.14


In [10]:
# methane
df_methane.head()

Unnamed: 0,date,average,trend,averageUnc,trendUnc
0,1984.11,1653.82,1649.98,0.96,0.58
1,1984.12,1656.19,1651.07,1.06,0.58
2,1985.1,1655.58,1652.15,0.96,0.58
3,1985.2,1652.25,1653.16,1.36,0.58
4,1985.3,1654.61,1654.16,1.0,0.58


## Data Description

#### Carbon-di-Oxide (Chemical Name: CO2)
- year, month, day: Data is recorded everday.


- Cycle: This is the recorded level of CO2 gas on the specified date. The unit of measurement (parts per million).


- Trend: The term 'trend' refers to the long-term, systematic pattern or direction. It tells pattern in the CO2 levels that is not due to short-term fluctuations.

#### Interpretation:
- We will use the "Cycle" column to understand the recorded level of CO2 gas on a specific date.
- The "Trend" column provides information about the long-term pattern or direction in the CO2 levels.

#### Nitrous-Oxide (Chemical Name: N2O)  and  Methane (Chemical Name: CH4)
- Date: Data is recorded on a monthly basis.


- Average: This field is the average concentration of N2O & Methane gases recorded on the specified date. The unit of measurement (parts per million)


- Trend: This field is the trend component of the N2O & Methane gas levels on the specified date. The term 'trend' refers to the long-term, systematic pattern or direction. It tells pattern in the NO2 & Methane levels that is not due to short-term fluctuations.


- AverageUnc: This field is the uncertainty or error associated with the average concentration of N2O & Methane gases. It is margin of error in the average value.


- TrendUnc: This field is the uncertainty or error associated with the trend component. It is margin of error in the trend value.

## Exploratory Data Analysis

<div class="alert alert-block alert-warning">
<b>Note:</b> 

- Before proceeding to EDA, we could combine the three gases into one dataframe.
- But there are few issues: 
    - there is no common key
    - co2 contains year, month, day whereas n2o, methane has just the date column with month values.
- If we concat the data, we need to filter the data for 3 gases everytime for any transformation we perform.


- We keep the dataframes separate and perform the analysis.
- Any transformation required, we check for specific gases and perform transformation individually.
- This will enable to QC the data without any hassle.
    
</div>

In [11]:
# storing dataframes in a dictionary, 
# if there is common transformation. We'll loop through this dictionary of dataframes quickly.

df_gases = {'CO2': df_co2, 'Nitrous_oxide': df_n2o, 'Methane': df_methane}

In [12]:
# checking shape of the dataframes

for i, j in df_gases.items():
    print(i)
    print('The number of rows:', j.shape[0])
    print('The number of columns:', j.shape[1])
    print('--')

CO2
The number of rows: 3961
The number of columns: 5
--
Nitrous_oxide
The number of rows: 256
The number of columns: 5
--
Methane
The number of rows: 466
The number of columns: 5
--


In [13]:
# info(), checking features and its datatypes

for i, j in df_gases.items():
    print(i)
    j.info()
    print('----')

CO2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3961 entries, 0 to 3960
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    3961 non-null   object
 1   month   3961 non-null   object
 2   day     3961 non-null   object
 3   cycle   3961 non-null   object
 4   trend   3961 non-null   object
dtypes: object(5)
memory usage: 154.9+ KB
----
Nitrous_oxide
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        256 non-null    object
 1   average     256 non-null    object
 2   trend       256 non-null    object
 3   averageUnc  256 non-null    object
 4   trendUnc    256 non-null    object
dtypes: object(5)
memory usage: 10.1+ KB
----
Methane
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466 entries, 0 to 465
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dt

#### Observation:

- From the above observations of records, all the features are numerical datatype but stored in datatype object.
- We will convert date, year, month, day columns in respetive datetime datatype.
- Other features, we'll convert them in numerical datatype.

<div class="alert alert-block alert-info">
<b>Note:</b> 
    
- We will pre-process, transform, clean data wherever necessary. 

- This will help us in the data visualization part.
</div>

In [14]:
# copying the dataframe before transforming data
# incase we need to revert back

df2_co2 = df_co2.copy()

df2_n2o = df_n2o.copy()

df2_methane = df_methane.copy()

In [15]:
df2_gases = {'CO2': df2_co2, 'Nitrous_oxide': df2_n2o, 'Methane': df2_methane}

## Data Pre-processing

#### CO2:

In [16]:
# step 1: filling 0 in month, day columns. For ex: 01, 02, 03 instead of 1, 2, 3
df2_co2['month'] = df2_co2['month'].apply(lambda x: str(x).zfill(2))
df2_co2['day'] = df2_co2['day'].apply(lambda x: str(x).zfill(2))

# step 2: storing it in yyyy-mm-dd format
df2_co2['date'] = df2_co2['year'] + '-' + df2_co2['month'] + '-' + df2_co2['day']

# step 3: converting to date
df2_co2['date'] = pd.to_datetime(df2_co2['date'])

In [17]:
# converting to float

df2_co2['cycle'] = df2_co2['cycle'].astype(float)
df2_co2['trend'] = df2_co2['trend'].astype(float)

In [18]:
# arranging the columns and dropping year, month, day
df2_co2 = df2_co2.loc[:, ['date', 'cycle', 'trend']]

df2_co2.head(2)

# we have clean date column in date datatype. 
# we could easily fetch year, month, day using dt.year, dt.month, dt.day

Unnamed: 0,date,cycle,trend
0,2013-02-06,396.08,394.59
1,2013-02-07,396.1,394.59


#### Nitrous oxide and Methane:

In [19]:
df2_n2o.head(2)

Unnamed: 0,date,average,trend,averageUnc,trendUnc
0,2002.5,316.85,316.88,0.14,0.13
1,2002.6,316.83,316.92,0.14,0.13


In [20]:
df2_methane.head(2)

Unnamed: 0,date,average,trend,averageUnc,trendUnc
0,1984.11,1653.82,1649.98,0.96,0.58
1,1984.12,1656.19,1651.07,1.06,0.58


- Date column contains year and month. we'll separate it out and store it in yyyy-mm format.

In [21]:
# step 1: separate out the year and month
df2_n2o[['year', 'month']] = df2_n2o['date'].str.split('.', expand=True)
df2_methane[['year', 'month']] = df2_methane['date'].str.split('.', expand=True)

# step 2: fill the month with 0 if single digit
df2_n2o['month'] = df2_n2o['month'].apply(lambda x: str(x).zfill(2))
df2_methane['month'] = df2_methane['month'].apply(lambda x: str(x).zfill(2))

# step 3: storing it in yyyy-mm format
df2_n2o['date'] = df2_n2o['year'] + '-' + df2_n2o['month']
df2_methane['date'] = df2_methane['year'] + '-' + df2_methane['month']

# step 4: converting to date
df2_n2o['date'] = pd.to_datetime(df2_n2o['date'])
df2_methane['date'] = pd.to_datetime(df2_methane['date'])

In [22]:
# no2
df2_n2o['average'] = df2_n2o['average'].astype(float)
df2_n2o['trend'] = df2_n2o['trend'].astype(float)
df2_n2o['averageUnc'] = df2_n2o['averageUnc'].astype(float)
df2_n2o['trendUnc'] = df2_n2o['trendUnc'].astype(float)

# selecting and arranging necessary columns
df2_n2o = df2_n2o.loc[:, ['date', 'average', 'trend', 'averageUnc', 'trendUnc']]

# methane
df2_methane['average'] = df2_methane['average'].astype(float)
df2_methane['trend'] = df2_methane['trend'].astype(float)
df2_methane['averageUnc'] = df2_methane['averageUnc'].astype(float)
df2_methane['trendUnc'] = df2_methane['trendUnc'].astype(float)

# selecting and arranging necessary columns
df2_methane = df2_methane.loc[:, ['date', 'average', 'trend', 'averageUnc', 'trendUnc']]

In [23]:
df2_n2o.head(2)

Unnamed: 0,date,average,trend,averageUnc,trendUnc
0,2002-05-01,316.85,316.88,0.14,0.13
1,2002-06-01,316.83,316.92,0.14,0.13


In [24]:
df2_methane.head(2)

Unnamed: 0,date,average,trend,averageUnc,trendUnc
0,1984-11-01,1653.82,1649.98,0.96,0.58
1,1984-12-01,1656.19,1651.07,1.06,0.58


- Date column is in date datatype.
- Other columns are converted to float.

### Statistical Summary

In [25]:
# statistical info of the numerical features

df2_co2.describe()

Unnamed: 0,date,cycle,trend
count,3961,3961.0,3961.0
mean,2018-07-10 00:00:00.000000256,407.658712,407.672946
min,2013-02-06 00:00:00,392.61,394.59
25%,2015-10-24 00:00:00,400.88,400.65
50%,2018-07-10 00:00:00,407.81,407.76
75%,2021-03-26 00:00:00,414.29,414.36
max,2023-12-11 00:00:00,421.52,420.54
std,,7.799347,7.617709


In [26]:
df2_n2o.describe()

Unnamed: 0,date,average,trend,averageUnc,trendUnc
count,256,256.0,256.0,256.0,256.0
mean,2012-12-15 16:30:00,325.911406,325.913047,-0.137656,-0.148242
min,2002-05-01 00:00:00,316.82,316.88,-9.99,-9.99
25%,2007-08-24 06:00:00,320.4325,320.6275,0.13,0.12
50%,2012-12-16 12:00:00,325.555,325.465,0.14,0.13
75%,2018-04-08 12:00:00,330.7,330.6875,0.14,0.1325
max,2023-08-01 00:00:00,336.75,336.94,0.16,0.15
std,,5.843039,5.840111,1.655183,1.653408


In [27]:
df2_methane.describe()

Unnamed: 0,date,average,trend,averageUnc,trendUnc
count,466,466.0,466.0,466.0,466.0
mean,2004-03-16 18:26:15.965665280,1781.388948,1781.406309,0.884528,0.459657
min,1984-11-01 00:00:00,1646.8,1649.98,-9.99,-9.99
25%,1994-07-08 18:00:00,1742.235,1743.0375,0.9,0.5425
50%,2004-03-16 12:00:00,1776.17,1774.735,1.02,0.62
75%,2013-11-23 12:00:00,1816.685,1815.9125,1.17,0.68
max,2023-08-01 00:00:00,1924.54,1926.26,1.88,0.98
std,,62.76341,62.711949,1.361514,1.295443


### Checking for Null Values

In [28]:
# checking for null values

for name, df in df2_gases.items():
    print(name)
    print(df.isna().sum())
    print('--')

CO2
year     0
month    0
day      0
cycle    0
trend    0
date     0
dtype: int64
--
Nitrous_oxide
date          0
average       0
trend         0
averageUnc    0
trendUnc      0
year          0
month         0
dtype: int64
--
Methane
date          0
average       0
trend         0
averageUnc    0
trendUnc      0
year          0
month         0
dtype: int64
--


- There are no null values in our time series data

## <font size='5' color='#1ABC9C'>Data Visualization</font>

### Current Status of CO2, Nitrous Oxide, Methane

- We will check the current trend i.e, last 12 months data to understand what is the concentration of these gases.

In [29]:
df3_co2 = df2_co2.copy()
df3_n2o = df2_n2o.copy()
df3_methane = df2_methane.copy()

df3_gases = {'CO2': df3_co2, 'Nitrous_oxide': df3_n2o, 'Methane': df3_methane}

In [31]:
# getting last 12 months data for these gases

def last_12_months(df, name):
    df = df.sort_values(by='date', ascending=False)
    
    df_max_date = max(df['date'])
    twelve_months_ago = df_max_date - timedelta(days=365)
    
    # filtering last 12 months data
    df_last_12_months = df[df['date'] >= twelve_months_ago]
    
    print(name)
    print(min(df_last_12_months['date'].dt.date), max(df_last_12_months['date'].dt.date), '\n')
    
    return df_last_12_months

In [32]:
df_last12_co2 = last_12_months(df3_co2, 'CO2')
df_last12_n2o = last_12_months(df3_n2o, 'N2O')
df_last12_methane = last_12_months(df3_methane, 'Methane')

CO2
2022-12-11 2023-12-11 

N2O
2022-08-01 2023-08-01 

Methane
2022-08-01 2023-08-01 



In [33]:
print('Concentration of gases as of latest available date:\n')

print('CO2:\n', df3_co2[df3_co2['date'].dt.date == max(df3_co2['date'].dt.date)][['date', 'cycle']].reset_index(), '\n')

print('Nitrous-Oxide:\n', df3_n2o[df3_n2o['date'].dt.date == max(df3_n2o['date'].dt.date)][['date', 'average']].reset_index(), '\n')

print('Methane:\n', df3_methane[df3_methane['date'].dt.date == max(df3_methane['date'].dt.date)][['date', 'average']].reset_index(), '\n')

Concentration of gases as of latest available date:

CO2:
    index       date   cycle
0   3960 2023-12-11  421.52 

Nitrous-Oxide:
    index       date  average
0    255 2023-08-01   336.75 

Methane:
    index       date  average
0    465 2023-08-01  1919.41 



#### As of latest availabe dates, the concentration of gases are:

- CO2: 421.52


- Nitroux-Oxide: 336.75


- Methane: 1919.41

In [493]:
# Line plot - CO2

fig = px.line(df_last12_co2, x='date', y=['cycle'], labels={'variable': 'CO2 Concentration', 'value': 'Concentration'})
fig.update_layout(title='CO2 Gas Last 12 Months - Cycle', xaxis_title='Date', yaxis_title='CO2 Concentration (ppm)')
fig.update_layout(width=800, height=400)
fig.show()

- Note: cycle variable is the average value of CO2 concentration

In [494]:
# Line plot - NO2

fig = px.line(df_last12_n2o, x='date', y=['average'], labels={'variable': 'NO2 Concentration', 'value': 'Concentration'})
fig.update_layout(title='NO2 Gas Last 12 Months - Average', xaxis_title='Date', yaxis_title='NO2 Concentration (ppm)')
fig.update_layout(width=800, height=400)
fig.show()

In [495]:
# Line plot - Methane

fig = px.line(df_last12_n2o, x='date', y=['average'], labels={'variable': 'NO2 Concentration', 'value': 'Concentration'})
fig.update_layout(title='Methane Gas Last 12 Months - Average', xaxis_title='Date', yaxis_title='Methane Concentration (ppm)')
fig.update_layout(width=800, height=400)
fig.show()

### <font color ='#2ECC71' >Inferences for the Last 12 Months</font>

- Write inferences
- current concentration as of the latest date
- at what rate the concentration is increasing since last 12 months (calcualte if you need)

## Line Plot


- The line plot shows the variation in gas levels over the dates. It helps identify any noticeable trends.

In [386]:
# Line plot - CO2

fig = px.line(df3_co2, x='date', y=['cycle'], labels={'variable': 'CO2 Concentration', 'value': 'Concentration'})
fig.update_layout(title='CO2 Levels Over Time', xaxis_title='Date', yaxis_title='CO2 Concentration (ppm)')
fig.update_layout(width=800, height=400)
fig.show()

In [38]:
# Line plot - N2O

fig = px.line(df3_n2o, x='date', y=['average'], labels={'variable': 'N2O Concentration', 'value': 'Concentration'})
fig.update_layout(title='N2O Levels Over Time', xaxis_title='Date', yaxis_title='N2O Concentration (ppm)')
fig.update_layout(width=800, height=400)
fig.show()

In [39]:
# Line plot - Methane

fig = px.line(df3_methane, x='date', y=['average'], labels={'variable': 'Methane Concentration', 'value': 'Concentration'})
fig.update_layout(title='Methane Levels Over Time', xaxis_title='Date', yaxis_title='Methane Concentration (ppm)')
fig.update_layout(width=800, height=400)
fig.show()

### <font color ='#2ECC71' >Inferences: Line Plot</font>


- Write Inferences

## Monthly Averages

- The bar chart displays the average values for each month.

In [40]:
# monthly averages

fig3 = go.Figure()
fig3.add_trace(go.Bar(x=df3_co2['date'].dt.month, y=df3_co2['cycle'], name='Average'))
# fig3.add_trace(go.Bar(x=df3_co2['date'].dt.month, y=df3_co2['trend'], name='Trend'))
fig3.update_layout(title='Monthly Averages of CO2 Levels', xaxis_title='Month', yaxis_title='CO2 Levels')
fig3.update_layout(width=800, height=400)
fig3.show()

In [41]:
# monthly averages

fig3 = go.Figure()
fig3.add_trace(go.Bar(x=df3_n2o['date'].dt.month, y=df3_n2o['average'], name='Average'))
# fig3.add_trace(go.Bar(x=df3_n2o['date'].dt.month, y=df3_n2o['trend'], name='Trend'))
fig3.update_layout(title='Monthly Averages of N2O Levels', xaxis_title='Month', yaxis_title='N2O Levels')
fig3.update_layout(width=800, height=400)
fig3.show()

In [42]:
# monthly averages

fig3 = go.Figure()
fig3.add_trace(go.Bar(x=df3_methane['date'].dt.month, y=df3_methane['average'], name='Average'))
# fig3.add_trace(go.Bar(x=df3_methane['date'].dt.month, y=df3_methane['trend'], name='Trend'))
fig3.update_layout(title='Monthly Averages of Methane Levels', xaxis_title='Month', yaxis_title='Methane Levels')
fig3.update_layout(width=800, height=400)
fig3.show()

### <font color ='#2ECC71' >Inferences: Monthly Averages</font>


- Write Inferences

In [58]:
# making a copy of dataframes we have worked so far

df4_co2 = df3_co2.copy()
df4_n2o = df3_n2o.copy()
df4_methane = df3_methane.copy()

df4_gases = {'CO2': df4_co2, 'Nitrous_oxide': df4_n2o, 'Methane': df4_methane}

In [61]:
# setting date column as index

df4_co2.set_index('date', inplace = True)
df4_n2o.set_index('date', inplace = True)
df4_methane.set_index('date', inplace = True)

In [62]:
df4_co2.head(2)

Unnamed: 0_level_0,cycle,trend
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-02-06,396.08,394.59
2013-02-07,396.1,394.59


##  Seasonal Decomposition

In [74]:
# function to plot original line plot, trend plot, seasonal plot and residual plot

def seasonal_decompose_plot(name, df, column_name, period):
    result = seasonal_decompose(df[column_name], model= 'multiplicative', period=period)
    # result = seasonal_decompose(df['average'], model= 'additive', period=12)

    # original time series
    trace_original = go.Scatter(x=df.index, y=df[column_name], mode='lines', name='Original')

    # adding trend, seasonal, residual components
    trace_trend = go.Scatter(x=result.trend.index, y=result.trend, mode='lines', name='Trend')
    trace_seasonal = go.Scatter(x=result.seasonal.index, y=result.seasonal, mode='lines', name='Seasonal')
    trace_residual = go.Scatter(x=result.resid.index, y=result.resid, mode='lines', name='Residual')
    
    fig = make_subplots(rows=4, cols=1, shared_xaxes=True, vertical_spacing=0.1,
                        subplot_titles=['Original Time Series', 'Trend', 'Seasonal', 'Residual'])

    # adding traces to the subplot
    fig.add_trace(trace_original, row=1, col=1)
    fig.add_trace(trace_trend, row=2, col=1)
    fig.add_trace(trace_seasonal, row=3, col=1)
    fig.add_trace(trace_residual, row=4, col=1)

    fig.update_layout(title=f'Seasonal Decomposition of {name} Gas', xaxis_title='Date', height=800)
    fig.show()

In [75]:
seasonal_decompose_plot('CO2', df4_co2, 'cycle', period = 365)

In [76]:
seasonal_decompose_plot('Nitrous-Oxide', df4_n2o, 'average', period = 12)  

# period = 12 as we have 12 data points, each for a month in an year

In [77]:
seasonal_decompose_plot('Methane', df4_methane, 'average', 12)

### <font color ='#2ECC71' >Inferences: Seasonal Decomposition</font>

- Write inferences
- current concentration as of the latest date
- at what rate the concentration is increasing since last 12 months (calcualte if you need)

## Hypothesis Testing

#### Dickey-Fuller Test: 

- This test is used to check whether the time series is stationary or not. 
- The test results comprise of a Test Statistic and some Critical Values for different confidence levels. 
- If the test statistic is less than the critical value, we can say that time series is stationary.

#### Defining the Hypothesis for our data:


- Null Hypothesis (H0): Time series is stationary


- Alternate Hypothesis (H1): Time series is not stationary
    

In [144]:
from statsmodels.tsa.stattools import adfuller

In [335]:
def dickey_fuller_test(df, gas):
    df = df.iloc[:, :1] # getting only the average column with date as index
    
    print(gas, ':')
    result = adfuller(df, autolag = 'AIC')
    print('Test statistic:' , result[0])
    print('p-value:', result[1])
    print('Critical Values:' , result[4], '\n')
    
    # test statistic < critical value
    if result[0] <= result[4]['5%']:
        print('Result: We Failed to Reject the Null Hypothesis (Data is Stationary)\n', '--')
    else:
        print('Result: We Reject the Null Hypothesis (Data is Not Stationary)\n', '--')

In [336]:
dickey_fuller_test(df = df4_co2, gas = 'CO2')

CO2 :
Test statistic: -1.420497579617472
p-value: 0.5724108334895555
Critical Values: {'1%': -3.432015456217749, '5%': -2.862275907411977, '10%': -2.5671617319776505} 

Result: We Reject the Null Hypothesis (Data is Not Stationary)
 --


In [337]:
dickey_fuller_test(df = df4_n2o, gas = 'N2O')

N2O :
Test statistic: 2.70094543660696
p-value: 0.9990870685689679
Critical Values: {'1%': -3.457437824930831, '5%': -2.873459364726563, '10%': -2.573122099570008} 

Result: We Reject the Null Hypothesis (Data is Not Stationary)
 --


In [338]:
dickey_fuller_test(df = df4_methane, gas = 'Methane')

Methane :
Test statistic: 0.6796599443305956
p-value: 0.9894230841198877
Critical Values: {'1%': -3.4450311708077743, '5%': -2.8680131035505023, '10%': -2.570217924306441} 

Result: We Reject the Null Hypothesis (Data is Not Stationary)
 --


### <font color ='#2ECC71' >Inferences</font>

For all the gases:
- We Reject the Null Hypothesis
- Out data is Not Stationary
- Test Statistic is not less than critical value (0.05) for all the gases


We will use Differencing Method to make our data Stationary

## Differencing Method


In [496]:
def differencing(df):
    df = df.iloc[:, :1]
    df_diff = df - df.shift()
    df_diff.dropna(inplace=True)
    df_final = df_diff.copy()

    return df_final

In [340]:
diff_co2 = differencing(df = df4_co2)

diff_n2o = differencing(df = df4_n2o)

diff_methane = differencing(df = df4_methane)

In [490]:
# checking stationary using dickey_fuller_test after differencing

dickey_fuller_test(df = diff_co2, gas = 'CO2')

dickey_fuller_test(df = diff_n2o, gas = 'N2O')

dickey_fuller_test(df = diff_methane, gas = 'Methane')

CO2 :
Test statistic: -4.319197738853352
p-value: 0.00041141042821050917
Critical Values: {'1%': -3.432015880491298, '5%': -2.862276094831294, '10%': -2.5671618317520877} 

Result: We Failed to Reject the Null Hypothesis (Data is Stationary)
 --
N2O :
Test statistic: -3.399720271908227
p-value: 0.010968411216560316
Critical Values: {'1%': -3.458128284586202, '5%': -2.873761835239286, '10%': -2.5732834559706235} 

Result: We Failed to Reject the Null Hypothesis (Data is Stationary)
 --
Methane :
Test statistic: -3.5591312151501344
p-value: 0.0065921728675626835
Critical Values: {'1%': -3.4450973903602367, '5%': -2.868042229965336, '10%': -2.570233448893} 

Result: We Failed to Reject the Null Hypothesis (Data is Stationary)
 --


### <font color ='#2ECC71' >Inferences</font>

- We checked test statistic with 5% critical value
- Test Statistic is less than 5% critical value
- We can say with 95% confidence that our data is Stationary for all the gases.

### Original Line Plot and Line Plot After Differencing

In [387]:
def original_and_difference(df_original, df_diff, column_name, gas_name):
    fig = make_subplots(rows=1, cols=2, subplot_titles=['Original', 'After Differencing'])

    # original
    fig.add_trace(go.Scatter(x=df_original.index, y=df_original[column_name], mode='lines', name='Original'), row=1, col=1)
    fig.update_xaxes(title_text='Date', row=1, col=1)
    fig.update_yaxes(title_text='Value', row=1, col=1)

    # diff plot
    fig.add_trace(go.Scatter(x=df_diff.index, y=df_diff[column_name], mode='lines', name='After Differencing'), row=1, col=2)
    fig.update_xaxes(title_text='Date', row=1, col=2)
    fig.update_yaxes(title_text='Value', row=1, col=2)

    fig.update_layout(height=400, width=950, showlegend=False, title_text=f"{gas_name}: Original vs After Differencing Plot")
    fig.show()


In [489]:
original_and_difference(df4_co2, diff_co2, column_name = 'cycle', gas_name = 'CO2')

In [497]:
original_and_difference(df4_n2o, diff_n2o, column_name = 'average', gas_name = 'Nitrous-Oxide')

In [498]:
original_and_difference(df4_methane, diff_methane, column_name = 'average', gas_name = 'Methane')

- From the Plots, we can say that our data is Stationary.

In [499]:
df5_co2 = df4_co2.iloc[:, :1]
df5_n2o = df4_n2o.iloc[:, :1]
df5_methane = df4_methane.iloc[:, :1]

## Modelling

### Splitting the data

In [395]:
from sklearn.model_selection import train_test_split

In [482]:
# splitting differenced time series data
# df_train = earlier part
# df_test = later part

df_co2_train, df_co2_test = train_test_split(diff_co2, test_size=0.2, shuffle=False)

df_n2o_train, df_n2o_test = train_test_split(diff_n2o, test_size=0.2, shuffle=False)

df_methane_train, df_methane_test = train_test_split(diff_methane, test_size=0.2, shuffle=False)

In [483]:
def acf_and_pacf_lag(gas_name, df, column_name):
    acf_lag = acf(df[column_name].values)
    pacf_lag = pacf(df[column_name].values, method='ols')

    fig = make_subplots(rows=1, cols=2, subplot_titles=['Autocorrelation Function', 'Partial Autocorrelation Function'])

    # ACF
    fig.add_trace(go.Scatter(x=np.arange(len(acf_lag)), y=acf_lag, mode='lines', name='ACF'), row=1, col=1)
    fig.update_xaxes(title_text='Lag', row=1, col=1)
    fig.update_yaxes(title_text='Autocorrelation', row=1, col=1)

    # PACF
    fig.add_trace(go.Scatter(x=np.arange(len(pacf_lag)), y=pacf_lag, mode='lines', name='PACF'), row=1, col=2)
    fig.update_xaxes(title_text='Lag', row=1, col=2)
    fig.update_yaxes(title_text='Partial Autocorrelation', row=1, col=2)

    fig.update_layout(height=400, width=900, showlegend=False, title_text=f"{gas_name}: ACF and PACF Plots")
    fig.show()

In [484]:
acf_and_pacf_lag('CO2', df_co2_train, column_name = 'cycle')

In [485]:
acf_and_pacf_lag('N2O', df_n2o_train, column_name = 'average')

In [486]:
acf_and_pacf_lag('Methane', df_methane_train, column_name = 'average')

In [487]:
# model = ARIMA(df_methane_train.values, order=(2,0,3))
# model_fit = model.fit(disp=0)
# print(model_fit.summary())

# p, d, q

model = sm.tsa.arima.ARIMA(train_data, order=(1,1,0))
result = model.fit()

NotImplementedError: 
statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been removed in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and statsmodels.tsa.SARIMAX.

statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained. It also offers alternative specialized
parameter estimators.
