In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

### Data Import & Reading

In [None]:
global_temp=pd.read_csv('../input/global-climate-change/GlobalTemperatures.csv')

In [None]:
global_temp.head()

### Fetching year from date column

In [None]:
global_temp['dt'][0].split('-')[0]

In [None]:
def fetch_year(date):
    """Return the year part of a 'YYYY-MM-DD' style date string.

    The result is the text in front of the first '-' and stays a string
    (no integer conversion is performed). A string without any '-' is
    returned unchanged.
    """
    year, _separator, _remainder = date.partition('-')
    return year

In [None]:
global_temp['Year']= global_temp['dt'].apply(fetch_year)

In [None]:
global_temp.head()

In [None]:
## yearly mean of the land temperature and of its reported uncertainty
yearly_columns = ['LandAverageTemperature', 'LandAverageTemperatureUncertainty']
data = global_temp.groupby('Year')[yearly_columns].mean().reset_index()

In [None]:
data.head()

### Creating derived columns

In [None]:
## upper and lower edge of the uncertainty band around the yearly mean temperature

yearly_mean = data['LandAverageTemperature']
yearly_uncertainty = data['LandAverageTemperatureUncertainty']
data['Uncertainity_top'] = yearly_mean + yearly_uncertainty
data['Uncertainity_bottom'] = yearly_mean - yearly_uncertainty

In [None]:
data.head()

In [None]:
fig = px.line(data, x='Year', y=['LandAverageTemperature','Uncertainity_top','Uncertainity_bottom'], title='World avg.Land Temp')
fig.show()

#### We can notice that the avg. temp has been increasing in the past 30 years, indicating a rapid increase in global warming.

### Analyzing avg. temperature across different seasons

In [None]:
## changing date column to datetime

global_temp['dt'] = pd.to_datetime(global_temp['dt'])

In [None]:
global_temp.dtypes

In [None]:
## creating month col from date col
global_temp['month']=global_temp['dt'].dt.month

In [None]:
## defining function to map seasons on month col

def get_season(month):
    """Map a month number to the season label used in this analysis.

    3-4 -> 'spring', 5-6 -> 'summer', 7-8 -> 'rainy', 9-11 -> 'autumn'.
    Every other value (including 12, 1 and 2) -> 'winter'.
    """
    # (first month, last month, label); both bounds are inclusive
    season_ranges = (
        (3, 4, 'spring'),
        (5, 6, 'summer'),
        (7, 8, 'rainy'),
        (9, 11, 'autumn'),
    )
    for first_month, last_month, label in season_ranges:
        if first_month <= month <= last_month:
            return label
    # anything that matched no range above falls back to winter
    return 'winter'

In [None]:
global_temp['season']=global_temp['month'].apply(get_season)

In [None]:
global_temp.head()

In [None]:
## create a list of years
years = global_temp['Year'].unique()

In [None]:
## create empty season list
spring_temp = []
summer_temp = []
rainy_temp = []
autumn_temp = []
winter_temp = []

In [None]:
## mean land temperature per (year, season), computed in one grouped pass
## instead of re-filtering the whole frame five times for every year
season_order = ['spring', 'summer', 'rainy', 'autumn', 'winter']
seasonal_means = (
    global_temp
    .groupby(['Year', 'season'])['LandAverageTemperature']
    .mean()
    .unstack('season')
    # align the rows with `years` and guarantee one column per season;
    # a year/season pair without any rows becomes NaN
    .reindex(index=years, columns=season_order)
)

## the lists are rebuilt here (not appended to), so re-running this cell
## cannot make them longer than `years`
spring_temp = seasonal_means['spring'].tolist()
summer_temp = seasonal_means['summer'].tolist()
rainy_temp = seasonal_means['rainy'].tolist()
autumn_temp = seasonal_means['autumn'].tolist()
winter_temp = seasonal_means['winter'].tolist()

In [None]:
## creating a season df to store values of all seasons
season =pd.DataFrame()

In [None]:
## one row per year, one column per season
season = pd.DataFrame({
    'year': years,
    'spring_temp': spring_temp,
    'summer_temp': summer_temp,
    'rainy_temp': rainy_temp,
    'autumn_temp': autumn_temp,
    'winter_temp': winter_temp,
})

In [None]:
season.head()

In [None]:
fig=px.line(season, x='year', y=['spring_temp', 'summer_temp', 'rainy_temp', 'autumn_temp','winter_temp'],
         title='Avg.temp across season')
fig.show()

#### We can notice that over the past 3-4 decades the avg. seasonal temperatures have been increasing worldwide.

### Data Preprocessing
#### For our timeseries model we'll use cities dataset.

In [None]:
cities = pd.read_csv("../input/global-climate-change/GlobalLandTemperaturesByCity.csv")

In [None]:
cities.head()

In [None]:
cities.shape

In [None]:
## filtering USA data
usa = cities[cities['Country']== 'United States']

In [None]:
usa.shape

In [None]:
## extracting specific cities from usa data
## (the city list gets its own name so that the yearly `data` frame built
## earlier in the notebook is not overwritten by a list)

selected_cities = ['New York','Los Angeles','San Francisco']
data2 = usa[usa['City'].isin(selected_cities)]

In [None]:
data2.head()

In [None]:
data2=data2[['dt','AverageTemperature']]

In [None]:
data2.head()

In [None]:
## renaming columns
data2.columns=['Date','Temperature']

In [None]:
data2.head()

In [None]:
data2.dtypes

In [None]:
## changing date column to datetime
data2['Date']=pd.to_datetime(data2['Date'])

In [None]:
data2.shape

In [None]:
data2.isna().sum()

In [None]:
## since there are very few null values, we drop those rows
## (a new frame is bound instead of mutating in place, because data2 was
## derived by slicing another frame)
data2 = data2.dropna()

In [None]:
data2.shape

In [None]:
## setting the date column as index for timeseries model.
data2.set_index('Date',inplace=True)

In [None]:
data2.head()

### Data Stability Check

In [None]:
## let's visualize the data with a line plot
## NOTE(review): data2 still holds the rows of all three selected cities, so the
## same date can appear more than once on the x axis — confirm this is intended
sns.lineplot(x=data2.index, y=data2['Temperature'])

#### From the above plot, we are not able to draw concrete inference about data stability, therefore we'll use statistical test to check it.

### Augmented Dickey-Fuller test

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
test_result=adfuller(data2['Temperature'])

In [None]:
test_result

#### Based on the above test, where the p-value is > 0.05, we fail to reject the null hypothesis: the data has a unit root and is non-stationary.

### Making the data stationary

In [None]:
## making a copy
df = data2.copy()

In [None]:
df.head()

#### year has a 12 month cycle therefore we are going to shift the temp by 12 places and then calculate the temp difference.

In [None]:
## seasonal differencing: each value minus the value 12 rows earlier
## NOTE(review): shift(12) moves values by 12 rows. df still contains the rows
## of all three selected cities, and rows with a missing temperature were
## dropped, so "12 rows back" is not guaranteed to be the same city 12 months
## earlier — verify before relying on this as a 12-month lag.
df['first_temp_diff']= df['Temperature']-df['Temperature'].shift(12)

In [None]:
df.head(20)

In [None]:
df.shape

In [None]:
df.dropna(inplace=True)

In [None]:
df.shape

In [None]:
test_result1=adfuller(df['first_temp_diff'])

In [None]:
test_result1

#### The p-value is almost zero and data seems to be stationary, we can verify it by plotting.

In [None]:
df[['first_temp_diff']].plot(figsize=(25,12))

### Seasonality Check

In [None]:
## creating month column
data2['month']=data2.index.month

## creating year column
data2['year']=data2.index.year

In [None]:
data2.head()

In [None]:
pivot=data2.pivot_table(values='Temperature', index='month', columns='year')

In [None]:
pivot

In [None]:
monthly_seasonality=pivot.mean(axis=1)

In [None]:
monthly_seasonality.plot(figsize=(20,6))

#### The data shows monthly seasonality as the avg. temperature across all years show a rise in month of June to August.

### Moving Average Method

In [None]:
## for time series model we need stationary data
df.head()

In [None]:
## subset the column first_temp_diff
df= df[['first_temp_diff']]

In [None]:
df.head()

In [None]:
## caluclating rolling mean
df['first_temp_diff'].rolling(window=5).mean()

In [None]:
value = pd.DataFrame(df['first_temp_diff'])

In [None]:
## merging two results: the differenced series next to its 5-row rolling mean
## NOTE(review): rolling(window=5).mean() at a given row averages that row and
## the 4 rows before it, so the rolling-mean column already contains the value
## it is later compared against — consider adding .shift(1) if a true one-step
## forecast is intended (the evaluation slice would then need adjusting too).
temp_df= pd.concat([value, df['first_temp_diff'].rolling(window=5).mean()],axis=1)

In [None]:
## renaming columns
temp_df.columns=['actual_temp','forcast_temp']

In [None]:
temp_df.head()

### Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
## RMSE over the rows where the rolling mean exists; dropna() removes the
## leading rows without a rolling mean, whatever window size was used
scored = temp_df.dropna()
np.sqrt(mean_squared_error(scored['actual_temp'], scored['forcast_temp']))

#### The RMSE above indicates that, for any given monthly observation, the moving-average value differs from the actual (12-step differenced) temperature by roughly ±2.39.

### ARIMA Model

In [None]:
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf

In [None]:
df.head()

In [None]:
plot_acf(df['first_temp_diff'].dropna())

In [None]:
plot_pacf(df['first_temp_diff'].dropna())

In [None]:
df.isna().sum()

In [None]:
## NOTE(review): statsmodels.tsa.arima_model is the legacy ARIMA implementation;
## it was deprecated in statsmodels 0.12 and removed in 0.13. Its replacement is
## statsmodels.tsa.arima.model.ARIMA, whose forecast() returns the predictions
## directly instead of a tuple — the `[0]` indexing on forecast() further down
## has to be revisited when migrating.
from statsmodels.tsa.arima_model import ARIMA

In [None]:
df.shape

In [None]:
## positional train/test split: the first 6000 rows train the model,
## every remaining row is held out for evaluation

split_at = 6000
train_data = df.iloc[:split_at]
test_data = df.iloc[split_at:]

In [None]:
arima = ARIMA(train_data, order=(2,1,3))

In [None]:
model = arima.fit()

In [None]:
## forecasting the 10 steps that follow the end of the training data

model.forecast(steps=10)[0]

### Forecasting on Test data

In [None]:
## forecast one value per held-out row. The legacy ARIMA API returns a tuple
## whose first item holds the forecasts; the current statsmodels API returns
## the forecasts directly. Both shapes are accepted here so that `predictions`
## is always a plain array of forecast values.
forecast_output = model.forecast(steps=len(test_data))
predictions = np.asarray(
    forecast_output[0] if isinstance(forecast_output, tuple) else forecast_output
)

In [None]:
predictions

In [None]:
from pandas import DataFrame

In [None]:
predict = pd.DataFrame(predictions, columns=["pred"])

### Model Evaluation

In [None]:
np.sqrt(mean_squared_error(test_data, predictions))

### Conclusion:
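#### Compared to the moving-average RMSE of 2.39, the ARIMA RMSE is lower, so ARIMA looks like the better model for temperature forecasting. Note that the two figures are not computed on the same rows: the moving-average RMSE covers the whole series, while the ARIMA RMSE covers only the held-out test rows.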