In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
from copy import copy
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go



In [None]:
!pip install jupyterthemes

In [None]:
from jupyterthemes import jtplot

# Select the jupyterthemes "monokai" plot style for the notebook context,
# with axis ticks shown and the grid hidden.
jtplot.style(
    theme="monokai",
    context="notebook",
    ticks=True,
    grid=False,
)

## 1. Exploratory Data Analysis:

In [None]:
# Load the per-country land temperature records and preview the first rows.
csv_path = "../input/global-land-temperatures-by-country/GlobalLandTemperaturesByCountry.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print a structural summary of the frame (columns, dtypes, non-null counts).
df.info()
# Observed by the author on the original run: 577462 rows and 4 columns;
# two of the columns are numeric and two are non-numeric.

In [None]:
# Count the distinct values in the "Country" column.
df["Country"].nunique()
# Observed by the author on the original run: 243 distinct country names.

In [None]:
# Count the missing values in each column.
df.isnull().sum()
# Observed by the author on the original run: two columns contain missing data.

In [None]:
# Overall summary statistics for the dataset.
df.describe()

In [None]:
# Show the records whose "AverageTemperature" value is above 36.
df[df["AverageTemperature"] > 36]

## 2.  Data Cleaning and Feature Engineering:

In [None]:
# Count the non-null entries of every column for each country, then display
# the countries ordered from most to fewest temperature readings.
records_per_country = df.groupby(by="Country").count()
country_group = records_per_country.reset_index()
country_group.sort_values(by="AverageTemperature", ascending=False)

In [None]:
# Bar chart: number of recorded average-temperature values per country.
fig = px.bar(
    data_frame=country_group,
    x="Country",
    y="AverageTemperature",
)
fig.show()

In [None]:
# Bar chart: number of recorded temperature-uncertainty values per country.
fig = px.bar(
    data_frame=country_group,
    x="Country",
    y="AverageTemperatureUncertainty",
)
fig.show()

In [None]:
# Histogram of how many average-temperature readings each country has.
px.histogram(country_group,x="AverageTemperature")

In [None]:
# Countries with many missing readings distort the distribution, so look only
# at countries that have more than 1500 recorded average-temperature values.
country_group.query("AverageTemperature > 1500")

In [None]:
# Same histogram as before, restricted to countries with more than 1500 readings.
px.histogram(
    country_group.query("AverageTemperature > 1500"),
    x="AverageTemperature",
)

In [None]:
# Replace missing values in both measurement columns with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column acts on an intermediate object, which pandas warns about and
# which leaves `df` unchanged once copy-on-write is enabled.
for column in ("AverageTemperature", "AverageTemperatureUncertainty"):
    df[column] = df[column].fillna(df[column].mean())
df.isnull().sum()

In [None]:
df["Country"].unique()
# Some countries appear twice: once under their plain name and once with an
# extra parenthesised qualifier (for example to mark the European territory).
# These duplicated entries need to be merged.

In [None]:
# Collect every country name that carries a parenthesised qualifier.
duplicates = [name for name in df["Country"].unique() if "(" in name]
duplicates

In [None]:
# Strip the trailing parenthesised qualifier (e.g. "Name (Europe)" -> "Name")
# so each qualified entry is merged into its plain-named counterpart.
# Working on the "Country" column directly avoids pairing an auto-generated
# list with a hand-written one by position, and leaves the other columns alone.
# NOTE(review): assumes every qualified name is the plain name followed by one
# parenthesised suffix — confirm against the list of qualified names.
df["Country"] = df["Country"].str.replace(r"\s*\([^)]*\)\s*$", "", regex=True)

df["Country"].unique()
# No country name carries a parenthesised qualifier any more.

## 3. Data Visualization:

In [None]:
# Country names in order of first appearance; preview the first three.
countries = list(df["Country"].unique())
countries[:3]

In [None]:
# Mean temperature per country, in the same order as `countries`.
# One groupby pass replaces a full-frame boolean filter per country;
# reindex() restores the order of `countries` because groupby sorts its keys.
mean_temperatures = (
    df.groupby("Country")["AverageTemperature"]
    .mean()
    .reindex(countries)
    .tolist()
)
mean_temperatures[:3]

In [None]:
# World map coloured by the mean temperature of each country.
data = [
    {
        "type": "choropleth",
        "locations": countries,
        "z": mean_temperatures,
        "locationmode": "country names",
    }
]

layout = {
    "title": "Average Global Country Temperatures",
    "geo": {
        "showframe": False,
        "showocean": True,
        "oceancolor": "aqua",
        "projection": {"type": "orthographic"},
    },
}
fig = {"data": data, "layout": layout}
py.iplot(fig, validate=False, filename="worldmap")

In [None]:
# Add a "Year" column holding the text before the first "-" of the "dt" string;
# the animated map is built with one frame per year.
# The vectorised .str accessor avoids calling a Python lambda once per row.
df["Year"] = df["dt"].str.split("-", n=1).str[0]
df.head()

In [None]:
# Animated world map built with plotly express: one frame per year, countries
# coloured by their average temperature.
animation_options = dict(
    locations="Country",
    locationmode="country names",
    color="AverageTemperature",
    hover_name="Country",
    animation_frame="Year",
    color_continuous_scale=px.colors.sequential.OrRd,
)
fig = px.choropleth(data_frame=df, **animation_options)
fig.show()

In [None]:
# Average every numeric column per year to obtain one global row per year.
# numeric_only=True is required because the frame also holds text columns
# ("dt", "Country"); recent pandas raises TypeError on them instead of
# silently dropping them.
df_global = df.groupby("Year").mean(numeric_only=True).reset_index()
df_global

In [None]:
# Records before 1850 have many gaps, so keep only the years from 1850 onwards.
year_as_int = df_global["Year"].astype(int)
df_global["Year"] = year_as_int
df_global = df_global[year_as_int >= 1850]
df_global.head()

In [None]:
# Line chart of the yearly global mean temperature.
data = go.Scatter(
    x=df_global["Year"],
    y=df_global["AverageTemperature"],
    name="Average Temperature",
    line=dict(color="red"),
)
layout = go.Layout(
    xaxis=dict(title="Year"),
    yaxis=dict(title="Average Temperature in Celsius"),
    title="Global Average Temperature Since 1850",
    showlegend=False,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
# The curve shows a clear upward trend in the global average temperature.

In [None]:
# Extract the records of a single country (Norway) with a fresh 0-based index.
is_norway = df["Country"] == "Norway"
norway_df = df.loc[is_norway].reset_index(drop=True)
norway_df

In [None]:
# Plot Norway's recorded average temperature against the year.
norway_trace = go.Scatter(
    x=norway_df["Year"],
    y=norway_df["AverageTemperature"],
    name="Norway Temperature Over Years",
)
fig = px.line(title="Norway Temperature Data")
fig.add_trace(norway_trace)
fig.show()

## 4. Moving Average Method:

<font color="blue">
In statistics, a moving average is a calculation used to analyze data points by creating a series of averages of different subsets of the full data set. In finance, a moving average (MA) is a stock indicator that is commonly used in technical analysis. The reason for calculating the moving average of a stock is to help smooth out the price data by creating a constantly updated average price.
</font>

In [None]:
df_global.head() # preview the yearly global averages that the forecast is built on

In [None]:
# Yearly global mean temperature as plain arrays, plotted against the year.
series = df_global["AverageTemperature"].to_numpy()
time = df_global["Year"].to_numpy()

plt.figure(figsize=(20, 15))
axes = plt.gca()
axes.plot(time, series)
axes.set_xlabel("Time")
axes.set_ylabel("Value")
axes.grid(True)

In [None]:
def moving_average_forecast(series, window_size):
    """Forecast each step as the mean of the preceding ``window_size`` values.

    Element ``t`` of the result is the mean of ``series[t:t + window_size]``,
    i.e. the forecast for position ``t + window_size``. With ``window_size=1``
    this is the naive forecast (each value predicts the next one).

    Parameters
    ----------
    series : array-like
        One-dimensional sequence of numeric values (NumPy array, list, ...).
    window_size : int
        Number of trailing values averaged per forecast; must be >= 1.

    Returns
    -------
    numpy.ndarray
        ``len(series) - window_size`` forecasts, or an empty array when the
        series is not longer than the window.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")
    values = np.asarray(series, dtype=float)
    if len(values) <= window_size:
        return np.array([])
    windows = np.lib.stride_tricks.sliding_window_view(values, window_size)
    # The last window has no following value to forecast, so drop it.
    return windows[:-1].mean(axis=1)

In [None]:
# Hold out everything from index `split_time` onwards as the test period.
split_time = 150
time_train, time_test = time[:split_time], time[split_time:]
series_train, series_test = series[:split_time], series[split_time:]
# With window_size=1 each forecast is simply the previous value, so the
# forecast array starts one position earlier than the test period.
moving_avg = moving_average_forecast(series, 1)[split_time - 1:]

# One figure for the actual test values, one for the forecast.
for values in (series_test, moving_avg):
    plt.figure(figsize=(15, 10))
    plt.plot(time_test, values)
    plt.xlabel("Time")
    plt.ylabel("Value")



In [None]:
import tensorflow

# Print the forecast errors on the test period: mean squared error first,
# then mean absolute error.
keras_metrics = tensorflow.keras.metrics
for error_fn in (keras_metrics.mean_squared_error, keras_metrics.mean_absolute_error):
    print(error_fn(series_test, moving_avg).numpy())

## 5. Deep Neural Networks for Time Series

In [None]:
# Display the yearly global averages.
df_global

In [None]:
# Use the yearly global series. The per-record frame `df` holds one row per
# country and date, so its temperature column is not a single time series.
series = df_global["AverageTemperature"].values
time = df_global["Year"].values
window_size = 1  # number of past steps fed to the model per example
batch_size = 10

In [None]:
import tensorflow as tf
def windowed_dataset(series, window_size, batch_size):
    """Turn a series into batched (input, target) windows for training.

    The series gets a trailing axis of size 1 and is cut into overlapping
    windows of ``window_size + 1`` consecutive steps (shift 1, incomplete
    windows dropped). Each window yields the pair
    ``(window[:-1], window[1:])``: the first ``window_size`` steps as input
    and the same window shifted by one step as target.

    Returns a ``tf.data.Dataset`` batched by ``batch_size`` with a prefetch
    buffer of 1.
    """
    span = window_size + 1
    dataset = tf.data.Dataset.from_tensor_slices(tf.expand_dims(series, axis=-1))
    windows = dataset.window(span, shift=1, drop_remainder=True)
    windows = windows.flat_map(lambda window: window.batch(span))
    pairs = windows.map(lambda window: (window[:-1], window[1:]))
    return pairs.batch(batch_size).prefetch(1)

In [None]:
window_size = 1
batch_size = 10
# Train on the temperature values (`series_train`); `time_train` holds the
# years and would make the network learn the calendar, not the temperature.
train_set = windowed_dataset(series_train, window_size, batch_size)
print(train_set)
print(series_train.shape)

# Causal 1-D convolution -> two stacked LSTMs -> small dense head.
model = tf.keras.models.Sequential([
  tf.keras.layers.Conv1D(filters=32, kernel_size=5,
                      strides=1, padding="causal",
                      activation="relu",
                      input_shape=[None, 1]),
  tf.keras.layers.LSTM(64, return_sequences=True),
  tf.keras.layers.LSTM(64, return_sequences=True),
  tf.keras.layers.Dense(30, activation="relu"),
  tf.keras.layers.Dense(10, activation="relu"),
  tf.keras.layers.Dense(1),
  # NOTE(review): the fixed x400 output scaling is not derived from anything
  # in this notebook — confirm it suits the range of the temperature values.
  tf.keras.layers.Lambda(lambda x: x * 400)
])

# Learning-rate sweep: start at 1e-8 and multiply by 10 every 20 epochs.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 1e-8 * 10**(epoch / 20))
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-8, momentum=0.9)
model.compile(loss=tf.keras.losses.Huber(),
              optimizer=optimizer,
              metrics=["mae"])
history = model.fit(train_set, epochs=100, callbacks=[lr_schedule])

In [None]:
# Plot every metric recorded per epoch during training.
training_log = pd.DataFrame(history.history)
training_log.plot()