In [1]:
import pandas as pd
import numpy as np

dataset: https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data

geografic visualization: https://www.kaggle.com/amelinvladislav/map-of-temperatures-and-analysis-of-global-warming

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("GlobalLandTemperaturesByCity.csv")

In [4]:
df = df.dropna()

In [5]:
df.head(3)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
5,1744-04-01,5.788,3.624,Århus,Denmark,57.05N,10.33E
6,1744-05-01,10.644,1.283,Århus,Denmark,57.05N,10.33E


In [6]:
df["dt"] = pd.to_datetime(df["dt"])

In [8]:
df = df[df["dt"].dt.year>2000]

del df["AverageTemperatureUncertainty"]

In [9]:
#global_temp_country_clear = df[~df['Country'].isin(
#    ['Denmark', 'Antarctica', 'France', 'Europe', 'Netherlands',
#     'United Kingdom', 'Africa', 'South America'])]
#df = 0
#global_temp_country_clear = global_temp_country_clear.replace(
#   ['Denmark (Europe)', 'France (Europe)', 'Netherlands (Europe)', 'United Kingdom (Europe)'],
#   ['Denmark', 'France', 'Netherlands', 'United Kingdom'])

#Let's average temperature for each country

## Gráfico generado con datos originales

In [10]:
countries = np.unique(df['Country'])
mean_temp = []
for country in countries:
    mean_temp.append(df[df['Country'] == country]['AverageTemperature'].mean())
    
data = [ dict(
        type = 'choropleth',
        locations = countries,
        z = mean_temp,
        locationmode = 'country names',
        text = countries,
        marker = dict(
            line = dict(color = 'rgb(0,0,0)', width = 1)),
            colorbar = dict(autotick = True, tickprefix = '', 
            title = '# Average\nTemperature,\n°C')
            )
       ]

layout = dict(
    title = 'Average land temperature in countries',
    geo = dict(
        showframe = False,
        showocean = True,
        oceancolor = 'rgb(0,255,255)',
        projection = dict(
        type = 'orthographic',
            rotation = dict(
                    lon = 60,
                    lat = 10),
        ),
        lonaxis =  dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
            ),
        lataxis = dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
                )
            ),
        )
fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='worldmap',image_width=1600,
    image_height=1200,
    show_link=True,)

## Elimino datos

In [29]:
df.head(3)

Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt,temp_pred
3086,18.934333,0.381,Århus,Denmark,57.05,10.33,2001-01-01,18.934333
3087,18.934333,0.328,Århus,Denmark,57.05,10.33,2001-02-01,18.934333
3088,18.934333,0.236,Århus,Denmark,57.05,10.33,2001-03-01,18.934333


In [30]:
countries = ["Niger", "Tunisia", "Ukraine", "Moldova", "Sweeden","Syria", "Congo","Austria","Mali", "Belgium", "Egypt"]

In [31]:
df_incomplete = df[["AverageTemperature","Country","Latitude","Longitude"]][~df.Country.isin(countries)]

In [32]:
df_incomplete.head(3)

Unnamed: 0,AverageTemperature,Country,Latitude,Longitude
3086,18.934333,Denmark,57.05,10.33
3087,18.934333,Denmark,57.05,10.33
3088,18.934333,Denmark,57.05,10.33


In [33]:
countries = np.unique(df_incomplete['Country'])
mean_temp = []
for country in countries:
    mean_temp.append(df_incomplete[df_incomplete['Country'] == country]['AverageTemperature'].mean())

data = [ dict(
        type = 'choropleth',
        locations = countries,
        z = mean_temp,
        locationmode = 'country names',
        text = countries,
        marker = dict(
            line = dict(color = 'rgb(0,0,0)', width = 1)),
            colorbar = dict(autotick = True, tickprefix = '', 
            title = '# Average\nTemperature,\n°C')
            )
       ]

layout = dict(
    title = 'Average land temperature in countries',
    geo = dict(
        showframe = False,
        showocean = True,
        oceancolor = 'rgb(0,255,255)',
        projection = dict(
        type = 'orthographic',
            rotation = dict(
                    lon = 60,
                    lat = 10),
        ),
        lonaxis =  dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
            ),
        lataxis = dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
                )
            ),
        )
fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='worldmap',image_width=800,
    image_height=600,
    show_link=True,)

## Reemplazando la data faltante

In [None]:
df_test = df[~df.Country.isin(df_incomplete.Country)]

In [None]:
df_test.Longitude = df_test["Longitude"].apply(lambda x: float(x[:-1]) if x[-1]=="E" else -float(x[:-1]))
df_test.Latitude = df_test["Latitude"].apply(lambda x: float(x[:-1]) if x[-1]=="N" else -float(x[:-1])) 

In [None]:
df_test.to_pickle("df_test.pkl")

In [None]:
df_incomplete.Longitude = df_incomplete["Longitude"].apply(lambda x: float(x[:-1]) if x[-1]=="E" else -float(x[:-1]))
df_incomplete.Latitude = df_incomplete["Latitude"].apply(lambda x: float(x[:-1]) if x[-1]=="N" else -float(x[:-1])) 

In [None]:
df_incomplete.to_pickle("df_train.pkl")

temp_city.Latitude = temp_city.Latitude.apply(lambda x: float(x[:-1]) if x[-1]=="N" else -float(x[:-1])),
temp_city.Longitude = temp_city.Longitude.apply(lambda x: float(x[:-1]) if x[-1]=="E" else -float(x[:-1]))
    

In [None]:
df_incomplete.head()

In [None]:
df_test = 0

## Entrenando el modelo

In [11]:
from sklearn import neighbors
from sklearn.externals import joblib
from sklearn import metrics

In [12]:
train = pd.read_pickle("df_train.pkl")
test = pd.read_pickle("df_test.pkl")

train.to_pickle("df_train.pkl")
test.to_pickle("df_test.pkl")

In [16]:
X = train[["Latitude", "Longitude"]]

In [17]:
y = np.array(train["AverageTemperature"]).reshape(-1, 1)

In [18]:
X = np.array(X)

In [None]:
knn = neighbors.KNeighborsRegressor(n_neighbors=3, weights="distance", metric="haversine")
y_ = knn.fit(X, y)

In [None]:
joblib.dump(knn, "geografic_temp_regresor.pkl")

## Usando el modelo de predicción

In [13]:
knn = joblib.load("geografic_temp_regresor.pkl")

In [19]:
test["temp_pred"] = knn.predict(test[["Latitude", "Longitude"]])

In [20]:
print('Mean Absolute Error:', metrics.mean_absolute_error(test["AverageTemperature"], test["temp_pred"]))  
print('Mean Squared Error:', metrics.mean_squared_error(test["AverageTemperature"], test["temp_pred"]))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(test["AverageTemperature"], test["temp_pred"])))

Mean Absolute Error: 8.99431754927442
Mean Squared Error: 138.18791369407444
Root Mean Squared Error: 11.755335541534935


In [21]:
test["AverageTemperature"] = test["temp_pred"]

In [22]:
df = pd.concat([test,train])

In [27]:
test[test.Country =="Uckraine"]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,temp_pred


In [34]:
countries = np.unique(df['Country'])
mean_temp = []
for country in countries:
    mean_temp.append(df[df['Country'] == country]['AverageTemperature'].mean())
    
data = [ dict(
        type = 'choropleth',
        locations = countries,
        z = mean_temp,
        locationmode = 'country names',
        text = countries,
        marker = dict(
            line = dict(color = 'rgb(0,0,0)', width = 1)),
            colorbar = dict(autotick = True, tickprefix = '', 
            title = '# Average\nTemperature,\n°C')
            )
       ]

layout = dict(
    title = 'Average land temperature in countries',
    geo = dict(
        showframe = False,
        showocean = True,
        oceancolor = 'rgb(0,255,255)',
        projection = dict(
        type = 'orthographic',
            rotation = dict(
                    lon = 60,
                    lat = 10),
        ),
        lonaxis =  dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
            ),
        lataxis = dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
                )
            ),
        )
fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='worldmap',image_width=1600,
    image_height=1200,
    show_link=True,)