## t-SNE Application to European Electricity Generation

This notebook applies t-SNE (t-distributed stochastic neighbour embedding) to European countries' electricity and heat generation data, producing a two-dimensional map in which countries can be grouped visually according to their energy mix.

Data source: Eurostat

In [1]:
import pandas as pd
from sklearn.manifold import TSNE
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go

In [2]:
# Load the processed generation data; the column pandas labelled
# 'Unnamed: 0' holds the country code, so it is renamed to 'Code'
df = pd.read_csv('data/processed_data.csv').rename(columns={'Unnamed: 0': 'Code'})
# Load the country-code lookup table
cc = pd.read_excel('data/countrycodes.xlsx')

In [3]:
# Attach the lookup columns from `cc` to each row of `df`, matching on
# 'Code'; a left join keeps every row of `df` even without a match
df = pd.merge(df, cc, how='left', on='Code')

In [4]:
# Preview the first five rows of the dataframe
df.head(n=5)

Unnamed: 0,Code,Solid fossil fuels,Manufactured gases,Peat and peat products,Oil shale and oil sands,Oil and petroleum products,Natural gas,Renewables and biofuels,Nuclear heat,total,Country,Region
0,PT,0.340511,0.0,0.0,0.0,0.026388,0.351867,0.281234,0.0,9504.878469,Portugal,S Europe
1,ME,0.753639,0.0,0.0,0.0,0.0,0.0,0.246361,0.0,391.21532,Montenegro,C&E Europe
2,LV,0.002322,0.0,0.0,0.0,0.000712,0.404943,0.592023,0.0,1588.10701,Latvia,N Europe
3,HU,0.14906,0.020058,0.0,0.0,0.003234,0.242202,0.101624,0.483822,8441.117178,Hungary,C&E Europe
4,BG,0.504866,0.0,0.0,0.0,0.014892,0.074904,0.051769,0.353568,11145.509744,Bulgaria,C&E Europe


In [5]:
# Apply t-SNE to data
# The eight generation-share columns are selected by name, so the input to
# the embedding does not depend on the column order of the dataframe.
fuel_columns = [
    'Solid fossil fuels',
    'Manufactured gases',
    'Peat and peat products',
    'Oil shale and oil sands',
    'Oil and petroleum products',
    'Natural gas',
    'Renewables and biofuels',
    'Nuclear heat',
]
# t-SNE is stochastic: a fixed random_state makes the embedding (and the plot
# built from it) reproducible after a kernel restart.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version before upgrading.
tsne = TSNE(perplexity = 20, learning_rate = 10, n_iter = 5000,
            random_state = 0).fit_transform(df[fuel_columns])
# Add results as columns in dataframe. The array columns are assigned
# positionally, so this does not rely on df having a default 0..n-1 index.
df['tsne1'] = tsne[:, 0]
df['tsne2'] = tsne[:, 1]

In [6]:
# Add column with text to display in plot: the country name in bold, then
# one "<br><fuel>: <share>%" entry per generation source.
fuel_columns = [
    'Solid fossil fuels',
    'Manufactured gases',
    'Peat and peat products',
    'Oil shale and oil sands',
    'Oil and petroleum products',
    'Natural gas',
    'Renewables and biofuels',
    'Nuclear heat',
]
hover_text = '<b>' + df['Country'].astype('str') + '</b>'
for fuel in fuel_columns:
    # Each value is multiplied by 100 and rounded before being shown with a
    # '%' sign; the label is the column name itself.
    share_pct = (df[fuel]*100).round().astype('str')
    hover_text = hover_text + '<br>' + fuel + ': ' + share_pct + '%'
df['text'] = hover_text

In [9]:
# Create plotly plot
# Styling applied to both axes: no zero line, tick labels or grid, with a
# mirrored axis line drawn around the plot area.
axis_style = dict(
    zeroline=False,
    mirror=True,
    showline=True,
    showticklabels=False,
    showgrid=False,
)

# One scatter trace per region, so each region gets its own legend entry.
data = []
for region in df['Region'].unique():
    in_region = df[df['Region'] == region]
    # Marker size is the 'total' column divided by 1100.
    marker_style = dict(
        size=in_region['total']/1100,
        opacity=0.9,
        colorscale='Portland',
    )
    data.append(
        go.Scatter(
            x=in_region['tsne1'],
            y=in_region['tsne2'],
            name=region,
            mode='markers',
            marker=marker_style,
            hoverinfo='text',
            text=in_region['text'],
        )
    )

layout = go.Layout(
    title='European Electricity Energy Mix: t-SNE representation',
    hovermode='closest',
    xaxis=dict(axis_style),
    yaxis=dict(axis_style),
    showlegend=True,
)

fig = go.Figure(data=data, layout=layout)
# Publish the figure under the given filename; the call is left as the last
# expression so that its return value is shown as the cell output.
py.plot(fig, filename='europe_electricity_tsne', auto_open=True)

'https://plot.ly/~sam.maule/118'