# Time series analysis

In [None]:
import pandas as pd

# Transformation
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# KMEANS
from sklearn.cluster import KMeans
from yellowbrick.cluster.elbow import KElbowVisualizer 
from yellowbrick.cluster import silhouette_visualizer 

# tslearn
from tslearn.utils import to_time_series, to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans

# Visualization
import plotly.express as px
import plotly.io as pio
pd.options.plotting.backend = "plotly"
pio.templates.default = "seaborn"
import matplotlib.pyplot as plt
# geo (visualization)

pd.set_option('display.max_columns', None)

In [None]:
# Read datasets
# index_col=0 takes the index from the CSV rather than creating it
# automatically (i.e. the unnamed column is removed).
df_temperature = pd.read_csv("../datasets/CityGlobalTemperature2000-2009.csv", index_col=0)

In [None]:
# Preview the first rows of the raw temperature records.
df_temperature.head()

## Data Analysis

Null values and Type analysis

In [None]:
# Column dtypes and non-null counts, used to spot missing values.
df_temperature.info()

I verify that each city has the same Country, Latitude and Longitude in all its records.

In [None]:
# Columns that must hold one single value across all the records of a city.
constant_columns = ['Country', 'Latitude', 'Longitude']

something_different = False

# Group once instead of re-filtering the whole frame for every city and column.
# sort=False visits the cities in order of first appearance, like unique() did.
for city, city_rows in df_temperature.groupby('City', sort=False):
    for column in constant_columns:
        values = city_rows[column]
        # Every record of the city must equal the city's first record.
        if not values.eq(values.iloc[0]).all():
            something_different = True
            print(city + ' has different ' + column)

if not something_different:
    print('Each city has the same Country, Latitude and Longitude')

Studying the uncertainty related to the average temperature of each city

In [None]:
# Order the records by measurement uncertainty, lowest first.
df_temperature.sort_values(by='AverageTemperatureUncertainty')

Defining a new dataframe df_city

In [None]:
# One row per city: the mean of the numeric measurements, plus the location
# attributes taken from the first record of the city.
city_groups = df_temperature.groupby('City')

# numeric_only=True is needed on pandas >= 2.0, where averaging non-numeric
# columns (e.g. Country and the string-encoded Latitude/Longitude) raises a
# TypeError instead of silently dropping them.
df_city = city_groups.mean(numeric_only=True)
df_city = df_city.join(city_groups[['Country', 'Latitude', 'Longitude']].first())

px.scatter_3d(df_city, x = 'Longitude', y='Latitude', z='AverageTemperatureUncertainty', color='Country')

Here we plot the average temperature of a City across time to appreciate the cyclic behaviour

In [None]:
# Records of one example city, re-indexed by the city name.
df_example = df_temperature.loc[df_temperature['City'] == 'Lahore'].set_index('City')

# Average temperature over time for that city.
fig = px.line(df_example, x='time', y="AverageTemperature")
fig.show()

Here we plot the average temperature uncertainty of a city across time to look for patterns.

Just by looking at the plot we can't find any.

In [None]:
# Records of one example city, re-indexed by the city name.
df_example = df_temperature.loc[df_temperature['City'] == 'Nagoya'].set_index('City')

# Average temperature uncertainty over time for that city.
fig = px.line(df_example, x='time', y="AverageTemperatureUncertainty")
fig.show()

## Data Transformation and Feature Engineering

Creating 12 new attributes, one per calendar month, holding the average temperature that occurred in that month across all the years.

Also creating 12 new attributes, one per calendar month, holding the average temperature uncertainty that occurred in that month across all the years.

In [None]:
# Parse the timestamps once and derive both calendar components from them.
timestamps = pd.DatetimeIndex(df_temperature['time'])
df_temperature['year'] = timestamps.year
df_temperature['month'] = timestamps.month


def _monthly_profile(records, value_column, suffix):
    """Return a City x month table with the mean of ``value_column``.

    Parameters
    ----------
    records : pandas.DataFrame
        Frame with 'City' and 'month' columns plus ``value_column``.
    value_column : str
        Column averaged within every (City, month) pair; mean is the
        default aggregation of ``pivot_table``.
    suffix : str
        Appended to each month number to build the column names,
        e.g. ``'_avg'`` gives '1_avg' ... '12_avg'.

    Returns
    -------
    pandas.DataFrame
        Indexed by City, one column per month present in ``records``.
    """
    return (
        records
        .pivot_table(values=value_column, index='City', columns='month')
        .rename_axis(None, axis=1)
        .add_suffix(suffix)
    )


df_moths_average_temp = _monthly_profile(df_temperature, 'AverageTemperature', '_avg')
df_moths_average_temp_var = _monthly_profile(df_temperature, 'AverageTemperatureUncertainty', '_var')

# Drop any monthly columns left by a previous run of this cell, so the joins
# below do not fail on overlapping column names when the cell is re-executed.
monthly_columns = list(df_moths_average_temp.columns) + list(df_moths_average_temp_var.columns)
df_city = (
    df_city
    .drop(columns=monthly_columns, errors='ignore')
    .join(df_moths_average_temp, on='City')
    .join(df_moths_average_temp_var, on='City')
)
df_city

Transforming latitude and longitude from string to float

In [None]:
# conversion functions
def latitude_to_float(latitude):
    """Convert a hemisphere-suffixed latitude string to signed degrees.

    Parameters
    ----------
    latitude : str
        Magnitude followed by 'N' or 'S', e.g. '57.05N'.

    Returns
    -------
    float or None
        Positive for 'N', negative for 'S'. If the last character is neither
        (or the string is empty), a message is printed and None is returned.
    """
    # Slice instead of indexing so that an empty string reaches the error
    # branch rather than raising IndexError.
    hemisphere = latitude[-1:]
    magnitude = latitude[:-1]

    if hemisphere == 'N':
        return float(magnitude)
    if hemisphere == 'S':
        return - float(magnitude)

    print('Conversion error: ' + latitude)
    return None

def longitude_to_float(longitude):
    """Convert a hemisphere-suffixed longitude string to signed degrees.

    Parameters
    ----------
    longitude : str
        Magnitude followed by 'E' or 'W', e.g. '10.33E'.

    Returns
    -------
    float or None
        Positive for 'E', negative for 'W'. If the last character is neither
        (or the string is empty), a message is printed and None is returned.
    """
    # Slice instead of indexing so that an empty string reaches the error
    # branch rather than raising IndexError.
    hemisphere = longitude[-1:]
    magnitude = longitude[:-1]

    if hemisphere == 'W':
        return - float(magnitude)
    if hemisphere == 'E':
        return float(magnitude)

    print('Conversion error: ' + longitude)
    return None


# Replace the string coordinates with their signed float equivalents.
df_city['Latitude'] = df_city['Latitude'].apply(latitude_to_float)
df_city['Longitude'] = df_city['Longitude'].apply(longitude_to_float)
df_city

In [None]:
# Check the resulting dtypes (Latitude/Longitude were just converted from strings).
df_city.info()

## Cluster

Below you can set the boolean values to True or False depending on whether you want to run the clustering algorithm on the average temperature, the temperature variance, or both.

After trying all the different combinations, the most meaningful results were obtained by clustering with respect to the average temperature along the months.

In [None]:
# Switches selecting which feature groups are used for clustering.
cluster_avg = True  # the twelve '<month>_avg' columns
cluster_var = False  # the twelve '<month>_var' columns
cluster_var_mean = False  # the overall 'AverageTemperatureUncertainty' column

Preparing the dataset

In [None]:
# Names of the monthly columns: '1_avg' ... '12_avg' and '1_var' ... '12_var'.
features_avg = [f'{month}_avg' for month in range(1, 13)]
features_var = [f'{month}_var' for month in range(1, 13)]

# Collect the feature groups enabled by the cluster_* switches, always in the
# order avg, var, overall uncertainty.
features = []
if cluster_avg:
    features += features_avg
if cluster_var:
    features += features_var
if cluster_var_mean:
    features.append('AverageTemperatureUncertainty')

# Same rows as df_city in the same order, with the City index replaced by
# positions 0..n-1.
df_data = df_city[features].reset_index(drop=True)

# Optional rescaling, currently disabled:
#df_data = pd.DataFrame(MinMaxScaler().fit_transform(df_data), columns=df_data.columns)
df_data

Trying various k for k-means and comparing metrics:

In [None]:
# random_state pins the k-means++ initialisation so that both curves are
# reproducible across kernel restarts.
model = KMeans(n_init=10, max_iter=100, init="k-means++", random_state=42)

# Elbow curve with the visualizer's default metric.
sse_visualizer = KElbowVisualizer(model, k=(2,30), timings=False)
sse_visualizer.fit(df_data)
sse_visualizer.show()

# Same sweep scored with the silhouette coefficient.
sil_visualizer = KElbowVisualizer(model, k=(2,30), timings=False, metric="silhouette")
sil_visualizer.fit(df_data)
sil_visualizer.show()

In [None]:
# Hard-coded number of clusters; the automatically detected alternative is
# sse_visualizer.elbow_value_.
optimal_k = 7

# One configuration, seed included, shared by the model that produces the
# stored labels and the model scored below. Previously the silhouette was
# computed on a KMeans with a different seed, n_init and max_iter, so the
# reported score did not describe the clustering saved in df_city.
kmeans_params = dict(n_clusters=optimal_k, n_init=10, max_iter=100, init="k-means++", random_state=42)

kmeans = KMeans(**kmeans_params)
kmeans.fit(df_data)

df_city["cluster_kmeans"] = kmeans.labels_.astype(str)

# A fresh estimator with identical parameters and seed is fitted on the same
# data by the visualizer.
silhouette_viz = silhouette_visualizer(KMeans(**kmeans_params), df_data)
print("The silhoutte score is: " + str(silhouette_viz.silhouette_score_))

### Cluster Validation

#### Centroid visualization

In [None]:
def _plot_centroid_profiles(centers, column_names):
    """Draw one line per cluster centroid over the given feature columns.

    Parameters
    ----------
    centers : array-like, one row per cluster
        Centroid coordinates restricted to the columns being plotted.
    column_names : sequence of str
        Tick labels, one per plotted column.
    """
    plt.figure(figsize=(15, 4))
    for cluster_id, center in enumerate(centers):
        plt.plot(center, marker='o', label='Cluster %s' % cluster_id)
    plt.xticks(range(0, len(column_names)), column_names, fontsize=15)
    plt.legend(fontsize=10)
    plt.show()


# The first 12 features are the monthly averages when cluster_avg is set,
# otherwise the monthly uncertainties.
if cluster_avg or cluster_var:
    _plot_centroid_profiles(kmeans.cluster_centers_[:, 0:12], df_data.columns[0:12])

# With both groups enabled, the uncertainty features occupy columns 12..23.
if cluster_avg and cluster_var:
    _plot_centroid_profiles(kmeans.cluster_centers_[:, 12:24], df_data.columns[12:24])



In [None]:
# Uncomment to check centroids in table
#df_city.groupby("cluster_kmeans").agg({"cluster_kmeans":"count", "AverageTemperature": "mean", '1_avg': "mean", '2_avg': "mean", '3_avg': "mean", '4_avg': "mean", '5_avg': "mean", '6_avg': "mean", '7_avg': "mean", '8_avg': "mean", '9_avg': "mean", '10_avg': "mean", '11_avg': "mean", '12_avg': "mean"}).sort_values(by="AverageTemperature", ascending=False).round(2).rename(columns={"cluster_kmeans": "cluster size"})

In [None]:
# Uncomment to check centroids in table
#df_city.groupby("cluster_kmeans").agg({"cluster_kmeans":"count", "AverageTemperatureUncertainty": "mean", '1_var': "mean", '2_var': "mean", '3_var': "mean", '4_var': "mean", '5_var': "mean", '6_var': "mean", '7_var': "mean", '8_var': "mean", '9_var': "mean", '10_var': "mean", '11_var': "mean", '12_var': "mean"}).sort_values(by="AverageTemperatureUncertainty", ascending=False).round(2).rename(columns={"cluster_kmeans": "cluster size"})

#### Geographical visualization of the clusters

In [None]:
# Plot every city on a map, coloured by its k-means cluster label.
px.scatter_geo(df_city, lat="Latitude", lon="Longitude", color="cluster_kmeans").show()

# Time series Cluster

K-means on the features related to the temperature variance across the months failed to give well-validated clusters, unlike what happened with the average temperature. So we tried the Dynamic Time Warping distance to look for patterns between the time series, in case the problem was related to a misalignment.

However, even these results were quite similar to those obtained with the Euclidean distance.

In [None]:
# One series per city: the row of monthly uncertainty values, in table order.
city_series = [monthly_values for _city, monthly_values in df_moths_average_temp_var.iterrows()]
ts_moths_average_temp_var = to_time_series_dataset(city_series)

In [None]:
# Time-series k-means with 3 clusters and the DTW metric; the seed is fixed.
km = TimeSeriesKMeans(n_clusters=3, metric="dtw",  random_state=0)
km.fit(ts_moths_average_temp_var)

In [None]:
# One line per DTW cluster centre across the twelve monthly uncertainty features.
plt.figure(figsize=(15, 4))
for cluster_id, center in enumerate(km.cluster_centers_):
    plt.plot(center, marker='o', label='Cluster %s' % cluster_id)
plt.xticks(range(len(features_var)), features_var, fontsize=15)
plt.legend(fontsize=10)
plt.show()

In [None]:
# km was fitted on the rows of df_moths_average_temp_var, so attach each label
# to that table's City index and let pandas align it with df_city by city
# name, instead of relying on both tables sharing the same row order.
df_city["cluster_kmeans_dtw"] = pd.Series(
    km.labels_.astype(str),
    index=df_moths_average_temp_var.index,
)

#### Geographical visualization of the clusters

In [None]:
# Plot every city on a map, coloured by its DTW k-means cluster label.
px.scatter_geo(df_city, lat="Latitude", lon="Longitude", color="cluster_kmeans_dtw").show()