# Time series analysis

In [None]:
import pandas as pd

# KMEANS
from sklearn.cluster import KMeans
from yellowbrick.cluster.elbow import KElbowVisualizer 
from yellowbrick.cluster import silhouette_visualizer 

# Visualization
import plotly.express as px
import plotly.io as pio
pd.options.plotting.backend = "plotly"
pio.templates.default = "seaborn"
import matplotlib.pyplot as plt
# geo (visualization)

pd.set_option('display.max_columns', None)

In [None]:
# Read datasets
# index_col=0 reuses the first (unnamed) CSV column as the index instead of
# letting pandas create a fresh one automatically.
df_temperature = pd.read_csv(
    "./datasets/CityGlobalTemperature2000-2009.csv",
    index_col=0,
)

In [None]:
# Preview the first rows of the loaded frame.
df_temperature.head()

## Data Analysis

Null values and Type analysis

In [None]:
# Column dtypes and non-null counts of the raw frame.
df_temperature.info()

I verify that each city has the same Country, Latitude and Longitude in all its records.

In [None]:
# Columns whose value is expected to be constant within each city.
city_constant_columns = ['Country', 'Latitude', 'Longitude']

# Count the distinct values of every checked column per city in a single groupby
# pass, instead of re-filtering the whole frame for each city and each column.
# sort=False keeps the cities in order of first appearance; dropna=False makes a
# missing value count as a value of its own, so "value + NaN" is still reported.
distinct_per_city = (
    df_temperature
    .groupby('City', sort=False)[city_constant_columns]
    .nunique(dropna=False)
)

something_different = False

for city, distinct_counts in distinct_per_city.iterrows():
    for column in city_constant_columns:
        if distinct_counts[column] > 1:
            something_different = True
            print(city + ' has different ' + column)

if not something_different:
    print('Each city has the same Country, Latitude and Longitude')

Studying the uncertainty related to the average temperature for each city

In [None]:
# Rows ordered by measurement uncertainty (sort_values sorts ascending by default).
df_temperature.sort_values(by='AverageTemperatureUncertainty')

In [None]:
# Group once and reuse the grouping for the numeric means and the per-city constants.
city_groups = df_temperature.groupby('City')

# numeric_only=True keeps the text columns (Country, Latitude, Longitude, ...) out of
# the mean: pandas >= 2.0 raises TypeError on them instead of silently dropping them.
df_city = city_groups.mean(numeric_only=True)

# Country and coordinates are taken from each city's first record.
df_city = df_city.join(city_groups[['Country', 'Latitude', 'Longitude']].first())

px.scatter_3d(df_city, x='Longitude', y='Latitude', z='AverageTemperatureUncertainty', color='Country')

## Data Transformation and Feature Engineering

Creating 12 new attributes, one per calendar month, each holding the average temperature that occurred in that month across all the years.

In [None]:
# Parse the timestamps once and derive both calendar fields from the same index.
timestamps = pd.DatetimeIndex(df_temperature['time'])
df_temperature['year'] = timestamps.year
df_temperature['month'] = timestamps.month

# One row per city and one column per month number found in the data; pivot_table
# averages AverageTemperature over all records of that (city, month) pair.
df_moths_average_temp = df_temperature.pivot_table(
    values='AverageTemperature',
    index='City',
    columns='month',
).rename_axis(None, axis=1)

# Drop month columns left over from a previous execution first, so re-running this
# cell does not fail with "columns overlap" in the join.
df_city = (
    df_city
    .drop(columns=df_moths_average_temp.columns, errors='ignore')
    .join(df_moths_average_temp, on='City')
)
df_city

Transforming latitude and longitude from string to float

In [None]:
# conversion functions
def latitude_to_float(latitude):
    """Convert a latitude such as ``'57.05N'`` into signed decimal degrees.

    Args:
        latitude: A string made of a number followed by the hemisphere letter
            ``N`` or ``S`` (case-insensitive, surrounding whitespace ignored),
            or a value that is already numeric.

    Returns:
        The latitude as a float, positive for ``N`` and negative for ``S``.
        Numeric input is returned unchanged (as a float). ``None`` is returned,
        after printing a ``Conversion error`` message, when the value is empty,
        is not a string, or does not end with a hemisphere letter.

    Raises:
        ValueError: If the text before the hemisphere letter is not a number.
    """
    # Already-converted values pass through, so applying the conversion twice
    # (e.g. re-running the notebook cell) neither fails nor wipes the column.
    if isinstance(latitude, (int, float)) and not isinstance(latitude, bool):
        return float(latitude)

    text = latitude.strip() if isinstance(latitude, str) else ''
    hemisphere = text[-1:].upper()  # slicing stays safe on an empty string

    if hemisphere == 'N':
        return float(text[:-1])
    if hemisphere == 'S':
        return -float(text[:-1])

    print('Conversion error: ' + str(latitude))
    return None

def longitude_to_float(longitude):
    """Convert a longitude such as ``'10.33E'`` into signed decimal degrees.

    Args:
        longitude: A string made of a number followed by the hemisphere letter
            ``E`` or ``W`` (case-insensitive, surrounding whitespace ignored),
            or a value that is already numeric.

    Returns:
        The longitude as a float, positive for ``E`` and negative for ``W``.
        Numeric input is returned unchanged (as a float). ``None`` is returned,
        after printing a ``Conversion error`` message, when the value is empty,
        is not a string, or does not end with a hemisphere letter.

    Raises:
        ValueError: If the text before the hemisphere letter is not a number.
    """
    # Already-converted values pass through, so applying the conversion twice
    # (e.g. re-running the notebook cell) neither fails nor wipes the column.
    if isinstance(longitude, (int, float)) and not isinstance(longitude, bool):
        return float(longitude)

    text = longitude.strip() if isinstance(longitude, str) else ''
    hemisphere = text[-1:].upper()  # slicing stays safe on an empty string

    if hemisphere == 'W':
        return -float(text[:-1])
    if hemisphere == 'E':
        return float(text[:-1])

    print('Conversion error: ' + str(longitude))
    return None


# applying conversions
# The converters are passed to apply directly; each one receives a single cell value.
df_city['Latitude'] = df_city['Latitude'].apply(latitude_to_float)
df_city['Longitude'] = df_city['Longitude'].apply(longitude_to_float)
df_city

In [None]:
# Column dtypes and non-null counts of the per-city frame.
df_city.info()

## Cluster

In [None]:
# The clustering features are the columns labelled with the month numbers 1..12.
features = list(range(1, 13))

# Keep only the feature columns and replace the index with a plain 0..n-1 range.
df_data = df_city.loc[:, features].reset_index(drop=True)
df_data

In [None]:
# A fixed seed makes the elbow and silhouette curves, and therefore the k picked
# from them, reproducible across kernel restarts.
model = KMeans(n_init=10, max_iter=100, init="k-means++", random_state=42)

# Elbow search on the visualizer's default metric over the k range given by (2, 30).
sse_visualizer = KElbowVisualizer(model, k=(2,30), timings=False)
sse_visualizer.fit(df_data)
sse_visualizer.show()

# Same search scored with the silhouette metric instead.
sil_visualizer = KElbowVisualizer(model, k=(2,30), timings=False, metric="silhouette")
sil_visualizer.fit(df_data)
sil_visualizer.show()

In [None]:
optimal_k = sse_visualizer.elbow_value_
# elbow_value_ is None when the visualizer could not locate an elbow; fail with a
# clear message instead of an obscure error from KMeans(n_clusters=None).
if optimal_k is None:
    raise ValueError("KElbowVisualizer did not detect an elbow; choose n_clusters manually")

# Seeded so the cluster labels are reproducible across runs.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, max_iter=100, init="k-means++", random_state=42)
kmeans.fit(df_data)

df_city["cluster_kmeans"] = kmeans.labels_.astype(str)

# Score the very model whose labels were stored above, not a second KMeans with
# different hyper-parameters, so the reported silhouette describes these clusters.
x = silhouette_visualizer(kmeans, df_data)
print("The silhoutte score is: " + str(x.silhouette_score_))

In [None]:
plt.figure(figsize=(15, 4))

# One line per centroid, drawn across the feature columns.
for i, center in enumerate(kmeans.cluster_centers_):
    plt.plot(center, marker='o', label='Cluster %s' % i)

# Label each x position with the matching feature column name.
tick_positions = range(len(df_data.columns))
plt.xticks(tick_positions, df_data.columns, fontsize=15)
plt.legend(fontsize=10)
plt.show()

In [None]:
# Per cluster: number of cities, mean overall temperature, and mean temperature
# for each of the month columns 1..12.
cluster_aggregations = {"cluster_kmeans": "count", "AverageTemperature": "mean"}
cluster_aggregations.update({month: "mean" for month in range(1, 13)})

(
    df_city
    .groupby("cluster_kmeans")
    .agg(cluster_aggregations)
    .sort_values(by="AverageTemperature", ascending=False)
    .round(2)
    .rename(columns={"cluster_kmeans": "cluster size"})
)

In [None]:
# Cities plotted by coordinates on a geographic projection, colored by cluster label.
px.scatter_geo(
    df_city,
    lat="Latitude",
    lon="Longitude",
    color="cluster_kmeans",
).show()

In [None]:
# Same cluster coloring on an OpenStreetMap tile background.
# NOTE(review): recent Plotly releases deprecate scatter_mapbox in favor of
# scatter_map - confirm against the installed version before switching.
px.scatter_mapbox(
    df_city,
    lat="Latitude",
    lon="Longitude",
    color="cluster_kmeans",
    zoom=1,
    mapbox_style="open-street-map",
).show()