# **In-Depth Climate Change Trend Analysis**
Raw Dataset Name: Daily Temperature of Major Cities<br>
Raw Dataset Link: https://www.kaggle.com/sudalairajkumar/daily-temperature-of-major-cities

In [None]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import seaborn as sns
from matplotlib.pyplot import plot
from matplotlib import pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from pandas.tseries.offsets import *

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the raw daily readings
df = pd.read_csv("../input/daily-temperature-of-major-cities/city_temperature.csv")

# Keep 1995 - 2019 only and drop invalid readings (recorded as -99), then renumber the rows
df = df[
    df['Year'].ge(1995) & df['Year'].lt(2020) & df['AvgTemperature'].ne(-99)
].reset_index(drop=True)

# Fahrenheit to Celsius
df['AvgTemperature'] = (df['AvgTemperature'] - 32) * (5/9)
df

In [None]:
df['AvgTemperature'].isnull().values.any()

In [None]:
# Yearly Temperature Dataframe
# Average temperature for every (Year, Region) pair
avgTemp_Reg = df.groupby(['Year', 'Region'])['AvgTemperature'].mean().reset_index()

# Use the year (parsed as a datetime) as the row index.
# The index is assigned without dropping any entries so that it always has
# exactly one label per row; an unparsable year becomes NaT instead of
# shortening the index and breaking the assignment.
avgTemp_Reg.index = pd.to_datetime(avgTemp_Reg['Year'], format='%Y', errors='coerce')

# The year now lives in the index, so the column is redundant
avgTemp_Reg = avgTemp_Reg.drop(columns=['Year'])
avgTemp_Reg

In [None]:
# Yearly Average Temperature Trend by Continental (1995 - 2019)
# `uniqueReg` is kept because a later cell deletes it by name
uniqueReg = pd.DataFrame(avgTemp_Reg.Region.unique(), columns=['Region'])

fig, ax = plt.subplots(1, figsize=(10, 25))
ax.set_title('Yearly Average Temperature Trend by Continental (1995 - 2019)')
ax.set_xlabel('Year')
ax.set_ylabel('Degree (Celsius)')
ax.grid()

# One line per region. Grouping replaces the previous approach of creating
# variables dynamically through globals(); sort=False keeps the regions in
# their order of first appearance, which is the order `unique()` returns.
# NOTE: this cell no longer injects `reg0`, `reg1`, ... or region-named
# variables into the notebook namespace.
for region_name, region_trend in avgTemp_Reg.groupby('Region', sort=False):
    ax.plot(region_trend.index, region_trend['AvgTemperature'], label=region_name, marker='^')

ax.legend(loc=2, ncol=2, prop={'size': 10}, frameon=True, shadow=True)
plt.show()

# **- Overall Preview for All Continentals**
The Middle East has risen well above its earlier trend, reaching its all-time yearly average high in 2018.<br>
This rising trend is worth a closer look, so I decided to dive into the Middle East region exclusively.<br>
The cells below try to determine the "main source" that pushed the region's average that high.

# **- Middle East Climate Change Analysis**

In [None]:
middleEast = df[df['Region'] == 'Middle East']
middleEast = middleEast.groupby(['Country', 'Year'])['AvgTemperature'].mean().reset_index()
middleEast

In [None]:
# Calculate Increase(Decrease) Rate per year
def _yearly_increase_rates(yearly):
    """Return the year-over-year % change of 'AvgTemperature' for each row.

    `yearly` must hold one row per (Country, Year), ordered by country then
    year, with the columns 'Country', 'Year' and 'AvgTemperature'.

    A rate is computed only when the previous row belongs to the same country
    and is either the preceding year or the 1995 baseline (adjustment for a
    gap right after 1995). Every other non-1995 row keeps None and its
    position is printed so it can be inspected.

    The result is an object-dtype Series (floats and None) aligned to
    `yearly.index`.
    """
    countries = yearly['Country'].to_numpy()
    years = yearly['Year'].to_numpy()
    temps = yearly['AvgTemperature'].to_numpy()

    rates = [None] * len(yearly)
    for v in range(len(yearly)):
        if years[v] == 1995:
            continue  # first year of the study period: nothing to compare with

        if v == 0:
            # No previous row exists; do not wrap around to the last row
            print(v)
            continue

        consecutive = years[v] - years[v-1] == 1       # Year Validation
        after_baseline = years[v-1] == 1995            # Adjustment for Missing Year
        if not (consecutive or after_baseline):
            print(v) # Print Error Line for Invalid Year Match
            continue

        if countries[v] != countries[v-1]:             # Country Validation
            print(v) # Print Error Line for Invalid Country Match
            continue

        rates[v] = float(round(((temps[v] - temps[v-1]) / temps[v-1]) * 100, 2))

    return pd.Series(rates, index=yearly.index, dtype=object)


# Assign the whole column at once instead of chained per-cell assignment
# (`frame['col'][v] = ...`), which is not guaranteed to write into the frame.
middleEast['IncreaseP'] = _yearly_increase_rates(middleEast)

middleEast

In [None]:
print(middleEast[middleEast['Country'] == 'Turkey']['IncreaseP'].mean())
middleEast[middleEast['Year'] == 2018].sort_values(by='IncreaseP', ascending=False)

In [None]:
middleEast[middleEast['Country'] == 'Turkey'].sort_values(by='AvgTemperature', ascending=False).head(5)

# - Preview for Middle East<br>
Combining the checks above, "Turkey" turns out to be the main reason the "Middle East" average peaked in 2018.<br>
Turkey's warming has become severe since the 2010s.<br>
<br><br>
I'll take a closer look at Turkey in the next few steps.

In [None]:
# Turkey's Climate Change
turkey = df[df['Country'] == 'Turkey'].reset_index().drop(columns = ['index'])
turkey_by_month = pd.pivot_table(data=turkey, index='Month', columns='Year', values='AvgTemperature', aggfunc='mean')
turkey_by_month

In [None]:
# Heatmap Plot
plt.figure(figsize=(15,6))
sns.heatmap(data=turkey_by_month, cmap='coolwarm', annot=True, fmt=".1f", annot_kws={'fontsize':12})
plt.title("Turkey Climate Change")
plt.tight_layout()

In [None]:
# Middle East Climate Change for References
middleEast_by_month = pd.pivot_table(data=df[df['Region'] == 'Middle East'], index='Month', columns='Year', values='AvgTemperature', aggfunc='mean')

plt.figure(figsize=(15,6))
sns.heatmap(data=middleEast_by_month, cmap='coolwarm', annot=True, fmt='.1f', annot_kws={'fontsize': 12})
plt.title('Middle East Climate Change')
plt.tight_layout()

In [None]:
# Middle East Fall Season Trend Line
# Mean temperature per (Year, Month) across the whole region
middle_east_month_avg = df[df['Region'] == 'Middle East'].groupby(['Year', 'Month'])['AvgTemperature'].mean().reset_index()


def _middle_east_month(month):
    """Return the regional yearly means for one calendar month, indexed by Year."""
    is_month = middle_east_month_avg['Month'] == month
    return middle_east_month_avg[is_month].drop(columns = 'Month').set_index(['Year'])


middle_east_sep = _middle_east_month(9)
middle_east_oct = _middle_east_month(10)
middle_east_nov = _middle_east_month(11)

# Create the three panels in a single call. Calling plt.subplots() first and
# then plt.subplot(1, 3, k) leaves an extra full-figure Axes underneath.
fig, (ax, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

panels = (
    (ax, middle_east_sep, 'Sep'),
    (ax2, middle_east_oct, 'Oct'),
    (ax3, middle_east_nov, 'Nov'),
)
for panel_ax, month_frame, month_label in panels:
    # Scatter of yearly means plus a fitted linear trend line
    sns.regplot(x=month_frame.index, y=month_frame.AvgTemperature, data=month_frame, fit_reg=True, ax=panel_ax).set_title(month_label)

sns.despine()

# **- Ongoing Climate Change of Fall Season in Middle East**
> > The heatmaps and  the trend line above tell us a lot more;<br>Turkey's fall season, and hence, the overall Middle East's fall season, (Sep - Nov) is getting warmer and warmer.

In [None]:
# Plot Correlation Matrix for Certain Region
# corrDataFrame creates Correlation Matrix Dataframe for Region
def corrDataFrame(region):
    """Return the correlation matrix of yearly mean temperatures for a region.

    Reads the notebook-level `df`. For the rows whose 'Region' equals
    `region`, a Year x Country table of mean 'AvgTemperature' is built, a
    column with the region-wide yearly mean is added as '<region> Avg', and
    gaps are filled with each column's own mean before correlating.

    Returns a square DataFrame rounded to 2 decimals whose index (named
    'Country') and columns are the countries followed by '<region> Avg'.
    """
    region_rows = df[df['Region'] == region]
    avg_label = str(region) + ' Avg'

    # One row per year, one column per country
    yearly_means = pd.pivot_table(
        data=region_rows,
        index='Year',
        columns='Country',
        values='AvgTemperature',
        aggfunc='mean',
    )

    # Region-wide yearly mean goes in as the last column
    yearly_means[avg_label] = region_rows.groupby('Year')['AvgTemperature'].mean()

    # Years a country has no data for are replaced by that country's mean
    yearly_means = yearly_means.fillna(yearly_means.mean())

    corr = yearly_means.corr().round(2)
    corr.index.name = 'Country'
    return corr


# corrMatrix plots Correlation Matrix for Region
def corrMatrix(region):
    """Draw an annotated heatmap titled 'Correlation Matrix'.

    `region` is the table handed to `sns.heatmap` (the notebook passes the
    output of `corrDataFrame`). Axis labels are blanked and the figure is
    shown immediately; nothing is returned.
    """
    plt.figure(figsize=(15,15))
    heat_ax = sns.heatmap(region, annot=True)

    # Blank out both axis labels
    for set_label in (heat_ax.set_xlabel, heat_ax.set_ylabel):
        set_label('')

    plt.title('Correlation Matrix')
    # Show Correlation Matrix
    plt.show()

In [None]:
middleEast_Corr = corrDataFrame('Middle East')
corrMatrix(middleEast_Corr)

# **- Summary of Middle East**
The correlation matrix above shows that most countries are strongly positively correlated with the Middle East average.<br>
In other words, all countries except Oman and Israel have been getting warmer and, as a result, the Middle East has become one of the hottest regions.

Oman has actually been getting cooler, but it lacks observations in some years, and those gaps were filled with mean values.

In [None]:
# Delete Unused Dataframe for Memory Allocation
del uniqueReg
del middleEast
del turkey_by_month
del middleEast_by_month
del middle_east_month_avg
del middleEast_Corr

# **- South/Central America (Carribean) Climate Change Analysis**

In [None]:
# Create Region-Specific DataFrame & Calculate Growth Rate per Year
def growth_rate_by_region(region):
    """Yearly mean temperature per country in `region`, with its year-over-year % change.

    Reads the notebook-level `df`.

    Parameters
    ----------
    region : value compared against `df['Region']`.

    Returns
    -------
    pandas.DataFrame
        One row per (Country, Year) with the columns 'Country', 'Year',
        'AvgTemperature' and 'IncreaseP'. 'IncreaseP' is an object column
        holding the % change versus the previous row (rounded to 2 decimals)
        or None when no valid previous row exists.

    A rate is computed only when the previous row belongs to the same country
    and is either the preceding year or the 1995 baseline (adjustment for a
    gap right after 1995). Every other non-1995 row keeps None, and its
    position plus the neighbouring rows are printed for inspection.
    """
    yearly = (
        df[df['Region'] == region]
        .groupby(['Country', 'Year'])['AvgTemperature']
        .mean()
        .reset_index()
    )
    countries = yearly['Country'].to_numpy()
    years = yearly['Year'].to_numpy()
    temps = yearly['AvgTemperature'].to_numpy()

    def report(position):
        # Print error line and show data near it
        print(position)
        print(pd.DataFrame(yearly.iloc[position:position+3, 0:2]))

    rates = [None] * len(yearly)
    for v in range(len(yearly)):
        if years[v] == 1995:
            continue  # first year of the study period: nothing to compare with

        if v == 0:
            # No previous row exists; do not wrap around to the last row
            report(v)
            continue

        consecutive = years[v] - years[v-1] == 1    # year's in order
        after_baseline = years[v-1] == 1995         # adjustment for missing year
        if not (consecutive or after_baseline):
            report(v)  # possible invalid year match
            continue

        if countries[v] != countries[v-1]:
            report(v)  # possible invalid country match
            continue

        rates[v] = float(round(((temps[v] - temps[v-1]) / temps[v-1]) * 100, 2))

    # Assign the whole column at once instead of chained per-cell assignment
    # (`frame['col'][v] = ...`), which is not guaranteed to write into the frame.
    yearly['IncreaseP'] = pd.Series(rates, index=yearly.index, dtype=object)
    return yearly

In [None]:
saGrowthRate = growth_rate_by_region('South/Central America & Carribean')
saGrowthRate.sample(6) # South/Central America & Carribean DataFrame with Yearly Growth Rate

The function flags three possibly invalid rows: the 315th, 326th, and 331st.<br><br>

The 315th and 326th rows show that Guyana has inconsistent yearly data.<br>
The 331st row shows that Haiti's data has only been collected since 1999.

In [None]:
saGrowthRate[saGrowthRate['Country'] == 'Guyana'] # Data Integrity Check for Guyana

In [None]:
saGrowthRate[saGrowthRate['Country'] == 'Haiti'] # Data Integrity Check for Haiti

Both countries have some missing years, but none of them produces an invalid output.<br>
Each inconsistent year is excluded from the calculation by leaving its value as "None".<br><br>

We're good to go!

In [None]:
saGrowthRate.sort_values(by='IncreaseP', ascending=False).head(10)

Unlike the Middle East, the picture here is mixed and it is hard to single out the main factor (country).<br>
Therefore, we'll look at the correlation matrix to find out which country correlates most with the regional average.

In [None]:
sa_Corr = corrDataFrame('South/Central America & Carribean')
corrMatrix(sa_Corr)

In [None]:
# Top 5 Correlation Countries
pd.DataFrame(sa_Corr.iloc[:, -1]).sort_values(by='South/Central America & Carribean Avg', ascending=False).head(6)

In [None]:
# Plot Yearly Temperature Trend of Top 3 Correlation Countries and South America
# Four side-by-side panels on one 20 x 5 figure
ax, ax2, ax3, ax4 = plt.subplots(1, 4, figsize=(20, 5))[1]

# One panel per country
for country_name, country_ax in (('Colombia', ax), ('Equador', ax2), ('Dominican Republic', ax3)):
    country_rows = saGrowthRate[saGrowthRate['Country'] == country_name]
    country_rows.plot(x='Year', y='AvgTemperature', legend=None, ax=country_ax, figsize=(20, 5), title=country_name);

# Last panel: mean over all countries of the region, per year
region_yearly_mean = saGrowthRate.groupby(['Year'])['AvgTemperature'].mean()
region_yearly_mean.plot(x='Year', y='AvgTemperature', legend=None, ax=ax4, figsize=(20, 5), title='South America Avg');

# **- Overview of South/Central America & Carribean Continental**
According to the graphs above, even though Colombia has the highest correlation,<br>
Equador and Dominican Republic show more dramatic trends, which pull the South America region's average upward.<br><br>
**Therefore, we'll look exclusively at Equador and Dominican Republic<br>
to understand what caused these extremes.**

In [None]:
# Monthly Trend Analysis
def monthly_pivot(region):
    """Return a Month x Year table of mean 'AvgTemperature' for one country.

    Reads the notebook-level `df`. Despite its name, `region` is matched
    against `df['Country']`.
    """
    country_rows = df.loc[df['Country'] == region].reset_index(drop=True)
    return country_rows.pivot_table(
        index='Month',
        columns='Year',
        values='AvgTemperature',
        aggfunc='mean',
    )

colombiaMonthly = monthly_pivot('Colombia')
colombiaMonthly

In [None]:
# Heatmap Plot
def heatmap_show(region, arg):
    """Draw an annotated 'coolwarm' heatmap of `region` titled '<arg> Climate Change'.

    `region` is the table handed to `sns.heatmap` (the notebook passes
    Month x Year pivots); `arg` is the name used in the title.
    """
    heatmap_options = {
        'cmap': 'coolwarm',
        'annot': True,
        'fmt': ".1f",                  # one decimal per cell
        'annot_kws': {'fontsize': 12},
    }
    plt.figure(figsize=(15,6))
    sns.heatmap(data=region, **heatmap_options)
    plt.title(str(arg) + " Climate Change")
    plt.tight_layout()

heatmap_show(colombiaMonthly, 'Colombia')

In [None]:
# Equador Monthly Trend
equadorMonthly = monthly_pivot('Equador')
equadorMonthly

In [None]:
# Equador Heatmap
heatmap_show(equadorMonthly, 'Equador')

# **- Equador Monthly Temperature Trend**<br><br>
Regardless of month, every season suddenly got much warmer from 2013 onward.<br>
I'd like to look deeper by comparing cities, to determine whether the warming trend applies nation-wide<br>
or whether only some cities have driven the national average that high.

# - What Has Caused Equador To Be That Hot?

To answer the question, we'd have to look at city by city and determine the source.

In [None]:
equador_cities = df[df['Country'] == 'Equador'].groupby(['City', 'Year', 'Month'])['AvgTemperature'].mean().reset_index()
equador_cities

In [None]:
# Prior to analysis, make sure all cities have equal observances
print(df[df['Country'] == 'Equador']['City'].unique())

In [None]:
df[df['City'] == 'Guayaquil'].shape

In [None]:
df[df['City'] == 'Quito'].shape

It looks like Quito has some missing values.<br>
Therefore, I'll make a function that shows how many missing values there are in each city in each year/month.

In [None]:
from calendar import monthrange

# Missing Value Counts
def find_missing_values_country_all(country, start_year=1995, end_year=2019):
    """Count, per city and calendar month, how many days have no reading.

    Reads the notebook-level `df` (already stripped of invalid readings, so
    those days count as missing too).

    Parameters
    ----------
    country : value compared against `df['Country']`.
    start_year, end_year : first and last year (inclusive) to report on.
        The defaults cover the 1995 - 2019 study period.

    Returns
    -------
    pandas.DataFrame
        Columns 'City', 'Year', 'Month', 'Missing Value Count', with one row
        for every city of the country (in order of first appearance) and
        every month of every requested year. The count is the number of days
        in that month minus the number of rows found for it.
    """
    country_rows = df.loc[df['Country'] == country, ['City', 'Year', 'Month']]
    cities = country_rows['City'].unique()

    # Count the rows of every (city, year, month) in a single pass instead of
    # filtering the frame once per combination.
    observed_days = country_rows.groupby(['City', 'Year', 'Month']).size().to_dict()

    records = []
    for city in cities:
        city_name = str(city)
        for year in range(start_year, end_year + 1):
            for month in range(1, 13):
                days_in_month = int(monthrange(year, month)[1])
                days_observed = int(observed_days.get((city, year, month), 0))
                records.append((city_name, year, month, days_in_month - days_observed))

    missing_value_count = pd.DataFrame(records, columns = ['City', 'Year','Month', 'Missing Value Count'])
    return missing_value_count

In [None]:
equador_missing_values = find_missing_values_country_all('Equador')
equador_missing_values

In [None]:
equador_missing_values[(equador_missing_values['City'] == 'Guayaquil') &  (equador_missing_values['Missing Value Count'] > 5)].sort_values(by='Missing Value Count', ascending=False)

As for Guayaquil,<br>
except for August 2004, the data looks promising that it has only few missing values.<br>
The global warming trend for Equador seems legitimate.

But it is possible that the other city, Quito, has more missing values.

In [None]:
equador_missing_values[(equador_missing_values['City'] == 'Quito') &  (equador_missing_values['Missing Value Count'] > 5)].sort_values(by='Missing Value Count', ascending=False)

Oops, Quito does not have any data in certain year/month.<br>
Let's find out since when it starts lacking observances.

In [None]:
equador_missing_values[equador_missing_values['City'] == 'Quito'].groupby(['Year'])['Missing Value Count'].sum()

Quito has very little data from 2013 onward, and none at all from 2015.<br>
Because the data simply does not exist, **filling up with mean IS NOT A GOOD IDEA.**<br><br>
Considering that Equador has only two cities (Guayaquil, Quito), and the temperature rose significantly from 2013,<br>
the rise may be an artifact of the data: the lack of data for Quito could be what caused the misleading trend.

In [None]:
# Compare Each City's Monthly Average Temperature BEFORE Year 2013
df[(df['City'] == 'Guayaquil') & (df['Year'] < 2013)].groupby(['Month'])['AvgTemperature'].mean()

In [None]:
df[(df['City'] == 'Quito') & (df['Year'] < 2013)].groupby(['Month'])['AvgTemperature'].mean()

# Lack Of Data For Quito Is The Cause!
- Quito has ZERO/VERY LITTLE data since 2013.
- Guayaquil is always a hotter place than Quito.
- As Guayaquil is THE ONLY source since 2013, the country's average temperature jumps dramatically, which is a misleading artifact!<br><br>
As shown above, Guayaquil is always a hotter place than Quito.<br>
We have to adjust the data to get a more reliable result.<br><br>
Therefore, I'll calculate the year-over-year growth rate of the monthly temperature in Guayaquil,<br>
then apply that rate to the corresponding monthly data in Quito to fill in the missing values.<br>
After that, I'll look at the new Equador data and see if it's still getting noticeably warmer.

# - Data Adjustment For Quito And Revisualize The Trend

In [None]:
# 1. Calculate Growth Rate of Monthly Temperature in Guayaquil
guayaquil_monthly = df[(df['City'] == 'Guayaquil') & (df['Year'] >= 2012)].reset_index().drop(columns = ['index'])
guayaquil_monthly = pd.pivot_table(data=guayaquil_monthly, index='Month', columns='Year', values='AvgTemperature', aggfunc='mean')
guayaquil_monthly

In [None]:
# Relative change of each month's mean temperature from one year column of
# `guayaquil_monthly` to the next: (next - current) / current
monthly_growth_rate = [
    (guayaquil_monthly.iloc[:, v+1] - guayaquil_monthly.iloc[:, v]) / guayaquil_monthly.iloc[:, v]
    for v in range(0, len(guayaquil_monthly.columns)-1)
]

guayaquil_growth_rate = pd.DataFrame(monthly_growth_rate).T
# Label every growth column with the year it leads into. Taking the labels
# from the data (instead of a hard-coded range) keeps them correct, and the
# assignment valid, whatever years `guayaquil_monthly` actually contains.
guayaquil_growth_rate.columns = guayaquil_monthly.columns[1:]
guayaquil_growth_rate

In [None]:
# 2. Apply Growth Rate of Temperature in Guayaquil to Quito
quito_monthly = df[(df['City'] == 'Quito')].reset_index().drop(columns = ['index'])
quito_monthly = pd.pivot_table(data=quito_monthly, index='Month', columns='Year', values='AvgTemperature', aggfunc='mean')
quito_monthly

In [None]:
# Start from Quito's columns for 2012 onward; 2012 is the base year the
# Guayaquil growth rates are applied to. Selecting by label (rather than by
# position 17) does not depend on how many earlier years Quito has, and
# .copy() makes sure the assignments below never write into `quito_monthly`.
quito_monthly_2013 = quito_monthly.loc[:, 2012:].copy()

# Add the years Quito has no data for, as float NaN placeholders labelled
# with integers like the existing year columns.
for missing_year in range(2015, 2020):
    quito_monthly_2013[missing_year] = np.nan

# Roll the base year forward: each year = previous year * (1 + Guayaquil's
# growth into that year). Column v here pairs with growth column v-1.
# NOTE: this also replaces whatever 2013/2014 values Quito had.
for v in range(1, len(quito_monthly_2013.columns)):
    quito_monthly_2013.iloc[:, v] =  quito_monthly_2013.iloc[:, v-1] * (1 + guayaquil_growth_rate.iloc[:, v-1])

quito_monthly_2013

In [None]:
# Create A New Quito Dataframe with Adjusted Values
quito_monthly_new = pd.concat((quito_monthly,quito_monthly_2013),sort=False).groupby(level=0).last()
quito_monthly_new

In [None]:
# Show Heatmap for Quito
heatmap_show(quito_monthly_new, 'Quito')

In [None]:
# 3. Create A New Equador Country Dataframe Using Updated Values
# Month x Year table of Guayaquil's mean temperature over all years
guayaquil_monthly_all = pd.pivot_table(data=df[df['City'] == 'Guayaquil'], index='Month', columns='Year', values='AvgTemperature', aggfunc='mean')

quito_monthly_new.columns = guayaquil_monthly_all.columns # Match column names

# Country value = plain average of the two cities, cell by cell
equadorMonthly_new = (guayaquil_monthly_all + quito_monthly_new) / 2
equadorMonthly_new

In [None]:
# Validation
print(((guayaquil_monthly_all.iloc[3, 18] + quito_monthly_new.iloc[3, 18]) / 2) == equadorMonthly_new.iloc[3, 18])

In [None]:
# 4. Now Visualize New Equador Dataset for Analysis
heatmap_show(equadorMonthly_new, 'Equador')

In [None]:
# Before Adjustment
ax = plt.subplot(1, 2, 1)
saGrowthRate[saGrowthRate['Country'] == 'Equador'].plot(x='Year', y='AvgTemperature', legend=None, ax=ax, figsize=(20, 5), title='Equador Before Adjustment');

# After Adjustment
ax2 = plt.subplot(1, 2, 2)
equadorMonthly_new.mean().plot(legend=None, ax=ax2, figsize=(20, 5), title='Equador After Adjustment');

After adjusting for the missing values, Equador's temperature no longer appears to rise that dramatically.<br>
I'll also adjust South America's overall trend later.

# - Data Adjustment For Entire South America And Revisualize The Trend

I'll be looking at all cities below to find out possible misleading countries.

In [None]:
### Caution: This method is so heavy that it takes some time to load ###

# Missing Value Counts for Entire Region
def find_missing_values_region(region):
    """Missing-day counts for every city of every country in `region`.

    Reads the notebook-level `df` and calls
    `find_missing_values_country_all` once per country.

    Returns a DataFrame with the columns 'Country', 'City', 'Year', 'Month',
    'Missing Value Count'. Each country's rows keep the index they had in the
    per-country result, so index labels repeat across countries. A region
    with no countries yields an empty frame with the same columns.
    """
    columns = ['Country', 'City', 'Year', 'Month', 'Missing Value Count']
    countries = df[df['Region'] == region]['Country'].unique()

    per_country = []
    for country in countries:
        country_missing = find_missing_values_country_all(country)
        country_missing['Country'] = str(country)
        per_country.append(country_missing)

    if not per_country:
        return pd.DataFrame(columns = columns)

    # Concatenate once at the end: DataFrame.append no longer exists in
    # pandas 2.x, and growing a frame inside the loop copies it every time.
    return pd.concat(per_country)[columns]

In [None]:
south_america_all = find_missing_values_region('South/Central America & Carribean')

In [None]:
south_america_all.groupby('Country')['Missing Value Count'].sum().sort_values(ascending = False)

**- Preview of Data Adjustment for Entire South America Region<br><br>**
The total number of days from 1995 to 2019 is 9,131.<br>
I'll look at the countries whose missing values exceed 10% of that total.<br>
These are Guyana, Bermuda, Haiti, Equador (which is already done), Suriname, and Peru.

- 1. Guyana

In [None]:
south_america_all[(south_america_all['Country'] == 'Guyana')].groupby('Year')['Missing Value Count'].sum()

Guyana has very little data.
