In [1]:
import pandas as pd

In [4]:
df = pd.read_csv("city_temperature.csv", low_memory=False)
df.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9


In [6]:
df.describe(include="all")

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
count,2906327,2906327,1455337,2906327,2906327.0,2906327.0,2906327.0,2906327.0
unique,7,125,52,321,,,,
top,North America,US,Texas,Springfield,,,,
freq,1556681,1455337,129711,18530,,,,
mean,,,,,6.469163,15.71682,2006.624,56.00492
std,,,,,3.456489,8.800534,23.38226,32.12359
min,,,,,1.0,0.0,200.0,-99.0
25%,,,,,3.0,8.0,2001.0,45.8
50%,,,,,6.0,16.0,2007.0,62.5
75%,,,,,9.0,23.0,2013.0,75.5


### Drop wrong values

In [18]:
df_clean = df.copy()
print(df_clean.shape)
df_clean = df_clean[df_clean["Day"] >= 1]
print(df_clean.shape)
df_clean = df_clean[df_clean["Year"] >= 1990]
print(df_clean.shape)

(2906327, 9)
(2906319, 9)
(2905879, 9)


### Lag Features

In [19]:

df_clean['AvgTemperature_Lag1'] = df_clean['AvgTemperature'].shift(1)

In [20]:
df_clean.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature,AvgTemperature_Lag1
0,Africa,Algeria,,Algiers,1,1,1995,64.2,
1,Africa,Algeria,,Algiers,1,2,1995,49.4,64.2
2,Africa,Algeria,,Algiers,1,3,1995,48.8,49.4
3,Africa,Algeria,,Algiers,1,4,1995,46.4,48.8
4,Africa,Algeria,,Algiers,1,5,1995,47.9,46.4


### Day of the year

In [22]:
df_clean["Date"] = pd.to_datetime(df_clean[["Year", "Month", "Day"]])

In [23]:
df_clean['DayOfYear'] = df_clean['Date'].dt.dayofyear

In [24]:
df_clean.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature,AvgTemperature_Lag1,Date,DayOfYear
0,Africa,Algeria,,Algiers,1,1,1995,64.2,,1995-01-01,1
1,Africa,Algeria,,Algiers,1,2,1995,49.4,64.2,1995-01-02,2
2,Africa,Algeria,,Algiers,1,3,1995,48.8,49.4,1995-01-03,3
3,Africa,Algeria,,Algiers,1,4,1995,46.4,48.8,1995-01-04,4
4,Africa,Algeria,,Algiers,1,5,1995,47.9,46.4,1995-01-05,5


### Add season

In [26]:
df_clean['Season'] = (df_clean['Date'].dt.month % 12 + 3) // 3

In [27]:
df_clean.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature,AvgTemperature_Lag1,Date,DayOfYear,Season
0,Africa,Algeria,,Algiers,1,1,1995,64.2,,1995-01-01,1,1
1,Africa,Algeria,,Algiers,1,2,1995,49.4,64.2,1995-01-02,2,1
2,Africa,Algeria,,Algiers,1,3,1995,48.8,49.4,1995-01-03,3,1
3,Africa,Algeria,,Algiers,1,4,1995,46.4,48.8,1995-01-04,4,1
4,Africa,Algeria,,Algiers,1,5,1995,47.9,46.4,1995-01-05,5,1


### Moving averages

In [30]:
df_clean['AvgTemperature_MovingAvg3'] = df_clean['AvgTemperature'].rolling(window=3).mean()

In [31]:
df_clean.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature,AvgTemperature_Lag1,Date,DayOfYear,Season,AvgTemperature_MovingAvg3
0,Africa,Algeria,,Algiers,1,1,1995,64.2,,1995-01-01,1,1,
1,Africa,Algeria,,Algiers,1,2,1995,49.4,64.2,1995-01-02,2,1,
2,Africa,Algeria,,Algiers,1,3,1995,48.8,49.4,1995-01-03,3,1,54.133333
3,Africa,Algeria,,Algiers,1,4,1995,46.4,48.8,1995-01-04,4,1,48.2
4,Africa,Algeria,,Algiers,1,5,1995,47.9,46.4,1995-01-05,5,1,47.7


### Fourier transform

In [34]:
from scipy.fft import fft
import numpy as np
df_clean['Temperature_Fourier'] = np.abs(fft(df_clean['AvgTemperature'].values))

In [35]:
df_clean.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature,AvgTemperature_Lag1,Date,DayOfYear,Season,AvgTemperature_MovingAvg3,Temperature_Fourier
0,Africa,Algeria,,Algiers,1,1,1995,64.2,,1995-01-01,1,1,,162813000.0
1,Africa,Algeria,,Algiers,1,2,1995,49.4,64.2,1995-01-02,2,1,,1105384.0
2,Africa,Algeria,,Algiers,1,3,1995,48.8,49.4,1995-01-03,3,1,54.133333,4868335.0
3,Africa,Algeria,,Algiers,1,4,1995,46.4,48.8,1995-01-04,4,1,48.2,2850553.0
4,Africa,Algeria,,Algiers,1,5,1995,47.9,46.4,1995-01-05,5,1,47.7,6833932.0


### Trends

In [36]:
from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(df_clean['AvgTemperature'], model='additive', period=1)
df_clean['Trend'] = result.trend
df_clean['Seasonal'] = result.seasonal
df_clean['Residual'] = result.resid

In [38]:
df_clean.describe()

Unnamed: 0,Month,Day,Year,AvgTemperature,AvgTemperature_Lag1,Date,DayOfYear,Season,AvgTemperature_MovingAvg3,Temperature_Fourier,Trend,Seasonal,Residual
count,2905879.0,2905879.0,2905879.0,2905879.0,2905878.0,2905879,2905879.0,2905879.0,2905877.0,2905879.0,2905879.0,2905879.0,2905879.0
mean,6.468345,15.7167,2006.897,56.02882,56.02881,2007-05-24 01:33:16.223654656,181.4496,2.489588,56.02881,28672.28,56.02882,0.0,0.0
min,1.0,1.0,1995.0,-99.0,-99.0,1995-01-01 00:00:00,1.0,1.0,-99.0,12.36922,-99.0,0.0,0.0
25%,3.0,8.0,2001.0,45.8,45.8,2001-02-07 00:00:00,90.0,2.0,44.96667,14310.21,45.8,0.0,0.0
50%,6.0,16.0,2007.0,62.5,62.5,2007-03-25 00:00:00,180.0,2.0,61.83333,22700.35,62.5,0.0,0.0
75%,9.0,23.0,2013.0,75.5,75.5,2013-08-09 00:00:00,273.0,3.0,75.03333,33763.81,75.5,0.0,0.0
max,12.0,31.0,2020.0,110.0,110.0,2020-05-13 00:00:00,366.0,4.0,109.3667,162813000.0,110.0,0.0,0.0
std,3.456095,8.800545,7.27782,32.06836,32.06836,,105.6576,1.113804,29.51093,106247.2,32.06836,0.0,0.0
