In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import folium
import numpy as np

In [None]:
# Read the three input tables (operations, weather summary, station locations)
# in one pass; the order of the paths matches the order of the target names.
operations, weather, locations = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/world-war-ii/operations.csv",
        "../input/weatherww2/Summary of Weather.csv",
        "../input/weatherww2/Weather Station Locations.csv",
    )
)

In [None]:
# Preview the first rows of the raw operations table.
operations.head()

In [None]:
# Summary statistics for the numeric columns of the weather table.
weather.describe()

In [None]:
# Preview the first rows of the weather-station location table.
locations.head()

**Data Preprocessing**

In [None]:
# Discard rows that lack the attacking country, the target longitude or the
# takeoff longitude.
operations = operations.dropna(
    subset=['Country', 'Target Longitude', 'Takeoff Longitude']
)

# Columns that are removed from the operations table.
drop_list = [
    'Mission ID', 'Unit ID', 'Target ID', 'Altitude (Hundreds of Feet)',
    'Airborne Aircraft', 'Attacking Aircraft', 'Bombing Aircraft',
    'Aircraft Returned', 'Aircraft Failed', 'Aircraft Damaged',
    'Aircraft Lost', 'High Explosives', 'High Explosives Type',
    'Mission Type', 'High Explosives Weight (Pounds)',
    'High Explosives Weight (Tons)', 'Incendiary Devices',
    'Incendiary Devices Type', 'Incendiary Devices Weight (Pounds)',
    'Incendiary Devices Weight (Tons)', 'Fragmentation Devices',
    'Fragmentation Devices Type', 'Fragmentation Devices Weight (Pounds)',
    'Fragmentation Devices Weight (Tons)', 'Total Weight (Pounds)',
    'Total Weight (Tons)', 'Time Over Target', 'Bomb Damage Assessment',
    'Source ID',
]
operations = operations.drop(columns=drop_list)

# Remove two specific values by column position (evaluated after the drop above).
# Position 8 holds the takeoff latitude; the value is compared as the string "4248".
operations = operations.loc[operations.iloc[:, 8] != "4248"]
# NOTE(review): position 9 is presumably the takeoff longitude - confirm
# against operations.info().
operations = operations.loc[operations.iloc[:, 9] != 1355]

In [None]:
# Column dtypes and non-null counts of the operations table after preprocessing.
operations.info()

In [None]:
# Column dtypes and non-null counts of the station location table.
locations.info()

In [None]:
# Narrow the station table to its id, name, state/country id and coordinates.
station_columns = ['WBAN', 'NAME', 'STATE/COUNTRY ID', 'Latitude', 'Longitude']
locations = locations.loc[:, station_columns]
locations.info()

In [None]:
# Column dtypes and non-null counts of the full weather table.
weather.info()

In [None]:
# Keep only the station id, the observation date and the mean temperature.
weather_columns = ["STA", "Date", "MeanTemp"]
weather = weather.loc[:, weather_columns]
weather.info()

In [None]:
# Preview the weather table after the column selection.
weather.head()

In [None]:
# Preview the operations table after preprocessing.
operations.head()

**Data Visualization**

In [None]:
# Number of recorded operations per attacking country.
counts = operations.Country.value_counts()
print(counts)
plt.figure(figsize = (22,10))
# Pass the column explicitly as `x=`: recent seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=operations.Country)
plt.show()

In [None]:
# Top 10 aircraft series by number of recorded operations (printed);
# the plot below shows the counts for every series.
print(operations['Aircraft Series'].value_counts().head(10))
plt.figure(figsize = (22,10))
# Pass the column explicitly as `x=`: recent seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=operations['Aircraft Series'])
plt.show()

In [None]:
# Top 10 target countries by number of recorded operations (printed);
# the plot below shows the counts for every target country.
print(operations['Target Country'].value_counts().head(10))
plt.figure(figsize=(22,10))
# Pass the column explicitly as `x=`: recent seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=operations['Target Country'])
# Country names are long; rotate the tick labels so they do not overlap.
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Interactive map with one circle and one popup marker per operation's
# takeoff point.
# NOTE: the name `map` shadows the Python builtin for the rest of the
# notebook; it is kept so that any cell referring to it keeps working.
# NOTE(review): newer folium releases may no longer bundle the
# 'Stamen Terrain' tile set - confirm against the installed version.
map = folium.Map(location=[0,0],zoom_start = 4,tiles = 'Stamen Terrain')

# A single layer holds all circle markers instead of one layer per row.
takeoff_layer = folium.map.FeatureGroup()
map.add_child(takeoff_layer)

for index,row in operations.iterrows():
    try:
        # folium expects locations as [latitude, longitude].
        takeoff_point = [float(row['Takeoff Latitude']),
                         float(row['Takeoff Longitude'])]
        takeoff_layer.add_child(folium.CircleMarker(
            takeoff_point,
            radius = 5,
            color = 'red',
            fill_color = 'red'
        ))
        folium.Marker(takeoff_point,
                      popup = row["Takeoff Location"]).add_to(map)
    except (TypeError, ValueError):
        # Best effort: skip rows whose coordinates are missing or not numeric
        # rather than aborting the whole map.
        continue
map

* Let's focus on the war between the USA and Burma.
* In this war the USA bombed Burma (the city of Katha) from 1942 to 1945.
* The closest weather station to this war is BINDUKURI, and it has temperature records from 1943 to 1945.
* Now let's visualize this situation. Before visualizing, we need to convert the date feature into a datetime object.

In [None]:
# Look at the weather table again before selecting a single station.
weather.head()

In [None]:
# Station metadata row(s) for BINDUKURI, kept for inspection.
weather_station_id = locations[locations.NAME == 'BINDUKURI']
# Readings of station 32907.
# NOTE(review): presumably 32907 is the WBAN found in weather_station_id - confirm.
# .copy() makes weather_bin an independent frame, so converting the Date
# column below does not write into a slice of `weather`
# (avoids pandas' SettingWithCopyWarning).
weather_bin = weather[weather.STA == 32907].copy()
weather_bin['Date'] = pd.to_datetime(weather_bin['Date'])
plt.figure(figsize=(22,10))
plt.plot(weather_bin.Date,weather_bin.MeanTemp)
plt.title("Mean Temperature of Bindukuri Area")
plt.xlabel("Date")
plt.ylabel("Mean Temperature")
plt.show()

**Stationarity Check**

* As you can see from the plot above, our time series has seasonal variation: each year, the mean temperature is higher in summer and lower in winter.
* Now let's check the stationarity of the time series. We can check stationarity using the following methods:
1. Plotting rolling statistics: we pick a window (say, of size 6) and compute the rolling mean and variance (plotted below as the rolling standard deviation) to check stationarity.
2. Dickey-Fuller test: the test results consist of a test statistic and critical values for different confidence levels. If the test statistic is less than the critical value, we can say that the time series is stationary.

In [None]:
# Index the station readings by date; `timeseries` keeps the Date column as
# well, while `ts` holds only MeanTemp under the date index.
timeseries = weather_bin[["Date", "MeanTemp"]].set_index("Date", drop=False)
ts = timeseries.drop(columns="Date")

In [None]:
# adfuller library 
from statsmodels.tsa.stattools import adfuller
def check_adfuller(ts):
    """Run the augmented Dickey-Fuller test on ``ts`` and print the outcome.

    Prints the test statistic, the p-value and the critical values
    returned by ``adfuller`` (lag selection via AIC). Returns nothing.
    """
    outcome = adfuller(ts, autolag='AIC')
    # Positions 0, 1 and 4 of the result hold the values reported below.
    test_statistic, p_value, critical_values = outcome[0], outcome[1], outcome[4]
    print('Test Statistics:', test_statistic)
    print('P Value', p_value)
    print('Critical Value', critical_values)
#check mean std
def check_mean_std(ts, window=6):
    """Plot a series together with its rolling mean and rolling standard deviation.

    Parameters
    ----------
    ts : pandas Series or DataFrame
        The time series to inspect; it is plotted against its own index.
    window : int, default 6
        Number of consecutive observations in each rolling window.

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    # pd.rolling_mean / pd.rolling_std were removed from pandas; the
    # .rolling() accessor is the supported replacement.
    rolling = ts.rolling(window=window)
    rolmean = rolling.mean()
    rolstd = rolling.std()
    plt.figure(figsize=(22,10))
    plt.plot(ts,color='red',label='Original')
    plt.plot(rolmean,color = 'black',label='Rolling Mean')
    plt.plot(rolstd,color='green',label='Rolling STD')
    plt.xlabel("Date")
    plt.ylabel("Mean Temprature")
    plt.title("Rolling Mean & Standard Deviation")
    plt.legend()
    plt.show()
    
# Stationarity checks on the original series: rolling-statistics plot first,
# then the Dickey-Fuller test on the MeanTemp column.
check_mean_std(ts)
check_adfuller(ts.MeanTemp)
    

**How do we make a time series stationary?**
* As we mentioned before, there are two reasons behind the non-stationarity of a time series:
1. Trend: the mean varies over time. A stationary time series needs a constant mean.
2. Seasonality: variations at specific times. A stationary time series needs constant variation.
* First, solve the trend (constant mean) problem:
1. The most popular method is the moving average.
2. Moving average: we have a window that takes the average over the past 'n' samples, where 'n' is the window size.

In [None]:
# Smooth the series with a moving average over `window_size` observations.
window_size = 6
# pd.rolling_mean was removed from pandas; .rolling(...).mean() is the
# supported replacement and yields the same windowed mean.
moving_avg = ts.rolling(window=window_size).mean()
plt.figure(figsize=(22,10))
plt.plot(ts,color='red',label='Original')
plt.plot(moving_avg,color='black',label="moving avg mean")
plt.title("Mean Temperature of Bindukuri Area")
plt.xlabel("Date")
plt.ylabel("Mean Temperature")
plt.legend()
plt.show()

In [None]:
# Subtract the moving average from the series; the first rows have no full
# window and are therefore NaN, so they are dropped before the checks.
ts_moving_avg_diff = (ts - moving_avg).dropna()
check_mean_std(ts_moving_avg_diff)
check_adfuller(ts_moving_avg_diff.MeanTemp)

* Constant mean criterion: the mean looks constant, as you can see from the plot (black line) above. (yes, stationary)
Second is constant variance: it looks constant. (yes, stationary)
The test statistic is smaller than the 1% critical value, so we can say with 99% confidence that this is a stationary series. (yes, stationary)
* **We have achieved a stationary time series. However, let's look at one more method for removing trend and seasonality.
Differencing method: it is one of the most common methods. The idea is to take the difference between the time series and a shifted version of the time series.**

In [None]:
# First-order differencing: each value minus the previous observation.
ts_diff = ts.diff()
plt.figure(figsize=(22,10))
plt.plot(ts_diff)
plt.title("Differencing method")
plt.xlabel("Date")
plt.ylabel("Differencing Mean Temperature")
plt.show()

In [None]:
# The first differenced value has no predecessor and is NaN; remove it
# before running the stationarity checks.
ts_diff = ts_diff.dropna()
check_mean_std(ts_diff)
check_adfuller(ts_diff.MeanTemp)

Forecasting a Time Series
We learned two different methods, moving average and differencing, for removing the trend and seasonality problems.
* For prediction (forecasting) we will use the ts_diff time series, which is the result of the differencing method. There is no particular reason for choosing it.
* Also prediction method is ARIMA that is Auto-Regressive Integrated Moving Averages.
* AR: Auto-Regressive (p): AR terms are just lags of dependent variable. For example lets say p is 3, we will use x(t-1), x(t-2) and x(t-3) to predict x(t)
* I: Integrated (d): The number of nonseasonal differences. For example, in our case we take the first-order difference ourselves, so we pass the already-differenced series and set d=0.
* MA: Moving Averages (q): MA terms are lagged forecast errors in prediction equation.
* (p,d,q) is parameters of ARIMA model.
In order to choose p,d,q parameters we will use two different plots.
* Autocorrelation Function (ACF): Measurement of the correlation between time series and lagged version of time series.
* Partial Autocorrelation Function (PACF): This measures the correlation between the time series and a lagged version of the time series, after eliminating the variations already explained by the intervening comparisons.

In [None]:
from statsmodels.tsa.stattools import acf, pacf
lag_acf = acf(ts_diff,nlags=20)
lag_pacf = pacf(ts_diff,nlags=20,method='ols')

# Half-width of the +/- 1.96/sqrt(N) band drawn on both panels.
conf_bound = 1.96/np.sqrt(len(ts_diff))

plt.figure(figsize=(22,10))
# Left panel: ACF, right panel: PACF; both get the zero line and the band.
for position, lag_values, panel_title in (
    (121, lag_acf, 'Autocorrelation Function'),
    (122, lag_pacf, 'Partial Autocorrelation Function'),
):
    plt.subplot(position)
    plt.plot(lag_values)
    for level in (0, -conf_bound, conf_bound):
        plt.axhline(y=level,linestyle='--',color='gray')
    plt.title(panel_title)
plt.tight_layout()

* The two dotted lines are the confidence intervals. We use these lines to determine the ‘p’ and ‘q’ values.
* Choosing p: The lag value where the PACF chart crosses the upper confidence interval for the first time. p=1.
* Choosing q: The lag value where the ACF chart crosses the upper confidence interval for the first time. q=1.
* Now let's use (1,0,1) as the parameters of the ARIMA model and predict.
* ARIMA: from the statsmodels library
* datetime: we will use it for the start and end indexes of the predict method

In [None]:
# ARIMA library
# statsmodels.tsa.arima_model was removed from newer statsmodels releases;
# statsmodels.tsa.arima.model is the maintained implementation.
from statsmodels.tsa.arima.model import ARIMA
# Newer pandas no longer re-exports datetime; use the standard library class.
from datetime import datetime
# Model training: ARIMA with (p, d, q) = (1, 0, 1) fitted on `ts`.
model = ARIMA(ts,order=(1,0,1))
# The maintained ARIMA.fit() has no `disp` argument.
model_fit = model.fit()
# Model testing: predictions between the two dates (inclusive).
intial_index = datetime(1944,6,25)
end_index = datetime(1945,5,31)
forcast = model_fit.predict(start = intial_index,end=end_index)

# Visualization: observed series against the model's predictions.

plt.figure(figsize=(22,10))
plt.plot(weather_bin.Date,weather_bin.MeanTemp,label = "Original")
plt.plot(forcast,label="Predicted")
plt.title("Time Series Forecast")
plt.xlabel("Date")
plt.ylabel("Mean Temperature")
plt.legend()
plt.show()