In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install chart_studio

In [None]:
import seaborn as sns # visualization library
import matplotlib.pyplot as plt # visualization library
import chart_studio.plotly as py # visualization library
from plotly.offline import init_notebook_mode, iplot # plotly offline mode
init_notebook_mode(connected=True) 
import plotly.graph_objs as go # plotly graphical object

import os
print(os.listdir("../input"))
import warnings        
warnings.filterwarnings("ignore")
plt.style.use('ggplot') 

In [None]:
# Load the three raw tables: bombing operations, weather-station locations,
# and the daily weather summary.
df, df_loc, df_weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/world-war-ii/operations.csv',
        '../input/weatherww2/Weather Station Locations.csv',
        '../input/weatherww2/Summary of Weather.csv',
    )
)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
# Keep only the missions whose attacking country is known.
df = df.dropna(subset=['Country'])

In [None]:
# Keep only the missions that have both a target and a take-off longitude.
df = df.dropna(subset=['Target Longitude', 'Takeoff Longitude'])

In [None]:
df.shape

In [None]:
# Columns that are not used anywhere in the analysis below.
drop_columns = ['Mission ID','Unit ID','Target ID','Altitude (Hundreds of Feet)','Airborne Aircraft',
             'Attacking Aircraft', 'Bombing Aircraft', 'Aircraft Returned',
             'Aircraft Failed', 'Aircraft Damaged', 'Aircraft Lost',
             'High Explosives', 'High Explosives Type','Mission Type',
             'High Explosives Weight (Pounds)', 'High Explosives Weight (Tons)',
             'Incendiary Devices', 'Incendiary Devices Type',
             'Incendiary Devices Weight (Pounds)',
             'Incendiary Devices Weight (Tons)', 'Fragmentation Devices',
             'Fragmentation Devices Type', 'Fragmentation Devices Weight (Pounds)',
             'Fragmentation Devices Weight (Tons)', 'Total Weight (Pounds)',
             'Total Weight (Tons)', 'Time Over Target', 'Bomb Damage Assessment','Source ID']

# Rebind instead of dropping in place: `df` is a filtered slice at this point,
# and mutating a slice in place is what triggers SettingWithCopy problems.
df = df.drop(columns=drop_columns)

# Remove the rows carrying two specific take-off coordinate values.
# The columns are addressed by name rather than by position (8 and 9) so the
# filter keeps targeting the take-off coordinates even if the column layout changes.
df = df[df['Takeoff Latitude'] != "4248"]  # drop this takeoff latitude
df = df[df['Takeoff Longitude'] != 1355]   # drop this takeoff longitude

In [None]:
df.head()

In [None]:
# Extract required columns from the other two dataframes

# Station table: identifier, name, country and coordinates.
station_columns = ["WBAN", "NAME", "STATE/COUNTRY ID", "Latitude", "Longitude"]
# Weather table: station identifier, observation date and mean temperature.
weather_columns = ["STA", "Date", "MeanTemp"]

df_loc = df_loc[station_columns]
df_weather = df_weather[weather_columns]

In [None]:
df.info()

In [None]:
df_loc.info()

In [None]:
df_weather.info()

<h2>Exploratory data analysis</h2>

In [None]:
# Number of missions recorded for each attacking country, as a table and as a bar chart.
plt.figure(figsize=(15,10))
print(df['Country'].value_counts())
# The column must be passed as `x=`: a positional Series is deprecated in older
# seaborn releases and is interpreted as `data` (not `x`) in newer ones.
sns.countplot(x=df['Country'])
plt.title('Frequency of each country in the dataset')
plt.show()

In [None]:
# Top target countries
# Compute the ten most frequent target countries once and reuse the result
# for both the printed table and the chart.
sample_df = df['Target Country'].value_counts()[:10]
print(sample_df)
plt.figure(figsize=(15, 10))
sns.barplot(x=sample_df.index, y=sample_df.values)
# Countries are on the x axis and their counts on the y axis
# (the labels were previously swapped).
plt.xlabel('Country')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Producing the result for aircraft series using plotly
# Ten most frequent aircraft series and their mission counts.
sample_df = df['Aircraft Series'].value_counts()[:10]
print(sample_df)
# hoverinfo is 'x+y' so hovering shows the series name and its count;
# 'text' was requested before although no text was supplied to the trace.
data = go.Bar(x=sample_df.index, y=sample_df.values, hoverinfo='x+y', marker=dict(color='rgba(123, 234, 213, 0.5)', line=dict(color='rgb(0,0,0)', width=1.2)))

layout = dict(title='Frequency of aircraft series in dataset')

fig=go.Figure(data=data, layout=layout)
iplot(fig)

Now let us visualize the take-off bases. A country can contain a take-off base of another country. We will visualize each separate country with a color.

In [None]:
df.Country.unique()

In [None]:
# Marker colour used on the maps for each attacking country.
country_colors = {
    "USA": "rgb(0,116,217)",
    "GREAT BRITAIN": "rgb(255,65,54)",
    "NEW ZEALAND": "rgb(133,20,75)",
    "SOUTH AFRICA": "rgb(255,133,27)",
}
# Map in one vectorised step instead of chained `df.color[mask] = ...` assignment,
# which may write to a temporary copy and leave `df` unchanged.
# Countries without an entry keep the empty string, as before.
df = df.assign(color=df['Country'].map(country_colors).fillna(""))

In [None]:
# Defining the plot attributes and marker point attributes
# Hover label for every take-off marker: country, take-off location and base.
takeoff_hover = (
    "Country: " + df.Country
    + " TakeOff Location: " + df['Takeoff Location']
    + " Takeoff Base: " + df['Takeoff Base']
)

# Marker appearance; the fill colour is the per-country colour stored in df['color'].
takeoff_marker = dict(
    sizemode='area',
    sizeref=1,
    size=10,
    line=dict(width=1, color='white'),
    color=df['color'],
    opacity = 0.7,
)

# One scattergeo trace holding all take-off points.
data = dict(
    type='scattergeo',
    lon=df['Takeoff Longitude'],
    lat=df['Takeoff Latitude'],
    hoverinfo='text',
    text=takeoff_hover,
    mode='markers',
    marker=takeoff_marker,
)

# Base-map settings: mercator projection with land, coastlines, countries and lakes drawn.
takeoff_geo = dict(
    showframe=False,
    showland=True,
    showcoastlines=True,
    showcountries=True,
    countrywidth=1,
    projection=dict(type='mercator'),
    landcolor='rgb(200, 200, 200)',
    subunitwidth=1,
    showlakes=True,
    lakecolor='rgb(255, 255, 255)',
    countrycolor='rgb(5, 5, 5)',
)

# Figure layout: title, hover behaviour and the map settings above.
layout = dict(title='Takeoff bases of each country', hovermode='closest', geo=takeoff_geo)

fig = go.Figure(data=data, layout=layout)
# Fixed 800x800 canvas instead of automatic sizing.
fig.update_layout(autosize=False, width=800, height=800)
iplot(fig)

Now we will go for a more complex plot. We will map out the path from the take-off base, to the target location.

In [None]:
df.iloc[0, 8], df.iloc[0, 15]

In [None]:
df[:20]

In [None]:
df.iloc[:, 9]

In [None]:
df.iloc[0, 16]

In [None]:
# Plotting the take-off areas for bombing
bombers = [dict(type='scattergeo',
              lon=df['Takeoff Longitude'],
              lat=df['Takeoff Latitude'],
              hoverinfo='text',
              text = "Country: "+df.Country+" TakeOff Location: "+df['Takeoff Location']+" Takeoff Base: "+df['Takeoff Base'],
              mode='markers', marker=dict(size=5, color=df['color'], line = dict(width=1, color='white')))
          ]

# Plotting the attacked cities
targets = [dict(type='scattergeo',
              lon=df['Target Longitude'],
              lat=df['Target Latitude'],
              hoverinfo='text',
              text='Target country: '+df['Target Country']+' Target City: '+df['Target City'],
              mode='markers',
              marker=dict(
              size=1,
              color='red',
              line=dict(width=0.5, color='red')))
          ]

# Mapping out a path from take-off point to attacked city.
# All segments go into ONE trace: each mission contributes its two end points
# followed by None, and the None entries break the line between missions.
# This replaces one plotly trace per row (slow to build and render, and one
# legend entry per mission) and reads the coordinates by column name instead of
# by position.
path_lon = []
path_lat = []
for takeoff_lon, takeoff_lat, target_lon, target_lat in zip(
        df['Takeoff Longitude'], df['Takeoff Latitude'],
        df['Target Longitude'], df['Target Latitude']):
    # Takeoff longitude -> Target longitude
    path_lon.extend([takeoff_lon, target_lon, None])
    # Takeoff latitude -> Target latitude
    path_lat.extend([takeoff_lat, target_lat, None])

# Kept as a list so it can be concatenated with `bombers` and `targets` below.
path = [dict(
    type='scattergeo',
    lon=path_lon,
    lat=path_lat,
    mode='lines', line=dict(width=0.7, color='black'),
    opacity=0.6
)]

layout = dict(title='Bombing path from Bunker to Target',
             hovermode='closest',
             geo=dict(showframe=False, showland=True, showcoastlines=True,showcountries=True,
                     countrywidth=1, projection=dict(type='mercator'),
                     landcolor='rgb(200, 200, 200)',
                     subunitwidth=1,
                     showlakes=True,
                     lakecolor='rgb(255, 255, 255)',
                     countrycolor='rgb(5, 5, 5)')
             )

# Draw the paths first so the take-off and target markers sit on top of them.
fig = dict(data=path+bombers+targets, layout=layout)
iplot(fig)

In [None]:
# Number of missions per theater of operations, as a table and as a bar chart.
print(df['Theater of Operations'].value_counts())
plt.figure(figsize=(15, 5))
# The column must be passed as `x=`: a positional Series is deprecated in older
# seaborn releases and is interpreted as `data` (not `x`) in newer ones.
sns.countplot(x=df['Theater of Operations'])
plt.title('Frequency of operations')
plt.show()

In [None]:
# Location of weather stations
# Hover label for every station: its name and country identifier.
station_hover = 'Name: ' + df_loc.NAME + " Country: " + df_loc['STATE/COUNTRY ID']

# Uniform blue markers for all stations.
station_marker = dict(
    sizemode='area',
    sizeref=1,
    size=8,
    line=dict(width=1, color='white'),
    color='blue',
    opacity=0.7,
)

# One scattergeo trace holding all weather stations.
data = dict(
    type='scattergeo',
    lon=df_loc.Longitude,
    lat=df_loc.Latitude,
    hoverinfo='text',
    text=station_hover,
    mode='markers',
    marker=station_marker,
)

# Base-map settings: mercator projection with land, coastlines, countries and lakes drawn.
station_geo = dict(
    showframe=False,
    showland=True,
    showcoastlines=True,
    showcountries=True,
    countrywidth=1,
    projection=dict(type='mercator'),
    landcolor='rgb(200, 200, 200)',
    subunitwidth=1,
    showlakes=True,
    lakecolor='rgb(255, 255, 255)',
    countrycolor='rgb(5, 5, 5)',
)

layout = dict(title='Weather Station locations', hovermode='closest', geo=station_geo)

fig = go.Figure(data=data, layout=layout)
# Fixed 800x800 canvas instead of automatic sizing.
fig.update_layout(autosize=False, width=800, height=800)
iplot(fig)

Let us focus on the USA's bombing of Burma. The USA bombed the city of Katha in Burma between 1942 and 1945. The closest weather station to that location is BINDUKURI, which has a temperature record from 1943 to 1945.

Let us visualize the situation, after converting the relevant features into datetime objects

In [None]:
df_loc_id = df_loc[df_loc.NAME=='BINDUKURI'].WBAN
df_loc_id

In [None]:
# Weather records of station 32907 with the Date column parsed to datetimes.
# NOTE(review): 32907 is presumably the WBAN id of BINDUKURI displayed by the
# previous cell - confirm against that output.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of df_weather (SettingWithCopy).
bin_weather = df_weather[df_weather.STA==32907].copy()
bin_weather['Date'] = pd.to_datetime(bin_weather['Date'])

In [None]:
# Line chart of the recorded mean temperature over time for the selected station.
bindukuri_ax = plt.figure(figsize=(15,10)).add_subplot()
bindukuri_ax.plot(bin_weather.Date, bin_weather.MeanTemp)
bindukuri_ax.set_title('Mean temperature for Bindukuri')
bindukuri_ax.set_xlabel('Date')
bindukuri_ax.set_ylabel('Mean temperature')
plt.show()

We can see the temperature measurements from 1943 to 1945. The temperature oscillates between 12 and 32 degrees. The winter-month temperatures are lower than those of the summer months (as expected).

In [None]:
# Reload the raw operations table; the shape is printed after each stage.
dff = pd.read_csv('../input/world-war-ii/operations.csv')
print(dff.shape)

# Parse the mission date once and derive the year and month columns from it.
mission_dates = pd.to_datetime(dff['Mission Date'], format=r'%m/%d/%Y')
dff['Mission Date'] = mission_dates
dff['year'] = mission_dates.dt.year
dff['month'] = mission_dates.dt.month
print(dff.shape)

# Keep missions from 1943 onwards whose month is August or later.
# NOTE(review): the two conditions are independent, so January-July of every
# year is removed as well - confirm this is intended rather than
# "on or after August 1943".
dff = dff[(dff["year"] >= 1943) & (dff["month"] >= 8)]
print(dff.shape)

In [None]:
dff['Mission Date'].isnull().sum()

In [None]:
# Same year/month restriction that was applied when `dff` was built;
# applying it again leaves the frame unchanged.
from_1943 = dff['year'] >= 1943
august_or_later = dff['month'] >= 8
dff = dff[from_1943 & august_or_later]

In [None]:
dff['month']

In [None]:
dff['month'].unique()

In [None]:
dff.shape

In [None]:
# Attacker, target country and target city of the missions to study.
attack = "USA"
target = "BURMA"
city = "KATHA"

# We now have a dataset pertaining to all the dates when USA bombed Katha
is_selected_mission = (
    (dff.Country == attack)
    & (dff['Target Country'] == target)
    & (dff['Target City'] == city)
)
dff_s = dff[is_selected_mission]

In [None]:
dff_s.shape

In [None]:
# Work on an independent copy so adding the 'vals' column below does not write
# into a slice of `dff`.
dff_s = dff_s.copy()

# `l`: for every mission date, the array of mean temperatures recorded on that
# date at the BIN station (closest weather station to Katha).
l = [
    bin_weather.loc[bin_weather.Date == mission_date, 'MeanTemp'].values
    for mission_date in dff_s['Mission Date']
]
dff_s['vals'] = l

# `a`: one temperature per mission (first match for the date). A mission date
# with no weather record yields NaN instead of raising IndexError, so `a` stays
# aligned with dff_s['Mission Date'] for plotting.
a = [temps[0] if len(temps) else np.nan for temps in l]

In [None]:
l[:10]

In [None]:
a[:10]

In [None]:
# Continuous line: the station's mean temperature over time.
temperature_line = dict(
    x=bin_weather.Date,
    y=bin_weather.MeanTemp,
    mode='lines',
    marker=dict(color='rgba(12, 124, 32, 0.5)'),
    name='Mean temperature',
)
trace = go.Scatter(**temperature_line)

# Markers: the temperature on each mission date (values collected in `a`).
bombing_points = dict(
    x=dff_s['Mission Date'],
    y=a,
    mode='markers',
    marker=dict(color='rgba(123,43,1, 0.9)'),
    name='Bombing temperature',
)
trace1 = go.Scatter(**bombing_points)

# Both traces share one figure so the bombings can be read against the temperature curve.
data = [trace, trace1]
layout = dict(title='Mean temperatures at the weather temperature alongside bombings')
fig = dict(data=data, layout=layout)
iplot(fig)

<h2>Time series predictions</h2>

We will be using the ARIMA model - AutoRegressive Integrated Moving average.

**What is a time series** - It is a collection of data points collected at constant time intervals, so the values are time dependent. Most time series show some form of seasonality - e.g. ice cream sales will be higher in summer than in winter.

**Stationarity of time series** - There are three basic criteria for deciding whether a time series is stationary or not.

* Constant mean
* Constant variance
* Auto-covariance does not depend on time. It is covariance between time series and lagged time series.

We can visualize and check seasonality trends of our time series

In [None]:
# Plot the raw temperature series to inspect it for seasonal patterns.
series_ax = plt.figure(figsize=(22, 10)).add_subplot()
series_ax.plot(bin_weather.Date, bin_weather.MeanTemp)
series_ax.set_title("Mean temperature for the Bindukuri Area")
series_ax.set_xlabel("Date")
series_ax.set_ylabel("Mean temperature")
plt.show()

# `tS`: Date and MeanTemp indexed by Date (the Date column is kept as well).
# `tSeries`: the same frame reduced to the single MeanTemp column.
tS = bin_weather[['Date', 'MeanTemp']].set_index('Date', drop=False)
tSeries = tS[['MeanTemp']]

Through the plot, we can see that the time series has seasonal variation. The mean is higher in the summer while lower in the winters. We will now check the stationarity using the following methods

* **Plotting rolling-window stats** - We have a window with say window size 6, we find the rolling mean and variance to check stationary.

* **Dickey-Fuller test** - The test consists of a test statistic and critical values for different confidence levels. If the test statistic is less than the critical value, we say that the time series is stationary.

In [None]:
from statsmodels.tsa.stattools import adfuller
def adf_check(ts):
    """Run the augmented Dickey-Fuller test on ``ts`` and print its key figures.

    The lag length is selected with ``autolag='AIC'``. Three entries of the
    result are printed: the test statistic, the p-value and the critical values.
    Nothing is returned.
    """
    adf_result = adfuller(ts, autolag='AIC')
    # (label, position in the adfuller result) for each figure that is reported.
    reported = (('Test statistic', 0), ('P-value', 1), ('Critical Value', 4))
    for label, position in reported:
        print(label, adf_result[position])
    
def mean_std(ts, window=6):
    """Plot ``ts`` together with its rolling mean and rolling standard deviation.

    Parameters
    ----------
    ts : pandas Series or DataFrame
        The time series to inspect; it is plotted against its own index.
    window : int, default 6
        Number of observations in each rolling window. The default keeps the
        previously hard-coded window size.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    rolling_mean = ts.rolling(window).mean()
    rolling_std = ts.rolling(window).std()
    plt.figure(figsize=(22, 10))
    plt.plot(ts, color='red', label='Time series')
    plt.plot(rolling_mean, color='Black', label='Rolling mean')
    plt.plot(rolling_std, color='Green', label='Rolling STD')
    plt.xlabel('Date')
    plt.ylabel('Mean temperature')
    plt.title('Rolling mean and standard deviation')
    plt.legend()
    plt.show()
    
# Stationarity checks on the raw series: print the Dickey-Fuller figures,
# then plot the rolling mean and standard deviation.
adf_check(tSeries)
# Our index is the date
mean_std(tSeries.MeanTemp)

Now let us go through our results in detail

* Our first criterion for stationarity is a constant mean. We can see from above that our mean shifts over a wide range. It is not constant.

* The second criterion is constant variance. The variance of our series looks nearly constant (we can assume it to be).

* The third criterion is that our test statistic is lower than the critical value, as is the principle for hypothesis testing. Our test statistic is bigger than all of the aforementioned critical values (from 1% to 10%), therefore the series is not stationary.

All these point towards our time series being non-stationary, we will convert it to be so.

<h2>Converting a time series to be stationary</h2>

There are two major reasons behind non-stationarity of time series

* Trend - Varying mean over time. We need a constant mean.
* Seasonality - Variations at a specific time, we need constant variations (i.e. constant std) for stationarity of time series.


To solve the constant mean problem - We can take moving averages. We have a window that takes the average over the past x samples. Here x is our window size.

In [None]:
# Rolling mean of the series over a window of 6 observations.
moving_avg_ts = tSeries.rolling(6).mean()

# Draw the original series and its rolling mean on the same axes.
plt.figure(figsize=(15, 10))
curves = (
    (tSeries, 'red', 'Time series'),
    (moving_avg_ts, 'Black', 'Rolling mean'),
)
for curve, curve_color, curve_label in curves:
    plt.plot(curve, color=curve_color, label=curve_label)
plt.xlabel('Date')
plt.ylabel('Mean temperature')
plt.title('Mean Temp and Rolling mean')
plt.legend()
plt.show()

In [None]:
# Subtract the rolling mean from the series; the leading rows, where the
# rolling window is not yet full, come out as NaN and are dropped.
moving_avg_ts_diff = (tSeries - moving_avg_ts).dropna()

# Re-run both stationarity checks on the mean-adjusted series.
adf_check(moving_avg_ts_diff)
mean_std(moving_avg_ts_diff)

We can see that the mean looks pretty constant now. The variance is also in the same boat.
The test statistic is now less than the 1% critical value, implying that we can say with 99% confidence that the given series is stationary.

We have achieved stationarity with our time series. We also have one more method to avoid trend and seasonality that is the **Differencing method**. It takes a difference between a time series and a shifted time series.

In [None]:
# First-order difference: each value minus the value one step earlier.
tSeries_diff = tSeries.diff()

diff_ax = plt.figure(figsize=(15, 10)).add_subplot()
diff_ax.plot(tSeries_diff)
diff_ax.set_title('Differencing the time series')
diff_ax.set_xlabel('Date')
diff_ax.set_ylabel('Differencing mean temp')
plt.show()

In [None]:
# The first row has no predecessor, so its difference is NaN; remove it.
tSeries_diff = tSeries_diff.dropna() # first value
# Re-run both stationarity checks on the differenced series.
adf_check(tSeries_diff)
mean_std(tSeries_diff.MeanTemp)

We can see that our mean value is approximately constant, as is the variance. The test statistic is similar to the one obtained above, so we can say with 99% confidence that we have a stationary series.

<h2>Forecasting a time series</h2>

Now that we've covered two different techniques to avoid the trend and seasonality problem, we will look at modeling our predictions. We will be using the time series created by the differencing method.


**The ARIMA model** i.e. the auto-regressive integrated moving averages model consists of the following parts:
* AR: Auto-regressive(p): These terms are just lags of dependent variables. Let us say p=3, then we will use x(t-1), x(t-2) and x(t-3) i.e. three terms to predict x(t)
* I: Integrated (d): Number of non-seasonal differences. If we take the first-order difference ourselves and pass that differenced series to the model, we set d=0.
* MA: Moving averages (q): MA terms are the lagged forecast errors in the prediction equation.

**P, D and Q are parameters of the ARIMA model**. To choose these parameters, we will use two different plots:
* Autocorrelation function (ACF): Measurement of correlation between time series and lagged version of time series
* Partial autocorrelation function (PACF): Measurement of correlation between time series and lagged versions of time series, but after eliminating the variations already explained by the intervening comparisons.

In [None]:
from statsmodels.tsa.stattools import acf, pacf
# Autocorrelation and partial autocorrelation of the differenced series for lags 0..20.
lag_count = 20
acf_lag = acf(tSeries_diff, nlags=lag_count)
pacf_lag = pacf(tSeries_diff, nlags=lag_count, method='ols')

# Half-width of the band drawn around zero: 1.96 / sqrt(number of observations).
band = 1.96/np.sqrt(len(tSeries_diff))

plt.figure(figsize=(22, 10))
# (subplot position, values to draw, panel title) for the two side-by-side panels.
panels = (
    (121, acf_lag, 'Autocorrelation function'),
    (122, pacf_lag, 'Partial autocorrelation function'),
)
for position, lag_values, panel_title in panels:
    plt.subplot(position)
    plt.plot(lag_values)
    # Dashed guides at zero and at the lower and upper band limits.
    for level in (0, -band, band):
        plt.axhline(y=level, linestyle='--', color='gray')
    plt.title(panel_title)
plt.tight_layout()
plt.show()

We have two dotted lines that mark the confidence interval. We use them to determine the P and Q values:
* P - The lag value where the PACF chart crosses the upper confidence interval for the first time: p=1

* Q - The lag value where the ACF chart crosses the upper confidence interval for the first time: q=1

Now we will use the configuration (1, 0, 1) as the parameters of the ARIMA model and predict.

In [None]:
from statsmodels.tsa.arima_model import ARIMA
from pandas import datetime

# fit model
model = ARIMA(tSeries, order=(1,0,1)) # (ARMA) = (1,0,1)
model_fit = model.fit(disp=0)

start_index = datetime(1944, 6, 25)
end_index = datetime(1945, 5, 31)
forecast = model_fit.predict(start=start_index, end=end_index)

# visualization
plt.figure(figsize=(22,10))
plt.plot(bin_weather.Date,bin_weather.MeanTemp,label = "original")
plt.plot(forecast,label = "predicted")
plt.title("Time Series Forecast")
plt.xlabel("Date")
plt.ylabel("Mean Temperature")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
# fit model
model2 = ARIMA(tSeries, order=(1,0,1)) # (ARMA) = (1,0,1)
model_fit2 = model2.fit(disp=0)
forecast2 = model_fit2.predict()
error = mean_squared_error(tSeries, forecast2)
print("error: " ,error)
# visualization
plt.figure(figsize=(22,10))
plt.plot(bin_weather.Date,bin_weather.MeanTemp,label = "original")
plt.plot(forecast2,label = "predicted")
plt.title("Time Series Forecast")
plt.xlabel("Date")
plt.ylabel("Mean Temperature")
plt.legend()
plt.savefig('graph.png')

plt.show()