In [None]:
!pip install pmdarima #step to avoid ModuleNotFoundError when importing libraries


In [None]:
!pip install dtaidistance
!pip install yellowbrick


In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import seaborn as sns
from scipy.stats import pearsonr
import squarify
#ML libraries
from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from math import sqrt
# Lift pandas' display truncation so complete rows and columns are shown.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# Human losses, loaded into their own frame.
ru_losses_pers = pd.read_csv(
    '../input/2022-ukraine-russian-war/russia_losses_personnel.csv'
)

# Equipment losses, loaded into their own frame.
ru_losses_equip = pd.read_csv(
    '../input/2022-ukraine-russian-war/russia_losses_equipment.csv'
)



# loading Dataset


In [None]:
# Working frames for the equipment and personnel loss tables.
df_equipment = pd.read_csv(
    filepath_or_buffer='../input/2022-ukraine-russian-war/russia_losses_equipment.csv'
)
df_personnel = pd.read_csv(
    filepath_or_buffer='../input/2022-ukraine-russian-war/russia_losses_personnel.csv'
)


# Initial data check


In [None]:
# Preview the first five rows of the personnel table.
# (Swap in df_equipment.head() to preview the equipment table instead.)
df_personnel.head(n=5)

# Merging Dataset

In [None]:
# The equipment data has missing values that need treating, and the personnel
# data carries a column of little value: 'personnel*'.

# Parse each frame's OWN 'date' column to datetime for the time-series study.
# (Assigning the equipment dates to the personnel frame would silently
# mis-date personnel rows whenever the two files differ in length or order.)
df_equipment['date'] = pd.to_datetime(df_equipment['date'])
df_personnel['date'] = pd.to_datetime(df_personnel['date'])

# Treat missing values as zero.
df_equipment = df_equipment.fillna(value=0)
df_personnel = df_personnel.fillna(value=0)

# Drop 'personnel*' (adds little value) and the personnel frame's 'day' column
# ahead of the merge. errors='ignore' keeps this cell re-runnable after the
# columns have already been removed.
df_personnel = df_personnel.drop(columns=['personnel*', 'day'], errors='ignore')

# Merge the two datasets on their shared dates.
df_losses = df_equipment.merge(df_personnel, how='inner', on='date')


In [None]:
# 'personnel' is cumulative, so the row-to-row difference gives the
# daily/incremental loss. diff() leaves the first row undefined (NaN); that
# gap is filled with the first cumulative value to set day 1.
df_losses['incremental_personnel_loss'] = (
    df_losses['personnel']
    .diff(periods=1)
    .fillna(df_losses['personnel'][0])
)


In [None]:
# Week number derived from the 'day' counter (integer division by 7), so the
# evolution of the war can be explored on a weekly basis.
df_losses['week'] = df_losses['day'].floordiv(7)


In [None]:

# Preview the cleaned, merged dataset (first five rows).
df_losses.head(n=5)


In [None]:
# Print a concise summary of the merged frame (columns, non-null counts, dtypes).
df_losses.info()

# Lets now check for null fields


In [None]:
# Visual null check: isna() yields a boolean frame that is drawn as a heatmap,
# so any missing cell would stand out against the rest.
import seaborn as sns

plt.figure(figsize=(10, 10))
sns.heatmap(
    df_losses.isna(),
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

In [None]:
# Number of missing values in each column.
df_losses.isna().sum()


* No null values.

# Duplicate values

In [None]:
# Duplicate rows: count them once, then report the count and its share of all rows.
duplicate_count = df_losses.duplicated().sum()
duplicate_share = duplicate_count/len(df_losses)*100

print(f'Duplicates in the dataset: {duplicate_count}')
print(f'Percentage of duplicates: {duplicate_share}%')


* There are no duplicate rows, so nothing needs to be removed.


In [None]:
# Summary statistics, transposed to one row per column and shaded with a
# background colour gradient.
df_losses.describe().transpose().style.background_gradient()
 

In [None]:
# Data types: the dtype pandas holds for each column of the merged frame.
df_losses.dtypes


In [None]:
# Null count per column (evaluated, but only a cell's last expression is displayed).
df_losses.isna().sum()
# Cardinality: number of distinct values in each column.
df_losses.nunique()


# finding correlation


In [None]:
# Heatmap of the pairwise correlations between columns.
print("HEATMAP")
sns.set_context('poster', font_scale=0.5)
plt.figure(figsize=(20,12))
# Correlate numeric columns only: the frame also holds the datetime 'date'
# column, and DataFrame.corr() on pandas >= 2.0 does not drop non-numeric
# columns automatically. `cor` is reused by the following cells.
cor = df_losses.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()


In [None]:
# Absolute correlation of every column with the chosen output variable, 'MRL'.
cor_target = cor["MRL"].abs()
# No threshold is applied at this point (a > 0.5 cut-off is left disabled),
# so every feature is kept.
relevant_features = cor_target
relevant_features


# High Correlation Columns


In [None]:
# Keep only the columns whose absolute correlation with 'MRL' exceeds 0.85.
relevant_features = cor_target.loc[cor_target.gt(0.85)]
relevant_features


# High Correlation Columns Heatmap


In [None]:
# Compact correlation heatmap: one-decimal annotations, no colour bar.
plt.figure(figsize=(20,12))
sns.set_context('poster', font_scale=0.8)
# Numeric columns only: DataFrame.corr() on pandas >= 2.0 does not drop
# non-numeric columns automatically, and the frame holds the datetime 'date' column.
sns.heatmap(
    df_losses.select_dtypes(include='number').corr(),
    annot=True,
    cbar=False,
    cmap='Blues',
    fmt='.1f',
)
# Render the figure explicitly so the cell does not echo the Axes repr.
plt.show()


# Study The Dataset Column Contribution


In [None]:
# Histogram grid showing how the values of each column are distributed.
sns.set_context('poster', font_scale=0.5)
df_losses.hist(
    bins=25,
    grid=False,
    figsize=(25,18),
    color='#86bf91',
    zorder=2,
    rwidth=0.9,
)
plt.show()


In [None]:
# plt.close()
# sns.set_style('whitegrid')
# sns.pairplot(df_losses,hue='Year',height=4);
# plt.show()


# Rescaling the large personnel values for improved visualisation

In [None]:
# Scale the 'personnel' column to 40% of its value (presumably so it does not
# dwarf the other columns when plotted together).
# NOTE(review): this overwrites the column, so every later cell sees the scaled
# values, and re-running this cell scales them again.
df_losses['personnel'] = df_losses['personnel'].mul(0.4)

# Weekly losses

In [None]:

# Bar chart of the per-week mean of every numeric column.
# numeric_only=True keeps non-numeric columns such as 'date' out of the
# aggregation; pandas >= 2.0 no longer drops them by default.
df_losses.groupby('week').mean(numeric_only=True).plot(kind='bar', figsize=(25,15))

In [None]:
import warnings

# Silence every warning category from this point on.
warnings.simplefilter("ignore")


In [None]:
# Density curves for the day counter and the air-related loss columns.
# kdeplot replaces the deprecated distplot(hist=False); each curve carries its
# own label so the legend cannot drift out of sync with what is drawn.
sns.set(rc={'figure.figsize':(20,15)})
for column in ['day', 'aircraft', 'helicopter', 'drone']:
    sns.kdeplot(df_losses[column], label=column)
plt.legend()
plt.show()


In [None]:
# Density curves for the day counter and the land-related loss columns.
# 'anti-aircraft warfare' is drawn once only (a second identical curve and
# legend entry add nothing). kdeplot replaces the deprecated
# distplot(hist=False), and each curve carries its own legend label.
sns.set(rc={'figure.figsize':(20,15)})
for column in ['day', 'tank', 'APC', 'anti-aircraft warfare', 'special equipment']:
    sns.kdeplot(df_losses[column], label=column)
plt.legend()
plt.show()


In [None]:
# Density curves for the day counter, naval ships, cruise missiles and
# anti-aircraft systems. kdeplot replaces the deprecated distplot(hist=False),
# and each curve carries its own legend label.
sns.set(rc={'figure.figsize':(20,15)})
for column in ['day', 'naval ship', 'cruise missiles', 'anti-aircraft warfare']:
    sns.kdeplot(df_losses[column], label=column)
plt.legend()
plt.show()


In [None]:
# df_losses.plot(kind='density', subplots=True, layout=(4,3), sharex=False, 
#                      sharey=False,fontsize=12, figsize=(20,10))


# Total losses by equipment type over time


In [None]:
import plotly.express as px

# Interactive line chart of the 'personnel' column over time.
# NOTE(review): 'personnel' was multiplied by 0.4 in the earlier rescaling
# cell, so this is not the raw cumulative count.
px.line(data_frame=df_losses, x='date', y='personnel')


In [None]:
# Interactive line chart of the 'cruise missiles' column over time.
px.line(data_frame=df_losses, x='date', y='cruise missiles')


In [None]:
# Interactive line chart of the 'aircraft' column over time.
px.line(data_frame=df_losses, x='date', y='aircraft')


In [None]:
# Interactive line chart of the 'helicopter' column over time.
px.line(data_frame=df_losses, x='date', y='helicopter')


In [None]:
# Interactive line chart of the 'tank' column over time.
px.line(data_frame=df_losses, x='date', y='tank')


In [None]:
# Interactive line chart of the 'MRL' column over time.
px.line(data_frame=df_losses, x='date', y='MRL')


In [None]:
# Interactive line chart of the 'drone' column over time.
px.line(data_frame=df_losses, x='date', y='drone')


In [None]:
# Interactive line chart of the 'mobile SRBM system' column over time.
px.line(data_frame=df_losses, x='date', y='mobile SRBM system')


In [None]:
# Interactive line chart of the 'naval ship' column over time.
px.line(data_frame=df_losses, x='date', y='naval ship')


In [None]:
# Interactive line chart of the daily (incremental) personnel losses over time.
px.line(data_frame=df_losses, x='date', y='incremental_personnel_loss')


In [None]:
# Interactive line chart of the 'POW' column over time.
px.line(data_frame=df_losses, x='date', y='POW')


In [None]:
# Interactive line chart of the 'special equipment' column over time.
px.line(data_frame=df_losses, x='date', y='special equipment')


In [None]:
# List the column labels of the merged frame.
df_losses.columns

# All losses

In [None]:
import plotly.graph_objects as go

# One line trace per equipment column, all drawn against the shared date axis.
fig = go.Figure()
for trace_column in (
    'special equipment',
    'naval ship',
    'drone',
    'MRL',
    'tank',
    'aircraft',
    'helicopter',
    'cruise missiles',
    'anti-aircraft warfare',
    'mobile SRBM system',
    'APC',
):
    fig.add_trace(
        go.Scatter(x=df_losses['date'], y=df_losses[trace_column], mode='lines', name=trace_column)
    )

# The figure is the cell's final expression so that the notebook renders it.
fig


# Plotting Cumulative and Incremental Personnel Losses



In [None]:
# Left panel: daily (incremental) losses as points; right panel: the
# 'personnel' series as a line.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

axes[0].plot(df_losses['date'], df_losses['incremental_personnel_loss'], "b.")
# NOTE(review): 'personnel' was multiplied by 0.4 in the earlier rescaling
# cell, so this panel shows the scaled values rather than raw counts.
axes[1].plot(df_losses['date'], df_losses['personnel'])

# Month boundaries as major ticks, the 25th of each month as minor ticks.
for ax in axes:
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_minor_locator(mdates.DayLocator(bymonthday=[25], interval=1, tz=None))
    ax.grid(True)

# Trend line for the daily losses: a cubic polynomial fitted against the
# matplotlib date numbers (converted once and reused).
date_numbers = mdates.date2num(df_losses['date'])
z = np.polyfit(date_numbers, df_losses['incremental_personnel_loss'], 3)
p = np.poly1d(z)
axes[0].plot(date_numbers, p(date_numbers))

axes[0].set_title('Incremental daily Personnel losses', fontsize = 16)
axes[1].set_title('Cumulative Personnel losses', fontsize = 16)


* At first glance, the first days of the war show considerable variance in daily personnel losses, as the data does not seem to have been updated frequently: some days report 0 incremental losses while others overcompensate with very high losses. The incremental daily personnel losses trendline (blue line) indicates that casualties tended to decrease until early April, then picked up slightly again.



In [None]:
fig, ax1 = plt.subplots(figsize=(10, 10*0.618))

# Left axis: spread of the daily personnel losses within each week.
bp = sns.boxplot(
    x="week",
    y="incremental_personnel_loss",
    data=df_losses,
    palette="crest",
    ax=ax1,
)

# Right axis (sharing the same x-axis): the 'personnel' column per week.
# NOTE(review): the box plot places weeks at categorical positions while the
# line uses the numeric week values — confirm the two line up as intended.
ax2 = ax1.twinx()
lp = sns.lineplot(
    x="week",
    y="personnel",
    data=df_losses,
    linewidth=3,
    ax=ax2,
)

ax1.set_xlabel('Weeks since the start of the war')
ax1.set_ylabel('Daily personnel losses')
ax2.set_ylabel('')
ax1.set_title("Despite lacking personnel updates during the third week, \n most personnel losses happened during the first days of the war.", fontsize = 14);

* As the title of the box plot suggests, the earliest period of the war involved the most casualties for the invaders. While weeks 5 to 8 suggest lower casualties, from week 9 onwards casualties have been picking up in pace once again.



# Equipment Losses 


In [None]:
# Air equipment losses over time, one line per column.
# Drawing goes through the explicit Axes object created here rather than
# pyplot's implicit "current axes", and the column names are iterated directly
# (no throwaway DataFrame slice, no unused counter).
fig, ax = plt.subplots(figsize=(10, 10*0.618))

for column in ['aircraft', 'helicopter', 'drone', 'cruise missiles']:
    ax.plot(df_losses['date'], df_losses[column], marker='', linewidth=1, alpha=0.9, label=column)

# Legend in the upper-left corner, two columns wide.
ax.legend(loc=2, ncol=2)

ax.set_title("Air Equipment Losses", fontsize=20)
ax.set_xlabel("Time")
ax.set_ylabel("Losses")

plt.show()


# Plotting Artillery, AA and Support/Logistic vehicles (land)



In [None]:
# Artillery and support/logistic vehicle losses over time, one line per column.
# Drawing goes through the explicit Axes object created here rather than
# pyplot's implicit "current axes", and the column names are iterated directly
# (no throwaway DataFrame slice, no unused counter).
# NOTE(review): the title mentions AA, but 'anti-aircraft warfare' is not among
# the plotted columns — confirm whether it should be added.
fig, ax = plt.subplots(figsize=(10, 10*0.618))

support_columns = [
    'field artillery',
    'MRL',
    'military auto',
    'fuel tank',
    'mobile SRBM system',
    'vehicles and fuel tanks',
]
for column in support_columns:
    ax.plot(df_losses['date'], df_losses[column], marker='', linewidth=1, alpha=0.9, label=column)

# Legend in the upper-left corner, two columns wide.
ax.legend(loc=2, ncol=2)

ax.set_title("Artillery, AA and Support/Logistic vehicles losses", fontsize=20)
ax.set_xlabel("Time")
ax.set_ylabel("Losses")

plt.show()


# Plotting Armoured Vehicles (land)


In [None]:
# Armoured vehicle losses over time, one line per column.
# Drawing goes through the explicit Axes object created here rather than
# pyplot's implicit "current axes", and the column names are iterated directly
# (no throwaway DataFrame slice, no unused counter).
fig, ax = plt.subplots(figsize=(10, 10*0.618))

for column in ['tank', 'APC']:
    ax.plot(df_losses['date'], df_losses[column], marker='', linewidth=1, alpha=0.9, label=column)

# Legend in the upper-left corner, two columns wide.
ax.legend(loc=2, ncol=2)

ax.set_title("Armoured Vehicle losses", fontsize=20)
ax.set_xlabel("Time")
ax.set_ylabel("Losses")

plt.show()


# Plotting Naval (sea)

In [None]:
# Naval losses over time.
# Drawing goes through the explicit Axes object created here rather than
# pyplot's implicit "current axes", and the column names are iterated directly
# (no throwaway DataFrame slice, no unused counter).
fig, ax = plt.subplots(figsize=(10, 10*0.618))

for column in ['naval ship']:
    ax.plot(df_losses['date'], df_losses[column], marker='', linewidth=1, alpha=0.9, label=column)

# Legend in the upper-left corner.
ax.legend(loc=2, ncol=2)

ax.set_title("Naval losses", fontsize=20)
ax.set_xlabel("Time")
ax.set_ylabel("Losses")

plt.show()


# Brief Comparison between Land, Air and Naval losses by absolute numbers



In [None]:
fig, ax = plt.subplots(figsize=(10, 10*0.618))

# Row-wise totals for the air, land (armoured / support) and sea categories.
air_losses = df_losses[['aircraft', 'helicopter', 'drone','cruise missiles']].sum(axis=1)
land_armoured_losses = df_losses[['tank', 'APC']].sum(axis=1)
land_support_losses = df_losses[['field artillery', 'MRL', 'military auto', 'fuel tank','mobile SRBM system', 'vehicles and fuel tanks']].sum(axis=1)
sea_losses = df_losses[['naval ship']].sum(axis=1)

# Last-row total of each category; every value is a length-1 array.
# NOTE(review): the second label mentions AA, but 'anti-aircraft warfare' is
# not part of land_support_losses — confirm whether it should be.
Casualty_dict = {
'Armoured Vehicles' : land_armoured_losses.tail(1).values,
'Artillery, AA and Support/Logistic vehicles' : land_support_losses.tail(1).values,
'Air' : air_losses.tail(1).values,
'Naval' : sea_losses.tail(1).values}
colors = ['foresTgreen','limegreen','tomato','navy']

# One-row data frame holding the same figures.
df = pd.DataFrame(Casualty_dict)

# squarify expects plain numbers: unwrap each length-1 array to a float rather
# than relying on NumPy's deprecated implicit array-to-scalar conversion.
sizes = [float(values[0]) for values in Casualty_dict.values()]

# Draw the treemap on the Axes created above instead of the implicit current one.
squarify.plot(sizes=sizes, label=list(Casualty_dict.keys()), color=colors, alpha=.7, text_kwargs={'fontsize':14}, ax=ax)
ax.axis('off')
ax.set_title("As seen by the shades of green, Land Equipment makes out most of the total equipment losses", fontsize=15)
plt.show()


# Ukraine location

In [None]:
import matplotlib as mpl
import folium as F
from wordcloud import WordCloud, STOPWORDS
from math import log10, floor

# Switch matplotlib to the 'ggplot' style sheet.
# (Uncomment to inspect the installed version and the available styles.)
# print('Matplotlib version: ', mpl.__version__)
# print(plt.style.available)
mpl.style.use('ggplot')


In [None]:
# Coordinates the map is centred on.
ukraine_latitude = 48.383022
ukrain_longitude = 31.1828699
# "Stamen Terrain" is no longer a built-in tile set in recent folium releases
# (it is rejected unless a custom attribution is supplied), so the built-in
# OpenStreetMap tiles are used instead.
ukraine_map = F.Map(location=[ukraine_latitude, ukrain_longitude], tiles="OpenStreetMap", zoom_start=6.49)

# Feature group holding the capital's circle marker. FeatureGroup and
# CircleMarker are taken from folium's top-level namespace rather than from
# its internal submodules.
kiev = F.FeatureGroup()

kiev_latitude = 50.4500336
kiev_longitude = 30.5241361
kiev.add_child(F.CircleMarker([kiev_latitude, kiev_longitude], radius=5, color="red", fill_color="blue"))
ukraine_map.add_child(kiev)

# Pin marker with a popup at the same location.
F.Marker([kiev_latitude, kiev_longitude], popup="Ukraine's Capital").add_to(ukraine_map)

# Final expression so that the notebook renders the map.
ukraine_map
