In [None]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
# List every file available under the Kaggle input directory, one path per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

### This is my first attempt at using some of my coding skills for data visualisation and analysis. Migration is an area I am very interested in and I hope the visualisations below make sense!

# Overview of Data 

In [None]:
# Load the Missing Migrants CSV from the Kaggle input directory into a DataFrame.
MISSING_MIGRANTS_CSV = '/kaggle/input/missing-migrants-project/MissingMigrants-Global-2019-12-31_correct.csv'
missing_migrant = pd.read_csv(MISSING_MIGRANTS_CSV)

### 1. All the columns and types of information that are present in the dataset.

In [None]:
# Print each column's name, non-null count and dtype for the loaded frame.
missing_migrant.info()

In [None]:
# Preview the first rows to see what a single record looks like.
missing_migrant.head()

In [None]:
# Summary statistics; with no arguments this covers only the numeric columns.
missing_migrant.describe()

### To obtain a summary of the objects which are left out of the previous describe function (which only includes floats and integers), run the code below as well. 

In [None]:
# Summarise the object (string) columns that the default describe() skips.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
missing_migrant.describe(include=object)

## 2. Overview Graphs  

### Time period in which data was recorded for these graphs


In [None]:
# Reported dates of the first rows, in file order.
# NOTE(review): reading the covered period off head()/tail() assumes the file
# is sorted by date — confirm, or use min()/max() on parsed dates.
missing_migrant['Reported Date'].head()

In [None]:
# Reported dates of the last rows, in file order (the other end of the period).
missing_migrant['Reported Date'].tail()

### This shows that the time period was from January 6, 2014 to December 30, 2019.

## Number of incidents in each region

In [None]:
# Fig 2.1: one bar per region, bar height = number of recorded incidents (rows).
fig, ax = plt.subplots()
sns.countplot(data=missing_migrant, x="Region of Incident", ax=ax)
ax.set_title("Fig 2.1: Number of incidents recorded in each region", size=14)
# Region names are long, so turn the x tick labels vertical.
ax.tick_params(axis="x", labelrotation=90)
plt.show()

### The region with the **highest number of incidents is the US-Mexico Border**. Incidents have been reported in 15 unique regions. Fig 2.1 above shows the number of incidents in each region between **January 06, 2014 and December 30, 2019**.

## Number Dead by regions

In [None]:
# Fig 2.2: one point per incident, placed by region and number dead.
deaths_scatter = sns.relplot(
    data=missing_migrant,
    kind="scatter",
    x="Region of Incident",
    y="Number Dead",
    hue="Region of Incident",
)
# relplot without row/col facets draws a single Axes, exposed as `.ax`.
deaths_scatter.ax.set_title("Fig 2.2: Number Dead per incident in each region", size=14)
deaths_scatter.ax.tick_params(axis="x", labelrotation=90)
plt.show()

### Fig 2.2 shows us the number of dead per incident in each region. From this scatter graph, we can see that there was one single incident in the Mediterranean that resulted in the death of over 700 migrants. The graph also highlights that there were single incidents in North America and Central Asia which were not captured in Fig 2.1.

## Number of Dead in each region of incident per year

In [None]:
# Fig 2.3: one bar chart per reported year, three panels per row.
# ci=None switches off the error bars. No estimator is passed, so each bar is
# seaborn's default aggregate: the mean "Number Dead" per incident in the region.
g = sns.catplot(
    data=missing_migrant,
    kind="bar",
    x="Region of Incident",
    y="Number Dead",
    col="Reported Year",
    col_wrap=3,
    ci=None,
)
g.set_xticklabels(rotation=90)
g.fig.suptitle('Fig 2.3 Number of Dead in each region of incident per year')
# Leave headroom above the panels so the figure title does not overlap them.
g.fig.subplots_adjust(top=0.9)

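### Fig 2.3 shows, for each year, the number of dead in each region of incident. Note that each bar is the mean number dead per recorded incident in that region (seaborn's default bar estimator), not the yearly total.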

## Number Dead according to age and gender

In [None]:
# Sum the "Number of Children" column per region of incident. The result has a
# single row ("Number of Children") and one column per region; regions with no
# value are filled with 0.
# aggfunc is the string "sum" rather than the builtin `sum`: pandas 2.1+ emits a
# FutureWarning for the builtin and will later call it directly, which would
# stop skipping NaN values. The string keeps pandas' own NaN-skipping sum.
deaths_children = missing_migrant.pivot_table(
    values=["Number of Children"],
    columns="Region of Incident",
    aggfunc="sum",
    fill_value=0,
)
deaths_children

In [None]:
# Fig 2.4: bar chart of the per-region child totals computed above.
children_ax = deaths_children.plot.bar(figsize=(5, 5))
children_ax.set_ylabel("Total Number Dead")
# Anchor the legend just outside the right edge so it does not cover the bars.
children_ax.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0), ncol=1)
children_ax.set_title("Fig 2.4 Total number children dead in each region")

# Font sizes shared by the plots in this notebook.
SMALL_SIZE = 10
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

# rcParams are global matplotlib settings, so they stay in effect for the rest
# of the kernel session. They are set here, after the plot call above.
plt.rcParams.update({
    'font.size': MEDIUM_SIZE,         # default text size
    'axes.titlesize': BIGGER_SIZE,    # axes title
    'axes.labelsize': MEDIUM_SIZE,    # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,    # x tick labels
    'ytick.labelsize': SMALL_SIZE,    # y tick labels
    'legend.fontsize': SMALL_SIZE,    # legend entries
    'figure.titlesize': BIGGER_SIZE,  # figure-level title
})




In [None]:
# Show the first rows again as a reminder of the available columns.
missing_migrant.head()


In [None]:
# Reshape the female/male count columns into long format (one row per incident
# and gender), treat missing counts as 0, then total them per region and gender.
deaths_by_gender = (
    missing_migrant
    .melt(
        id_vars=["Region of Incident"],
        value_vars=["Number of Females", "Number of Males"],
        var_name="Gender",
        value_name="Number of deaths",
    )
    .fillna(0)
    .groupby(["Region of Incident", "Gender"])
    .sum()
    .reset_index()
)
deaths_by_gender

In [None]:
# Fig 2.5: horizontal bars of the per-region totals, split by gender.
gender_ax = sns.barplot(
    data=deaths_by_gender,
    x="Number of deaths",
    y="Region of Incident",
    hue="Gender",
)
gender_ax.set_title("Fig 2.5 Total Number Dead by gender in each region")
gender_ax.tick_params(axis="x", labelrotation=90)
plt.show()

### Fig 2.5 shows that there were more males dead than females in each region, with the exception of Southeast Asia, where there were more female deaths. 

## Causes of Death 

In [None]:
# count / unique / top / freq summary of the raw cause-of-death strings.
missing_migrant["Cause of Death"].describe()

### From the describe function, we can determine that the **top recorded cause of death is drowning** and there are **266 unique causes of deaths**. In order to be able to plot the different causes of death, we will need to group similar causes together. 


In [None]:
# Keep only rows whose cause is exactly "Drowning" and count the non-null
# values in each column. The comparison is case-sensitive, so this cell must run
# before the cell that lower-cases "Cause of Death"; afterwards it matches nothing.
missing_migrant.loc[missing_migrant["Cause of Death"]=="Drowning"].count()

The number of recorded incidents with drowning as the cause of death is 1151.

In [None]:
# Lower-case the causes so the keyword grouping further down is not split by
# capitalisation, then list the distinct values that remain.
# This overwrites the original column in place: earlier cells that compare
# against capitalised values (e.g. == "Drowning") no longer match if re-run.
missing_migrant["Cause of Death"]=missing_migrant["Cause of Death"].str.lower()
missing_migrant["Cause of Death"].unique()

### With this many unique causes of death, we will need to group similar causes into clusters in order to be able to visualise them.

In [None]:
# Ordered (label, keywords) rules used by compute_clustered_deaths.
# Order matters: a cause is given the label of the FIRST rule with a matching
# keyword (e.g. "crushed by bus" is Suffocation, not Accident).
# Keywords are regex alternatives matched as substrings, ignoring case.
# Misspelled keywords ("phyiscal", "dehyration", "hippoptamus") are kept as
# written — presumably they mirror typos in the raw data; verify before fixing.
# NOTE(review): the label "Dehyrdation" is itself misspelled ("Dehydration").
# It is left unchanged because it appears in downstream tables and figures,
# and other cells may key on it.
_CAUSE_OF_DEATH_RULES = (
    ("Starvation", ("starvation", "malnutrition")),
    ("Violence", (
        "murdered", "violence", "shot", "stabbed", "excessive phyiscal abuse",
        "sexual abuse", "rape", "attacked by apache helicopter",
        "killed by mortar shell", "air strike", "killed by landmine blast",
        "artillery shells",
    )),
    ("Suffocation", ("crushed", "drowning", "asphyxiation", "suffocation")),
    ("Harsh Conditions", (
        "hypothermia", "hyperthermia", "heat stroke", "harsh conditions",
        "lightning", "exposure", "harsh weather", "exhaustion",
    )),
    ("Others", ("unknown", "mixed", "gassed", "rockslide")),
    ("Accident", (
        "vehicle", "fall", "fire", "collision", "train", "bus", "electrocution",
        "hit by car", "fell from truck", "hit by truck", "plane ",
    )),
    ("Health Conditions", (
        "bacterial infections", "head injury", "cardiac arrest", "pneumonia",
        "organ failure", "sickness", "hypoglycemia", "post-partum complications",
        "seizure", "respiratory illness", "pulmonary edema", "cervical cancer",
        "heart attack", "coronary artery atherosclerosis",
    )),
    ("Dehyrdation", ("dehydration", "dehyration")),
    ("Killed by animals", ("crocodile", "hippopotamus", "hippoptamus", "envenomation")),
    ("Mental Health", ("suicide", "hanging")),
    ("Lack of access to medicine", ("lack of access to medicine",)),
    ("Burns", ("fuel burns", "burned")),
)


def compute_clustered_deaths(missing_migrant):
    """Collapse the free-text "Cause of Death" values into a few broad groups.

    Each cause is compared against the keyword rules above and replaced by the
    label of the first rule that matches. Causes that match no rule, and
    missing values, are left untouched.

    Parameters
    ----------
    missing_migrant : pandas.DataFrame
        Frame with a string-valued "Cause of Death" column. It is modified
        IN PLACE; later cells rely on that.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with "Cause of Death" relabelled.

    Notes
    -----
    Matching ignores case, so the result does not depend on the column having
    been lower-cased first, and calling the function twice gives the same
    result as calling it once.
    """
    column = "Cause of Death"

    # Match against a snapshot of the original text so labels written by one
    # rule are never re-examined by a later rule.
    raw_causes = missing_migrant[column].copy()

    # Rows still waiting for a label; missing causes are never labelled.
    unlabelled = raw_causes.notna()

    for label, keywords in _CAUSE_OF_DEATH_RULES:
        pattern = "|".join(keywords)
        # na=False keeps the mask strictly boolean; without it a missing cause
        # yields NaN in the mask and .loc raises ValueError.
        matched = raw_causes.str.contains(pattern, case=False, na=False).astype(bool)
        to_label = matched & unlabelled
        missing_migrant.loc[to_label, column] = label
        unlabelled = unlabelled & ~to_label

    return missing_migrant


In [None]:
# compute_clustered_deaths relabels its argument in place and returns it, so
# clustered_death and missing_migrant refer to the same, now-clustered DataFrame.
clustered_death=compute_clustered_deaths(missing_migrant)

In [None]:
# Number of rows (incidents) per clustered cause, most frequent first.
clustered_death["Cause of Death"].value_counts()

In [None]:
# Fig 2.6: incidents per clustered cause, bars ordered from most to least frequent.
cause_order = missing_migrant["Cause of Death"].value_counts().index
cause_ax = sns.countplot(data=missing_migrant, x="Cause of Death", order=cause_order)
cause_ax.set_title("Fig 2.6: Causes of death recorded", size=14)
cause_ax.tick_params(axis="x", labelrotation=90)
plt.show()

### Fig 2.6 shows the causes of death after grouping similar causes together. Suffocation was the most frequently recorded cause of death. The Suffocation group includes the reported causes that mention drowning, crushing, asphyxiation or suffocation. Please refer to the groupings above for the rest of the groups. 