#### Let's dig in!

#### Give me a thumbs up if you liked it! And I'm always open to suggestions!

##### Some key insights from this dataset are as follows:
    
   * The highest number of deaths occurred in the 3rd week of the month!
   * Drowning is the primary cause of death.
   * African countries are the most affected in terms of number of deaths.
   * Syria and Mexico are the most affected countries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

%matplotlib inline

In [None]:
# Load the Missing Migrants Project records; the file is decoded as cp437.
data = pd.read_csv(
    "../input/MissingMigrantsProject.csv",
    encoding="cp437",
)
# Preview the first rows.
data.head()

#### Data sanity check. 

In [None]:
# Number of missing values per column (isna is the same check as isnull).
data.isna().sum()

#### Creating date features.

In [None]:
# Split the incident date into day / month / year columns.
# The 'date' column is parsed once and the resulting index is reused.
incident_dates = pd.DatetimeIndex(data['date'])
data['date_day'] = incident_dates.day
data['date_month'] = incident_dates.month
data['date_year'] = incident_dates.year

# Number of records per day of the month, most frequent day first.
data['date_day'].value_counts().to_frame().plot.bar()

##### Inference: There doesn't seem to be a strong pattern here, but people seem to move more around the 17th, 18th, 19th and 20th.

### Can we find any pattern in day of the week?

In [None]:
# Day of the week encoded as an integer: Monday = 0, Tuesday = 1, and so on.
weekday_index = pd.DatetimeIndex(data['date']).dayofweek
data['date_dayofweek'] = weekday_index

In [None]:
# Records per day of the week, most frequent first.
data['date_dayofweek'].value_counts().plot.bar()

### Can we find any pattern in months?

In [None]:
# Records per calendar month, most frequent first.
data['date_month'].value_counts().to_frame().plot.bar()

In [None]:
# How many rows have no day-of-month, and which five days occur most often?
day_of_month = data['date_day']
print(day_of_month.isnull().sum())
print(day_of_month.value_counts().head(5))




In [None]:
# Impute missing day-of-month values with 20, then confirm none are left.
# NOTE(review): ceil(20 / 7) == 3, so every imputed row lands in week 3 of the
# week-number analysis that follows — confirm this does not skew that result.
filled_days = data['date_day'].fillna(20.0)
data['date_day'] = filled_days
print(data['date_day'].isnull().sum())

### Can we find any pattern in week number?

In [None]:
import math

# Week of the month from the day of the month: ceil(day / 7),
# i.e. days 1-7 -> 1, 8-14 -> 2, and so on.
data['week_number'] = data['date_day'].map(lambda day: math.ceil(day / 7))
# Records per week of the month, most frequent first.
data['week_number'].value_counts().plot.bar()

#### Inference: Looks like we found one. It looks like week 3 is when people move more to cross borders. 

## Cause of Death and substitution. 

In [None]:
# Normalise the cause-of-death text before grouping it:
# rows without a cause are labelled "Drowning", then everything is lower-cased.
# NOTE(review): this adds every unlabelled row to the drowning count — confirm
# that is the intended assumption.
cause_text = data['cause_of_death'].fillna("Drowning")
data['cause_of_death'] = cause_text.str.lower()
# Distinct labels after normalisation.
data['cause_of_death'].unique()

In [None]:
# Frequency of each normalised cause, most common first.
cause_frequency = data['cause_of_death'].value_counts()
cause_frequency

#### That's a lot of classes. Let's reduce them. 

In [None]:
#Lets lessen the classes for ease. 
import re

def deathcause_replacement(frame=None):
    """Collapse the free-text ``cause_of_death`` values into a few broad labels.

    Each rule is a regex searched anywhere in the current value
    (``Series.str.contains``); every matching row has its whole value replaced
    by the rule's label. Rules run in order and see the output of earlier
    rules, so an early rule takes priority over later ones, and a label
    written by one rule can be re-mapped by a later one (``'sickness'`` ends
    up as ``'health condition'``, ``'harsh_weather'`` as ``'harsh conditions'``).

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame whose ``cause_of_death`` column is rewritten in place. When
        omitted, the notebook-level ``data`` frame is used. Patterns are
        lower-case and matching is case-sensitive, so values should already
        be lower-cased.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if frame is None:
        frame = data  # notebook-level frame (original behaviour)

    # NOTE(review): patterns match substrings, so short alternatives such as
    # 'car', 'bus' or 'van' also match inside longer words — confirm against
    # the actual values in the dataset.
    rules = (
        ('sickness', 'sickness'),
        ('harsh_weather', 'harsh_weather'),
        ('unknown|unknow|north africa', 'unknown'),
        ('starvation|dehydration', 'starvation'),
        ('drowning|pulmonary|respiratory|lung|bronchial|pneumonia', 'drowning'),
        # Hyperthermia (overheating) and hypothermia (cold) are different
        # causes, so each keeps its own label.
        ('hyperthermia', 'hyperthermia'),
        ('hypothermia', 'hypothermia'),
        ('asphyxiation|suffocation', 'asphyxiation'),
        ('train|bus|vehicle|truck|boat|car|road|van', 'vehicle accident'),
        # Must run before the murder rule: that rule matches 'death' and would
        # otherwise relabel 'crushed to death' as murder.
        ('crushed to death|crush', 'crushed'),
        ('murder|stab|shot|violent|blunt force|violence|beat-up|fight|murdured|death', 'murder'),
        ('harsh conditions|harsh_weather', 'harsh conditions'),
        ('diabetic|heart attack|sickness|meningitis|virus|cancer|bleeding|insuline|inhalation', 'health condition'),
        ('electrocution', 'electrocution'),
    )
    for pattern, label in rules:
        # na=False: rows with a missing cause never match and are left as-is,
        # instead of producing a NaN mask that .loc rejects.
        matches = frame['cause_of_death'].str.contains(pattern, na=False)
        frame.loc[matches, 'cause_of_death'] = label

In [None]:
# Apply the grouping rules to `data` in place, then list the labels that remain.
deathcause_replacement()

remaining_labels = data['cause_of_death'].unique()
remaining_labels

In [None]:
# Records per grouped cause of death, most common first.
data['cause_of_death'].value_counts().plot.bar()

#### Inference: Looks like drowning is the primary reason for death. 

In [None]:
# Table of causes and how many records carry each one.
inspect = (
    data['cause_of_death']
    .value_counts()
    .rename_axis('cause_of_death')
    .reset_index(name='death_count')
)

# Causes that appear in more than 5 records.
is_frequent = inspect['death_count'] > 5
name_list = inspect.loc[is_frequent, 'cause_of_death'].tolist()

In [None]:
# Keep only the rows whose cause is one of the labels in name_list.
# .copy() makes the result an independent frame, so the column assignments
# made on `data` in later cells do not trigger SettingWithCopyWarning.
data = data.loc[data['cause_of_death'].isin(name_list)].copy()

In [None]:
# Records per cause after dropping the rare causes.
data['cause_of_death'].value_counts().plot.bar(title = "Reason for death")

## Can we do something with latitude and longitude values?

In [None]:
# Mean latitude and longitude of the records.
[data['lat'].mean(), data['lon'].mean()]

In [None]:
# Replace missing coordinates with the mean of each column (longitude first,
# then latitude).
# NOTE(review): rows without coordinates all end up at the same mean point —
# confirm that is acceptable for the plot that follows.
for coord in ('lon', 'lat'):
    data[coord] = data[coord].fillna(data[coord].mean())

In [None]:
# seaborn 0.9 renamed factorplot() to catplot(), and current releases no longer
# provide the old name; use whichever this seaborn version has.
categorical_plot = getattr(sns, "catplot", None) or sns.factorplot
# NOTE(review): presumably this treats each distinct latitude as a category on
# the x axis; a numeric scatter of lon vs lat may be closer to a map — confirm.
categorical_plot(x = "lat", y = "lon", hue = "cause_of_death", kind = "swarm", data = data)

#### I tried to plot them on a map. But I wasn't able to. Would love your suggestions here. 

## Which regions are most affected?

In [None]:
# Records per region of origin, most frequent first.
data['region_origin'].value_counts().plot.bar()

In [None]:
# Records per region where the incident happened, most frequent first.
data['incident_region'].value_counts().plot.bar()

In [None]:
# Total number of dead per region of origin.
data.groupby('region_origin')['dead'].sum().to_frame().plot.bar()

### I see a lot of different variants of Africa. Let's combine all of them!

In [None]:
# List the distinct origin labels before consolidating them.
origin_labels = data['region_origin'].unique()
origin_labels

In [None]:
# Label records that have no region of origin as 'Africa'.
# NOTE(review): this counts every record of unknown origin as African, which
# raises Africa's totals in the aggregations that follow — confirm intended.
origin_filled = data['region_origin'].fillna('Africa')
data['region_origin'] = origin_filled

In [None]:
# Merge every origin label that mentions 'Africa' into a single 'Africa' label.
is_african = data['region_origin'].str.contains('Africa')
data.loc[is_african, 'region_origin'] = 'Africa'

# Total number of dead per consolidated region of origin.
dead_by_origin = data.groupby('region_origin')['dead'].sum()
dead_by_origin.to_frame().plot.bar()

In [None]:
# The same totals as a table: dead summed per region of origin.
deaths_per_origin = data.groupby('region_origin')['dead'].sum()
deaths_per_origin.to_frame()


#### *That's staggering! 60% of all the deaths in this dataset were of people from Africa!*

### Assuming missing people are dead!

In [None]:
# Ten most common values of the 'missing' column, before combining it with 'dead'.
data['missing'].value_counts().iloc[:10]

In [None]:
# Count a record with no 'missing' figure as one missing person, then add the
# dead to get a combined toll per record.
# NOTE(review): if 'dead' has missing values, the sum is NaN for those rows —
# confirm whether 'dead' needs filling as well.
data['missing'] = data['missing'].fillna(1)
data['missing_and_dead'] = data['missing'].add(data['dead'])

# Ten most common combined tolls.
data['missing_and_dead'].value_counts().iloc[:10]

In [None]:
# Combined missing-and-dead total per region of origin.
data.groupby('region_origin')['missing_and_dead'].sum().to_frame().plot.bar(stacked = True)

### Which country is more affected?

In [None]:
# The 15 nationalities that appear in the most records.
data['affected_nationality'].value_counts().iloc[:15].plot.bar()