# US Veteran data analysis report

This report is done mostly with matplotlib rather than a higher-level plotting library.

In [None]:
# library imports

%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Getting the data ready

In [None]:
# Location of the yearly CSV files and the years covered by the report.
file_path = '../input/us-veteran-suicides/'
year_range = list(range(2005, 2012, 1))

# Read every yearly file, tag its rows with the year, and combine them in a
# single pd.concat call. DataFrame.append was removed in pandas 2.0, and
# growing a frame inside a loop copies all accumulated rows on each iteration.
yearly_frames = []
for year in year_range:
    yearly = pd.read_csv(os.path.join(file_path, str(year) + '.csv'))
    yearly['year'] = year
    yearly_frames.append(yearly)

# Each file keeps its own row index, as before; 'Unnamed: 0' is dropped
# because it is not used anywhere in the analysis.
df = pd.concat(yearly_frames).drop(columns='Unnamed: 0')

Drop every column that contains `NA` values, and also the male/female percentage columns (`vet_males_p`, `vet_females_p`), which this report does not use.

In [None]:
# Remove any column holding at least one missing value, then discard the
# male/female percentage columns.
df = df.dropna(axis='columns')
df = df.drop(columns=['vet_males_p', 'vet_females_p'])

Let's see the percentage of veterans per state.

In [None]:
# Distribution of the veteran percentage (one box per state, across all years).
pd.plotting.boxplot(data=df, column='vet_pop_p', by='state', figsize=(20, 6))
box_ax = plt.gca()
_ = plt.xticks(rotation=90)
# Blank out the per-axes title and label both axes.
box_ax.set_title(' ')
box_ax.set_ylabel('%age of veterans')
box_ax.set_xlabel('States')
plt.tight_layout()

`Alaska` and `Wyoming` have the highest veteran percentage, followed closely by `Maine`, `Montana`, and `Virginia`.

# Trend in veteran population and veteran suicide

In [None]:
# One panel per pair of consecutive years: (2005, 2006), (2006, 2007), ...
# Iterating over explicit pairs avoids indexing past the end of year_range,
# so no exception needs to be swallowed and real errors stay visible.
year_pairs = list(zip(year_range[:-1], year_range[1:]))

f, a = plt.subplots(len(year_pairs), 1, sharex=True, figsize=(20, 15),
                    gridspec_kw={'hspace': 0}, squeeze=False)
a = a.ravel()  # squeeze=False always yields a 2-D array; flatten to one axis per pair

for ax, (year_a, year_b) in zip(a, year_pairs):
    tdf1 = df[df['year'] == year_a]
    tdf2 = df[df['year'] == year_b]

    # Left y-axis: veteran population of both years, in millions.
    ax.plot(tdf1['state'], tdf1['vet_pop']/1e6, marker='o', label='Vet. pop. in {}'.format(year_a), color='r')
    ax.plot(tdf2['state'], tdf2['vet_pop']/1e6, marker='o', label='Vet. pop. in {}'.format(year_b), color='g')
    ax.set_ylabel('Veteran population\nin Millions')
    ax.legend(loc='upper left')

    # Right y-axis: veteran suicides of the same two years, in thousands.
    a0 = ax.twinx()
    a0.plot(tdf1['state'], tdf1['vet_suicides']/1e3, marker='o', label='Vet. suicides in {}'.format(year_a), color='b')
    a0.plot(tdf2['state'], tdf2['vet_suicides']/1e3, marker='o', label='Vet. suicides in {}'.format(year_b), color='y')
    a0.set_ylabel('Veteran suicides\nin Thousands')
    a0.legend(loc='upper right')

# The x-axis is shared, so only the bottom panel carries the state labels.
a[-1].tick_params(axis='x', labelrotation=90)
a[-1].set_xlabel('States')
f.tight_layout()

It is a very clear confirmation of the obvious intuition that the states with the most veterans also have the most veteran suicides. Surprisingly, `California` has the most veterans, followed by `Florida` and then `Texas`, even though none of these states was mentioned in the percentage discussion above.

# Veteran population and suicide over all the years

In [None]:
# Column subsets used by the remaining sections of the report.
vet_cols = ['state', 'vet_pop', 'year', 'vet_suicides']
vet_and_overall_cols = vet_cols + ['all_suicides', 'overall_pop_18']

df_vet_pop = df.loc[:, vet_cols]
df_vet_pop_scde = df.loc[:, vet_and_overall_cols]

In [None]:
# Veteran population and veteran suicides per state, summed over all years.
state_totals = df_vet_pop.groupby('state')[['vet_pop', 'vet_suicides']].sum().reset_index()

# Per-metric rankings (largest first), kept under their original names.
df_vet_pop_sort = state_totals[['state', 'vet_pop']].sort_values('vet_pop', ascending=False)
df_vet_scd_sort = state_totals[['state', 'vet_suicides']].sort_values(
    'vet_suicides', ascending=False)

# s_names and s_count follow the population ranking. ss_count is re-aligned to
# that same state order: taking it straight from the suicide ranking would pair
# each suicide total with whichever state happens to share its rank in the
# population ranking, mislabelling the values when they are plotted against
# s_names.
s_names = df_vet_pop_sort['state'].to_numpy()
s_count = df_vet_pop_sort['vet_pop'].to_numpy()
ss_count = df_vet_scd_sort.set_index('state').loc[s_names, 'vet_suicides'].to_numpy()

In [None]:
# Bar chart of the per-state veteran population (millions) with the veteran
# suicides (thousands) overlaid on a twin axis created with twiny().
fig, ax_pop = plt.subplots(figsize=(20, 6))
ax_pop.bar(s_names, s_count/1e6, label='Veteran population [in millions]')
ax_pop.tick_params(axis='x', labelrotation=90)
ax_pop.set_xlabel('States')
ax_pop.set_ylabel('Veterans [Millions]')
ax_pop.legend(loc='best')
_ = ax_pop.set_title('Veteran statistics from year {} to {}.'.format(year_range[0], year_range[-1]))

ax_scd = ax_pop.twiny()
ax_scd.bar(s_names, ss_count/1e3, fc='g', label='Veteran suicides [in thousands]')
ax_scd.legend(loc='center right')
# Hide the duplicate state ticks on the top axis. An empty list means "no
# ticks"; an empty string would be passed on as a tick location instead.
_ = ax_scd.set_xticks([])

# Veteran, civilian and overall suicide rates

In [None]:
# Sum the suicide and population counts per state over all years.
summed_cols = ['vet_suicides', 'all_suicides', 'vet_pop', 'overall_pop_18']
per_state = df_vet_pop_scde.groupby('state')[summed_cols].sum().reset_index()

# Order states by total veteran population (largest first) with a fresh 0..n-1 index.
df_vet_pop_scde = per_state.sort_values('vet_pop', ascending=False).reset_index(drop=True)

# Civilian suicides are whatever remains after removing veteran suicides.
df_vet_pop_scde['civ_suicides'] = df_vet_pop_scde['all_suicides'].sub(df_vet_pop_scde['vet_suicides'])

In [None]:
# Rearrange the columns: state, populations, then the three suicide counts.
column_order = ['state', 'overall_pop_18', 'vet_pop', 'vet_suicides', 'civ_suicides', 'all_suicides']
df_vet_pop_scde = df_vet_pop_scde.loc[:, column_order]
df_vet_pop_scde.head()

In [None]:
# Civilian population = overall population minus veteran population.
df_vet_pop_scde['civ_pop'] = df_vet_pop_scde['overall_pop_18'].sub(df_vet_pop_scde['vet_pop'])

In [None]:
# Each rate column is (suicide count / matching population) expressed in percent.
rate_definitions = {
    'vet_scde_rate': ('vet_suicides', 'vet_pop'),
    'civ_scde_rate': ('civ_suicides', 'civ_pop'),
    'all_scde_rate': ('all_suicides', 'overall_pop_18'),
}
for rate_col, (cases_col, pop_col) in rate_definitions.items():
    df_vet_pop_scde[rate_col] = (df_vet_pop_scde[cases_col]/df_vet_pop_scde[pop_col])*100

In [None]:
# Suicide counts as NumPy arrays, in the row order of df_vet_pop_scde.
vet_scde = df_vet_pop_scde['vet_suicides'].to_numpy()
all_scde = df_vet_pop_scde['all_suicides'].to_numpy()
civ_scde = df_vet_pop_scde['civ_suicides'].to_numpy()

# Matching suicide rates (percent), same row order.
vet_scde_rate = df_vet_pop_scde['vet_scde_rate'].to_numpy()
all_scde_rate = df_vet_pop_scde['all_scde_rate'].to_numpy()
civ_scde_rate = df_vet_pop_scde['civ_scde_rate'].to_numpy()

In [None]:
# Two stacked panels sharing the state axis: absolute suicide counts (log
# scale) on top, suicide rates in percent below.
# The x values come from the same frame that produced the plotted arrays, so
# each point is labelled with its own state regardless of how other cells
# ordered their state lists.
states = df_vet_pop_scde['state'].to_numpy()

f, a = plt.subplots(2, 1, figsize=(20, 10), sharex=True, gridspec_kw={'hspace': 0})

a[0].plot(states, vet_scde, 'r-o', label='Veteran suicide cases')
a[0].plot(states, civ_scde, 'g-o', label='Civilian suicide cases')
a[0].plot(states, all_scde, 'b-o', label='Total suicide cases')
a[0].set_yscale('log')
a[0].set_ylabel('Number of cases')
a[0].grid(True)  # grid visibility is a boolean flag
a[0].legend(loc='best')

a[1].plot(states, vet_scde_rate, 'y-o', label='Veteran suicide rate')
a[1].plot(states, civ_scde_rate, 'm-o', label='Civilian suicide rate')
a[1].plot(states, all_scde_rate, color='lime', ls='-', marker='o', label='Total suicide rate')
a[1].set_ylabel('Percentage of suicide rates')
a[1].grid(True)
a[1].legend(loc='best')

# Only the bottom panel shows the shared state labels.
a[1].tick_params(axis='x', labelrotation=90)
_ = a[1].set_xlabel('States')
_ = f.tight_layout()

That's all for now