In [None]:
# imports and so on
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Last statement of the setup cell: its output confirms the cell ran to the end.
print("Setup Complete")

# Looking at a randomly picked dataset

### First, load the data

In [None]:
# Relative path to the dataset CSV.
file_path = '../input/suicide-rates-overview-1985-to-2016/master.csv'

# Read the whole file into a DataFrame using pandas' default parsing options.
my_data = pd.read_csv(filepath_or_buffer=file_path)

### Then, look over it

In [None]:
# Preview the first 15 rows of the raw data.
my_data.head(n=15)

In [None]:
# Summary statistics of the raw data, as computed by pandas' describe().
my_data.describe()

In [None]:
# Print the column names, non-null counts and dtypes of the raw data.
my_data.info()

### Check some details about it

In [None]:
# Distinct country names, obtained the same way as the distinct years below
# (no need for a groupby just to count unique values).
countries = my_data['country'].dropna().unique()
print("There are", len(countries), "countries")

# .tolist() converts NumPy scalars to plain Python ints so the printed list
# stays readable regardless of the installed NumPy version.
years_list = sorted(my_data['year'].unique().tolist())
years = len(years_list)
print("There are", years, "years in the dataset:", years_list)

### Drop redundant data

In [None]:
# Work on a copy without the two columns that are not needed further on;
# the raw frame `my_data` is left untouched.
redundant_columns = ['suicides/100k pop', 'country-year']
my_data_trimmed = my_data.drop(columns=redundant_columns)

### Clean it up a bit

In [None]:
# Strip surrounding whitespace from the column headers (safe to repeat).
my_data_trimmed.columns = my_data_trimmed.columns.str.strip()

# Convert the GDP column from comma-grouped text to int64.
# - astype(str) first makes the cell idempotent: on a re-run the column is
#   already int64, and calling .str on it directly would raise.
# - regex=False makes the comma a literal, independent of the pandas version's
#   default for `regex`.
my_data_trimmed['gdp_for_year ($)'] = (
    my_data_trimmed['gdp_for_year ($)']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('int64')
)
my_data_trimmed.head()

### A trend chart

Just a simple line plot, nothing fancy. Since there are many countries in the file, just focus on some of them.

First, group the data by country and year, summing values.
The original data set is too detailed for trend charting: I want to compare countries by suicide rate regardless of sex and age.

In [None]:
# Count columns that get summed whenever the data is aggregated.
colNames = ['suicides_no', 'population']

# Collapse the per-sex / per-age rows into one row per (country, year).
countriesData = (
    my_data_trimmed
    .groupby(['country', 'year'])[colNames]
    .sum()
)

print(countriesData.head())

### Compute the number of suicides per 100,000 population for each country

In [None]:
# Name of the derived rate column used by all charts below.
fieldToChart = 'suicides_avg'

# Suicides per 100,000 population, computed with vectorised column arithmetic
# instead of a Python-level row-by-row apply.
countriesData[fieldToChart] = countriesData['suicides_no'] / countriesData['population'] * 100000

# Keep only the rate column.
# NOTE: this removes the count columns the line above reads, so re-run the
# aggregation cell before re-running this one.
countriesData = countriesData.drop(columns=colNames)

print(countriesData.head(10))

### The same for the 'whole world' (not all countries are present)

In [None]:
# Sum the counts over every country present in the data, one row per year.
worldData = my_data_trimmed.groupby('year')[colNames].sum()

print(worldData.head())

# Suicides per 100,000 population (vectorised rather than row-wise apply).
worldData[fieldToChart] = worldData['suicides_no'] / worldData['population'] * 100000
# Reassign instead of inplace=True; only the rate column is kept.
worldData = worldData.drop(columns=colNames)

print(worldData.head())


### Now chart some countries and the 'world' average

In [None]:
# Countries to compare against the overall figure.
countries_to_chart = ['Albania', 'Romania', 'France', 'Hungary', 'United States']

# Use an explicit Axes so every line is drawn on the same, known target.
fig, ax = plt.subplots(figsize=(14, 8))

for country in countries_to_chart:
    # Selecting one country from the (country, year) index leaves a
    # year-indexed frame, so the year becomes the x axis.
    country_rates = countriesData.loc[country]
    sns.lineplot(data=country_rates[fieldToChart], label=country, ax=ax)

sns.lineplot(data=worldData[fieldToChart], label='World', ax=ax)

# Label after plotting so nothing drawn above can replace the labels.
# The plotted values are rates per 100k population, not raw counts.
ax.set(
    title='Average suicide rates by year',
    xlabel='Year',
    ylabel='Suicides per 100k population',
)
plt.show()

### Look at generations data

In [None]:
# Sum the counts per generation.
genData = my_data_trimmed.groupby('generation')[colNames].sum()

print(genData.head())

# Suicides per 100,000 population (vectorised rather than row-wise apply).
genData[fieldToChart] = genData['suicides_no'] / genData['population'] * 100000
# Reassign instead of inplace=True; only the rate column is kept.
genData = genData.drop(columns=colNames)

print(genData.head())

In [None]:
# One bar per generation; bar height is the value in the 'suicides_avg' column.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title('Avg suicides by generation')
sns.barplot(x=genData.index, y=genData['suicides_avg'], ax=ax)

### Now let's look at the details, by sex

Don't alter the data: display it almost as it is, only recomputing the suicide rates.
Color by sex.

In [None]:
# Work on a copy so the trimmed frame keeps its original columns.
my_detailed_data = my_data_trimmed.copy()

# Per-row suicides per 100,000 population. Vectorised column arithmetic
# avoids a Python-level apply over every row of the frame.
my_detailed_data[fieldToChart] = (
    my_detailed_data['suicides_no'] / my_detailed_data['population'] * 100000
)

# Rate against GDP per capita, one point per row, coloured by sex.
plt.figure(figsize=(14, 8))
plt.title('Suicides by sex')
sns.scatterplot(
    data=my_detailed_data,
    x='gdp_per_capita ($)',
    y=fieldToChart,
    hue='sex',
)

In [None]:
# Sum the counts per sex over the whole data set.
# NOTE: this rebinds `worldData`, which held the per-year aggregate used by the
# trend chart; re-run the per-year cell before re-running that chart.
worldData = my_data_trimmed.groupby('sex')[colNames].sum()

# Suicides per 100,000 population (vectorised rather than row-wise apply).
worldData[fieldToChart] = worldData['suicides_no'] / worldData['population'] * 100000
# Reassign instead of inplace=True; only the rate column is kept.
worldData = worldData.drop(columns=colNames)

worldData

In [None]:
# Pie chart with one wedge per row of worldData, sized by the 'suicides_avg' column.
worldData.plot(kind='pie', y='suicides_avg', figsize=(10, 10))

How similar is it for individual countries?

In [None]:
# Keep only the rows for the countries charted earlier. A dedicated name is
# used so the per-country yearly rates stored in `countriesData` are not
# overwritten (overwriting them breaks the trend-chart cell on re-run).
selectedCountriesData = my_data_trimmed[my_data_trimmed['country'].isin(countries_to_chart)]

# Sum the counts per (country, sex).
sexData = selectedCountriesData.groupby(['country', 'sex'])[colNames].sum()

print(sexData.head())

# Suicides per 100,000 population (vectorised rather than row-wise apply).
sexData[fieldToChart] = sexData['suicides_no'] / sexData['population'] * 100000
# Reassign instead of inplace=True; only the rate column is kept.
sexData = sexData.drop(columns=colNames)

print(sexData.head())

In [None]:
# Turn the (country, sex) index levels into ordinary columns.
# Guarded so that re-running this cell does not reset the index a second time
# and add a spurious 'index' column.
if 'country' not in sexData.columns:
    sexData = sexData.reset_index()

sexData.head()

In [None]:
def pie(vals, lab, color=None):
    """Draw a pie chart of ``vals`` on the current axes.

    The wedge labels are taken from ``lab.values``. ``color`` is accepted
    but not used.
    """
    wedge_labels = lab.values
    plt.pie(vals, labels=wedge_labels)
    
# (A seaborn FacetGrid variant based on pie() was tried and left disabled.)
# Reshape to a sex x country table and let pandas draw one pie per country column.
# pivot() is called with keyword arguments: pandas >= 2.0 no longer accepts
# index/columns/values positionally.
sexData.pivot(index='sex', columns='country', values='suicides_avg').plot.pie(subplots=True, figsize=(35, 15))

### Let's look at correlations

In [None]:
# Apply seaborn's default styling, then draw pairwise plots for the detailed frame.
sns.set()
pair_grid = sns.pairplot(data=my_detailed_data, height=2.3)
plt.show()

In [None]:
# Correlate the numeric columns only. The frame also holds string columns
# (e.g. 'country', 'sex'), and pandas >= 2.0 raises on them because
# `numeric_only` now defaults to False. Selecting the numeric dtypes explicitly
# works on both old and new pandas versions.
correlations = my_detailed_data.select_dtypes(include='number').corr()

correlations

In [None]:
# Square, annotated heatmap of the correlation matrix.
plt.figure(figsize=(14, 14))
plt.title('Correlation matrix')
sns.set(font_scale=1.5)

# The same column labels are used on both axes.
axis_labels = correlations.columns.values
hm = sns.heatmap(
    correlations,
    annot=True,
    fmt='.4f',
    annot_kws={'size': 17},
    cbar=True,
    square=True,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)
plt.show()