In [2]:
# We import our packages
import pandas as pd
import numpy as np
import re
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the final DataFrame produced in Milestone 2. It is stored as a pickle and the
# path is relative to the directory the notebook is run from.
# NOTE(review): unpickling can execute arbitrary code, so only load this file when it
# comes from a trusted source (e.g. our own Milestone 2 output).
df = pd.read_pickle('Pickles/Final_dataframe.pickle')
# Show the first rows as a quick check of the loaded columns.
df.head()

Unnamed: 0,State,Price_2018_Studio,Price_2018_1br,Price_2018_2br,Price_2018_3br,Price_2018_4br,"Overall Homeless, 2018","Overall Homeless, 2017","Overall Homeless, 2016","Overall Homeless, 2015",...,Theft,Motor_Vehicle_theft,Per_capita_income,Life_Expectancy,N_of_colleges_universities,N_of_junior_colleges,N_of_technical_trade_schools,awards_per_value,exp_award_value,top_230_ranking_score
0,alabama,605.416667,659.0,800.666667,1069.666667,1244.333333,3434,3793,4111,3970,...,2006.3,241.1,38215.0,74.813987,42.0,36.0,16.0,19.514493,63013.173913,366
1,alaska,814.333333,930.5,1190.333333,1682.166667,2019.5,2016,1845,1940,1956,...,2394.7,412.1,54430.0,78.915541,6.0,1.0,2.0,18.414286,141431.142857,29
2,arizona,701.0,828.0,1046.0,1512.333333,1754.416667,9865,8947,9707,9896,...,2168.1,265.8,39955.0,78.364742,46.0,43.0,42.0,25.563492,47830.888889,209
3,arkansas,524.666667,571.166667,721.416667,977.583333,1144.833333,2712,2467,2463,2560,...,2233.6,239.4,39171.0,75.6269,26.0,26.0,32.0,21.970833,51132.479167,96
4,california,1212.333333,1426.333333,1818.916667,2519.25,2926.333333,129972,131532,118142,115738,...,1623.0,450.3,54800.0,80.231014,264.0,191.0,239.0,22.771429,63022.202857,3064


# How ADA can help you to choose where to live
## A data analysis based on *quality of life* in the USA

In our lives, we all have to make some choices.
Probably one of the most difficult is choosing the place where we want to live. Of course, the right environment is necessary to live the *life of our dreams*.
The *quality* of a place is highly subjective and depends on personal factors, such as "places of the heart", closeness to family and friends, the possibility of doing the activities each person enjoys...

At the same time, we can underline some features that are **general and objective** when describing the *quality* of places: rate of education, health, security, income...
Based on these features, sociologists have tried to carry out indicative analyses to decide whether some places are objectively better than others, and to identify those places.

The **Human Development Index (HDI)** is a statistic composite index of life expectancy, education, and per capita income indicators, which are used to rank countries into four tiers of human development. A country scores a higher HDI when the lifespan is higher, the education level is higher, and the gross national income GNI (PPP) per capita is higher. [1]

Building on this universally recognised index, we want to personalize our analysis.
The classic HDI gives fixed weights to each feature. But every person has their own preferences and needs. Some people may consider the possibility of earning a high income a must, while others would trade it for a longer and healthier life, or for security.

In our analysis we selected data from different sources (government datasets, websites...) and in different formats (csv, xml, html...) in order to obtain different features for different **USA states**.
We obtained more than 40 different features, related to different macrocategories:
- **Education**;
- **Health**;
- **Security**;
- **Economy**.

For each macrocategory, we analysed all the related data in order to evaluate every state, giving each one a *score* for each field.
These scores are used to evaluate which state in the USA is the **best place to live**.

In [8]:
# Column selections used to slice the final DataFrame, one list per macrocategory.
# Every list starts with the same two columns: 'State' and 'Population'.

education_list = [
    'State',
    'Population',
    # Counts of institutions
    'N_of_colleges_universities',
    'N_of_junior_colleges',
    'N_of_technical_trade_schools',
    # Awards, expenditure and ranking columns
    'awards_per_value',
    'exp_award_value',
    'top_230_ranking_score',
    # School fees
    'High_School_Fee',
    'Elementary_School_Fee',
]

health_list = [
    'State',
    'Population',
    'Life_Expectancy',
    'mc_donalds_per_100k',
    'adult_obesity_rate',
    'eating_vegetables_daily',
    'Vegetable',
    'diabetes_prevalence',
    'alcohol_prevalence',
    'mean_physical_activity',
    'mean_obesity',
]

security_list = [
    'State',
    'Population',
    'alcohol_prevalence',
    'Overall Homeless, 2018',
    # Violent-crime columns
    'Violent_Crime',
    'Murder_and_Manslaughter',
    'Rape',
    'Robbery',
    # NOTE(review): spelled 'Assoult'; presumably this matches the DataFrame column
    # name, so confirm before correcting the spelling.
    'Aggravated_Assoult',
    # Property-crime columns
    'Property_crime',
    'Burglary',
    'Theft',
    'Motor_Vehicle_theft',
]

economy_list = [
    'State',
    'Population',
    'Per_capita_income',
    'mc_donalds_per_100k',
    # 2018 price columns, from studio up to 4 bedrooms
    'Price_2018_Studio',
    'Price_2018_1br',
    'Price_2018_2br',
    'Price_2018_3br',
    'Price_2018_4br',
    'Overall Homeless, 2018',
    # School fees (also present in education_list, in the opposite order)
    'Elementary_School_Fee',
    'High_School_Fee',
]

# Only the two shared columns for now.
miscellaneous_list = [
    'State',
    'Population',
]

## Education

In [9]:
# Build the "education" DataFrame from the final Milestone 2 DataFrame, keeping only
# the columns named in education_list.
# .copy() gives df_education its own data, so adding or modifying columns on it later
# does not raise SettingWithCopyWarning and never touches df.
df_education = df[education_list].copy()
# Show the first rows of the education subset.
df_education.head()

Unnamed: 0,State,Population,N_of_colleges_universities,N_of_junior_colleges,N_of_technical_trade_schools,awards_per_value,exp_award_value,top_230_ranking_score,High_School_Fee,Elementary_School_Fee
0,alabama,4863300,42.0,36.0,16.0,19.514493,63013.173913,366,7633,6388
1,alaska,741894,6.0,1.0,2.0,18.414286,141431.142857,29,6118,7544
2,arizona,6931071,46.0,43.0,42.0,25.563492,47830.888889,209,17339,6300
3,arkansas,2988248,26.0,26.0,32.0,21.970833,51132.479167,96,6580,4724
4,california,39250017,264.0,191.0,239.0,22.771429,63022.202857,3064,19235,11360


The **Education** score is calculated based on the following features:
- *Number of colleges and universities* in each state in 2015;
- *Number of junior colleges* in each state in 2015;
- *Number of professional schools* (technical schools, trade schools...) in each state in 2015;
- *Number of degrees per 100 students* in 2015: the higher this parameter is, the higher the rate of success of students is;
- *Institutional expenditure per degree* in 2015: the higher this parameter is, the better the educational services offered are supposed to be;


#### References

[1] https://en.wikipedia.org/wiki/Human_Development_Index