In [88]:
import pandas as pd
import matplotlib.pyplot as plt

In [89]:
# Load the homelessness dataset from the CSV file in the working directory.
homelessness = pd.read_csv(filepath_or_buffer="homelessness.csv")


In [90]:
# Inspecting a DataFrame

# Preview the first five rows (five is the default for .head()).
homelessness.head(n=5)


Unnamed: 0,region,state,individuals,family_members,state_pop
0,East South Central,Alabama,2570.0,864.0,4887681
1,Pacific,Alaska,1434.0,582.0,735139
2,Mountain,Arizona,7259.0,2606.0,7158024
3,West South Central,Arkansas,2280.0,432.0,3009733
4,Pacific,California,109008.0,20964.0,39461588


In [28]:
# Print a concise summary of the DataFrame: the index range, each column's
# non-null count and dtype, and the memory usage.
homelessness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   region          51 non-null     object 
 1   state           51 non-null     object 
 2   individuals     51 non-null     float64
 3   family_members  51 non-null     float64
 4   state_pop       51 non-null     int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 2.1+ KB


In [30]:
# .shape is a (number of rows, number of columns) tuple for the DataFrame.

homelessness.shape

(51, 5)

In [33]:
# Compute summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
homelessness.describe()

Unnamed: 0,individuals,family_members,state_pop
count,51.0,51.0,51.0
mean,7225.784314,3504.882353,6405637.0
std,15991.025083,7805.411811,7327258.0
min,434.0,75.0,577601.0
25%,1446.5,592.0,1777414.0
50%,3082.0,1482.0,4461153.0
75%,6781.5,3196.0,7340946.0
max,109008.0,52070.0,39461590.0


Sorting rows

In [39]:
# Order the rows by the number of homeless individuals, smallest first,
# then show the first five rows of the sorted frame.
homelessness_ind = homelessness.sort_values(by="individuals", ascending=True)
print(homelessness_ind.head(n=5))

                region         state  individuals  family_members  state_pop
50            Mountain       Wyoming        434.0           205.0     577601
34  West North Central  North Dakota        467.0            75.0     758080
7       South Atlantic      Delaware        708.0           374.0     965479
39         New England  Rhode Island        747.0           354.0    1058287
45         New England       Vermont        780.0           511.0     624358


In [40]:
# Rank the rows from most to fewest homeless family members and preview the
# top five.
homelessness_fam = homelessness.sort_values(by="family_members", ascending=False)
print(homelessness_fam.head(n=5))

                region          state  individuals  family_members  state_pop
32        Mid-Atlantic       New York      39827.0         52070.0   19530351
4              Pacific     California     109008.0         20964.0   39461588
21         New England  Massachusetts       6811.0         13257.0    6882635
9       South Atlantic        Florida      21443.0          9587.0   21244317
43  West South Central          Texas      19199.0          6111.0   28628666


In [41]:
# Sort by region in ascending order; within each region, put the largest
# family_members counts first.
homelessness_reg_fam = homelessness.sort_values(
    by=["region", "family_members"],
    ascending=[True, False],
)
print(homelessness_reg_fam.head(n=5))

                region      state  individuals  family_members  state_pop
13  East North Central   Illinois       6752.0          3891.0   12723071
35  East North Central       Ohio       6929.0          3320.0   11676341
22  East North Central   Michigan       5209.0          3142.0    9984072
49  East North Central  Wisconsin       2740.0          2167.0    5807406
14  East North Central    Indiana       3776.0          1482.0    6695497


Subsetting columns

In [42]:
# Keep just two columns: individuals first, state second.
ind_state = homelessness.loc[:, ["individuals", "state"]]
print(ind_state.head(n=5))

   individuals       state
0       2570.0     Alabama
1       1434.0      Alaska
2       7259.0     Arizona
3       2280.0    Arkansas
4     109008.0  California


Subsetting rows

In [44]:
# Keep only the rows with more than 10000 homeless individuals.
ind_gt_10k = homelessness.loc[homelessness["individuals"].gt(10000)]
print(ind_gt_10k)

                region       state  individuals  family_members  state_pop
4              Pacific  California     109008.0         20964.0   39461588
9       South Atlantic     Florida      21443.0          9587.0   21244317
32        Mid-Atlantic    New York      39827.0         52070.0   19530351
37             Pacific      Oregon      11139.0          3337.0    4181886
43  West South Central       Texas      19199.0          6111.0   28628666
47             Pacific  Washington      16424.0          5880.0    7523869


In [45]:
# Keep only the rows whose region is exactly "Mountain".
mountain_reg = homelessness.loc[homelessness["region"].eq("Mountain")]
print(mountain_reg)

      region       state  individuals  family_members  state_pop
2   Mountain     Arizona       7259.0          2606.0    7158024
5   Mountain    Colorado       7607.0          3250.0    5691287
12  Mountain       Idaho       1297.0           715.0    1750536
26  Mountain     Montana        983.0           422.0    1060665
28  Mountain      Nevada       7058.0           486.0    3027341
31  Mountain  New Mexico       1949.0           602.0    2092741
44  Mountain        Utah       1904.0           972.0    3153550
50  Mountain     Wyoming        434.0           205.0     577601


In [43]:
# Keep Pacific-region rows with fewer than 1000 homeless family members.
# Both conditions must hold, so the two boolean masks are combined with &.
fam_lt_1k_pac = homelessness.loc[
    homelessness["family_members"].lt(1000)
    & homelessness["region"].eq("Pacific")
]
print(fam_lt_1k_pac)

    region   state  individuals  family_members  state_pop
1  Pacific  Alaska       1434.0           582.0     735139


Subsetting rows by categorical variables

In [47]:
# Keep rows whose region is either South Atlantic or Mid-Atlantic.
# .isin() tests membership in the list, which matches OR-ing two equality checks.
south_mid_atlantic = homelessness.loc[
    homelessness["region"].isin(["South Atlantic", "Mid-Atlantic"])
]
print(south_mid_atlantic.head(n=5))

            region                 state  individuals  family_members  \
7   South Atlantic              Delaware        708.0           374.0   
8   South Atlantic  District of Columbia       3770.0          3134.0   
9   South Atlantic               Florida      21443.0          9587.0   
10  South Atlantic               Georgia       6943.0          2556.0   
20  South Atlantic              Maryland       4914.0          2230.0   

    state_pop  
7      965479  
8      701547  
9    21244317  
10   10511131  
20    6035802  


In [49]:
# The Mojave Desert states, used as the membership list below.
canu = [
    "California",
    "Arizona",
    "Nevada",
    "Utah",
]

# Keep only the rows whose state appears in the list above.
mojave_homelessness = homelessness.loc[homelessness["state"].isin(canu)]
print(mojave_homelessness.head(n=5))

      region       state  individuals  family_members  state_pop
2   Mountain     Arizona       7259.0          2606.0    7158024
4    Pacific  California     109008.0         20964.0   39461588
28  Mountain      Nevada       7058.0           486.0    3027341
44  Mountain        Utah       1904.0           972.0    3153550


Adding new columns

In [53]:
# New column "total": individuals plus family_members, row by row.
# This writes into the existing homelessness frame.
homelessness["total"] = homelessness["individuals"].add(homelessness["family_members"])
print(homelessness.head(n=5))

               region       state  individuals  family_members  state_pop  \
0  East South Central     Alabama       2570.0           864.0    4887681   
1             Pacific      Alaska       1434.0           582.0     735139   
2            Mountain     Arizona       7259.0          2606.0    7158024   
3  West South Central    Arkansas       2280.0           432.0    3009733   
4             Pacific  California     109008.0         20964.0   39461588   

      total  
0    3434.0  
1    2016.0  
2    9865.0  
3    2712.0  
4  129972.0  


In [54]:
# New column "p_individuals": the share of each row's total that is individuals.
# Relies on the "total" column already being present in homelessness.
homelessness["p_individuals"] = homelessness["individuals"].div(homelessness["total"])
print(homelessness.head(n=5))
                                                                           

               region       state  individuals  family_members  state_pop  \
0  East South Central     Alabama       2570.0           864.0    4887681   
1             Pacific      Alaska       1434.0           582.0     735139   
2            Mountain     Arizona       7259.0          2606.0    7158024   
3  West South Central    Arkansas       2280.0           432.0    3009733   
4             Pacific  California     109008.0         20964.0   39461588   

      total  p_individuals  
0    3434.0       0.748398  
1    2016.0       0.711310  
2    9865.0       0.735834  
3    2712.0       0.840708  
4  129972.0       0.838704  


In [68]:
# New column "indiv_per_10k": homeless individuals per 10000 of state_pop.
homelessness["indiv_per_10k"] = (
    homelessness["individuals"].mul(10000).div(homelessness["state_pop"])
)
print(homelessness.head(n=5))


               region       state  individuals  family_members  state_pop  \
0  East South Central     Alabama       2570.0           864.0    4887681   
1             Pacific      Alaska       1434.0           582.0     735139   
2            Mountain     Arizona       7259.0          2606.0    7158024   
3  West South Central    Arkansas       2280.0           432.0    3009733   
4             Pacific  California     109008.0         20964.0   39461588   

      total  p_individuals  indiv_per_10k  
0    3434.0       0.748398       5.258117  
1    2016.0       0.711310      19.506515  
2    9865.0       0.735834      10.141067  
3    2712.0       0.840708       7.575423  
4  129972.0       0.838704      27.623825  


In [69]:
# Keep the rows where indiv_per_10k exceeds 20.
high_homelessness = homelessness.loc[homelessness["indiv_per_10k"].gt(20)]
print(high_homelessness.head(n=5))

            region                 state  individuals  family_members  \
4          Pacific            California     109008.0         20964.0   
8   South Atlantic  District of Columbia       3770.0          3134.0   
11         Pacific                Hawaii       4131.0          2399.0   
28        Mountain                Nevada       7058.0           486.0   
32    Mid-Atlantic              New York      39827.0         52070.0   

    state_pop     total  p_individuals  indiv_per_10k  
4    39461588  129972.0       0.838704      27.623825  
8      701547    6904.0       0.546060      53.738381  
11    1420593    6530.0       0.632619      29.079406  
28    3027341    7544.0       0.935578      23.314189  
32   19530351   91897.0       0.433387      20.392363  


In [71]:
# Order high_homelessness from the highest indiv_per_10k downwards.
high_homelessness_srt = high_homelessness.sort_values(
    by="indiv_per_10k",
    ascending=False,
)
print(high_homelessness_srt.head(n=5))


            region                 state  individuals  family_members  \
8   South Atlantic  District of Columbia       3770.0          3134.0   
11         Pacific                Hawaii       4131.0          2399.0   
4          Pacific            California     109008.0         20964.0   
37         Pacific                Oregon      11139.0          3337.0   
28        Mountain                Nevada       7058.0           486.0   

    state_pop     total  p_individuals  indiv_per_10k  
8      701547    6904.0       0.546060      53.738381  
11    1420593    6530.0       0.632619      29.079406  
4    39461588  129972.0       0.838704      27.623825  
37    4181886   14476.0       0.769481      26.636307  
28    3027341    7544.0       0.935578      23.314189  


In [66]:

# Reduce the sorted frame to the state name and its indiv_per_10k value.
result = high_homelessness_srt.loc[:, ["state", "indiv_per_10k"]]
print(result)

                   state  indiv_per_10k
8   District of Columbia      53.738381
11                Hawaii      29.079406
4             California      27.623825
37                Oregon      26.636307
28                Nevada      23.314189
47            Washington      21.829195
32              New York      20.392363


In [73]:
# NOTE(review): this cell recomputes the "total" and "p_individuals" columns
# that the earlier "Adding new columns" cells already created; the assignments
# overwrite those columns with the same values.

# "total": individuals plus family_members, row by row.
homelessness["total"] = homelessness["individuals"].add(homelessness["family_members"])

# "p_individuals": the share of each row's total that is individuals.
homelessness["p_individuals"] = homelessness["individuals"].div(homelessness["total"])

# Show the first five rows with the columns in place.
print(homelessness.head(n=5))

               region       state  individuals  family_members  state_pop  \
0  East South Central     Alabama       2570.0           864.0    4887681   
1             Pacific      Alaska       1434.0           582.0     735139   
2            Mountain     Arizona       7259.0          2606.0    7158024   
3  West South Central    Arkansas       2280.0           432.0    3009733   
4             Pacific  California     109008.0         20964.0   39461588   

      total  p_individuals  indiv_per_10k  
0    3434.0       0.748398       5.258117  
1    2016.0       0.711310      19.506515  
2    9865.0       0.735834      10.141067  
3    2712.0       0.840708       7.575423  
4  129972.0       0.838704      27.623825  


In [74]:
# NOTE(review): this cell repeats, in one place, the indiv_per_10k / filter /
# sort / select steps that earlier cells already ran one at a time.

# "indiv_per_10k": homeless individuals per 10000 of state_pop.
homelessness["indiv_per_10k"] = (
    homelessness["individuals"].mul(10000).div(homelessness["state_pop"])
)

# Rows where indiv_per_10k exceeds 20.
high_homelessness = homelessness.loc[homelessness["indiv_per_10k"].gt(20)]

# Those rows ordered from the highest indiv_per_10k downwards.
high_homelessness_srt = high_homelessness.sort_values(
    by="indiv_per_10k",
    ascending=False,
)

# Only the state and indiv_per_10k columns of the sorted rows.
result = high_homelessness_srt.loc[:, ["state", "indiv_per_10k"]]

# Show the full result.
print(result)

                   state  indiv_per_10k
8   District of Columbia      53.738381
11                Hawaii      29.079406
4             California      27.623825
37                Oregon      26.636307
28                Nevada      23.314189
47            Washington      21.829195
32              New York      20.392363


In [79]:

# Print four summary statistics of the individuals column, one per line and
# in this order: mean, median, maximum, minimum.
individuals_col = homelessness["individuals"]
print(
    individuals_col.mean(),
    individuals_col.median(),
    individuals_col.max(),
    individuals_col.min(),
    sep="\n",
)

7225.78431372549
3082.0
109008.0
434.0


In [82]:
# A custom IQR function. The .agg() method accepts user-defined functions such
# as this one.
def iqr(column):
    """Return the interquartile range of ``column``.

    The result is the 0.75 quantile minus the 0.25 quantile, each obtained
    from ``column.quantile``.
    """
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile
    
# Print the IQR of the family_members column by passing the custom iqr
# function to .aggregate() (the long-form name of .agg()).
print(homelessness["family_members"].aggregate(iqr))

2604.0


In [85]:
#Update the column selection to use the custom iqr function with .agg()

# A custom IQR function.
# NOTE(review): same computation as the iqr defined in the earlier cell;
# running this cell rebinds the name to this definition.
def iqr(column):
    """Return the 0.75 quantile of ``column`` minus its 0.25 quantile."""
    upper_quartile, lower_quartile = (column.quantile(q) for q in (0.75, 0.25))
    return upper_quartile - lower_quartile

# Print the IQR of the family_members, individuals and indiv_per_10k columns;
# the custom iqr function is applied to each selected column.
print(
    homelessness.loc[:, ["family_members", "individuals", "indiv_per_10k"]].aggregate(iqr)
)

family_members    2604.000000
individuals       5335.000000
indiv_per_10k        3.944916
dtype: float64


In [87]:
# Import NumPy and create custom IQR function
import numpy as np
def iqr(column):
    """Return the interquartile range (0.75 quantile minus 0.25 quantile) of ``column``.

    NOTE(review): same computation as the iqr definitions in the earlier
    cells; running this cell rebinds the name to this definition.
    """
    q3, q1 = column.quantile(0.75), column.quantile(0.25)
    return q3 - q1

# Print the IQR and the median of the family_members, individuals and
# indiv_per_10k columns. Each entry in the list passed to .agg() becomes one
# row of the result ("iqr" and "median").
# The median is requested by its string name rather than as np.median: recent
# pandas versions emit a FutureWarning when a NumPy reducer is passed to
# .agg(), and the string selects pandas' own median, keeping the same row label.
print(homelessness[["family_members", "individuals", "indiv_per_10k"]].agg([iqr, "median"]))

        family_members  individuals  indiv_per_10k
iqr             2604.0       5335.0       3.944916
median          1482.0       3082.0       7.122409
