# Data Manipulation with Pandas (Code companion)

In [2]:
# import database to be used as a companion for this course
# Downloaded from kaggle
# Because of windows we have to escape the file path
import pandas as pd 
homelessness = pd.read_csv("C:\\repos\\data_notes\\databases\\Homelessness\\2007-2016-Homelessnewss-USA.csv")
print(homelessness.head())
print(homelessness.info())
print(homelessness.shape)
print(homelessness.describe())


Year State CoC Number       CoC Name  \
0  1/1/2007    AK     AK-500  Anchorage CoC   
1  1/1/2007    AK     AK-500  Anchorage CoC   
2  1/1/2007    AK     AK-500  Anchorage CoC   
3  1/1/2007    AK     AK-500  Anchorage CoC   
4  1/1/2007    AK     AK-500  Anchorage CoC   

                                     Measures Count  
0            Chronically Homeless Individuals   224  
1                        Homeless Individuals   696  
2                 Homeless People in Families   278  
3  Sheltered Chronically Homeless Individuals   187  
4                          Sheltered Homeless   842  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86529 entries, 0 to 86528
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        86529 non-null  object
 1   State       86529 non-null  object
 2   CoC Number  86529 non-null  object
 3   CoC Name    86529 non-null  object
 4   Measures    86529 non-null  object
 5   Count  

In [6]:
# We can sort by more than one column and considering diferent directions we just need to pass a list into the argument ascending

homelessness.columns
homelessness_reg_fam = homelessness.sort_values(["Year", "State"], ascending=[True, False])
print(homelessness_reg_fam.head())

Year State CoC Number               CoC Name  \
4812  1/1/2007    WY     WY-500  Wyoming Statewide CoC   
4813  1/1/2007    WY     WY-500  Wyoming Statewide CoC   
4814  1/1/2007    WY     WY-500  Wyoming Statewide CoC   
4815  1/1/2007    WY     WY-500  Wyoming Statewide CoC   
4816  1/1/2007    WY     WY-500  Wyoming Statewide CoC   

                                        Measures Count  
4812            Chronically Homeless Individuals    38  
4813                        Homeless Individuals   331  
4814                 Homeless People in Families   206  
4815  Sheltered Chronically Homeless Individuals     0  
4816                          Sheltered Homeless   397  


In [3]:
# now we extract the different components out of a dataframe
print(homelessness.values)
print(homelessness.columns)
print(homelessness.index)

[['1/1/2007' 'AK' 'AK-500' 'Anchorage CoC'
  'Chronically Homeless Individuals' '224']
 ['1/1/2007' 'AK' 'AK-500' 'Anchorage CoC' 'Homeless Individuals' '696']
 ['1/1/2007' 'AK' 'AK-500' 'Anchorage CoC' 'Homeless People in Families'
  '278']
 ...
 ['1/1/2016' 'WY' 'WY-500' 'Wyoming Statewide CoC'
  'Unsheltered Parenting Youth (Under 25)' '3']
 ['1/1/2016' 'WY' 'WY-500' 'Wyoming Statewide CoC'
  'Unsheltered Parenting Youth Age 18-24' '3']
 ['1/1/2016' 'WY' 'WY-500' 'Wyoming Statewide CoC'
  'Unsheltered Parenting Youth Under 18' '0']]
Index(['Year', 'State', 'CoC Number', 'CoC Name', 'Measures', 'Count'], dtype='object')
RangeIndex(start=0, stop=86529, step=1)


In [11]:
# We use [] in order to select only columns that matters to us
measures = homelessness["Measures"]
print(measures.head())

# In order to select multiple columns we will need to pass a list
state_measures = homelessness[["State", "Measures"]]
print(state_measures.head())

0              Chronically Homeless Individuals
1                          Homeless Individuals
2                   Homeless People in Families
3    Sheltered Chronically Homeless Individuals
4                            Sheltered Homeless
Name: Measures, dtype: object
  State                                    Measures
0    AK            Chronically Homeless Individuals
1    AK                        Homeless Individuals
2    AK                 Homeless People in Families
3    AK  Sheltered Chronically Homeless Individuals
4    AK                          Sheltered Homeless


In [23]:
# To filter rows we pass a boolean statement
print(
    homelessness[
        homelessness["Count"] > "300"
        ]
    )

# We can use & and | for multiple conditions
print(
    homelessness[
        (homelessness["State"] == "WY") &
        (homelessness["Count"] < "200")
    ]
)


Year State CoC Number               CoC Name  \
1      1/1/2007    AK     AK-500          Anchorage CoC   
4      1/1/2007    AK     AK-500          Anchorage CoC   
5      1/1/2007    AK     AK-500          Anchorage CoC   
7      1/1/2007    AK     AK-500          Anchorage CoC   
8      1/1/2007    AK     AK-500          Anchorage CoC   
...         ...   ...        ...                    ...   
86518  1/1/2016    WY     WY-500  Wyoming Statewide CoC   
86519  1/1/2016    WY     WY-500  Wyoming Statewide CoC   
86523  1/1/2016    WY     WY-500  Wyoming Statewide CoC   
86524  1/1/2016    WY     WY-500  Wyoming Statewide CoC   
86525  1/1/2016    WY     WY-500  Wyoming Statewide CoC   

                                                Measures Count  
1                                   Homeless Individuals   696  
4                                     Sheltered Homeless   842  
5                         Sheltered Homeless Individuals   589  
7                                         

In [26]:
# In order to filtre with several categories we can use the isin method
print(
    homelessness[
        homelessness["State"].isin(["WY", "AK"])
    ]
)

Year State CoC Number               CoC Name  \
0      1/1/2007    AK     AK-500          Anchorage CoC   
1      1/1/2007    AK     AK-500          Anchorage CoC   
2      1/1/2007    AK     AK-500          Anchorage CoC   
3      1/1/2007    AK     AK-500          Anchorage CoC   
4      1/1/2007    AK     AK-500          Anchorage CoC   
...         ...   ...        ...                    ...   
86524  1/1/2016    WY     WY-500  Wyoming Statewide CoC   
86525  1/1/2016    WY     WY-500  Wyoming Statewide CoC   
86526  1/1/2016    WY     WY-500  Wyoming Statewide CoC   
86527  1/1/2016    WY     WY-500  Wyoming Statewide CoC   
86528  1/1/2016    WY     WY-500  Wyoming Statewide CoC   

                                                Measures Count  
0                       Chronically Homeless Individuals   224  
1                                   Homeless Individuals   696  
2                            Homeless People in Families   278  
3             Sheltered Chronically Homele

In [31]:
# combining everything
homeless_fam = homelessness[
    homelessness["Measures"] == "Homeless People in Families"
    ]

high_homeless_fam = homeless_fam[
    homeless_fam["Count"] > "100"
    ]
high_homess_fam_srt = high_homeless_fam.sort_values("Count", ascending= False)
result = high_homess_fam_srt[["State", "Count"]]
print(result)


State Count
9986     CA   998
7466     MO   998
70574    CA   997
35146    UT   997
24415    NJ   995
...     ...   ...
1034     FL   101
1142     FL   101
17078    NC   101
11414    MA   101
39493    MD   101

[3693 rows x 2 columns]
