# Mapping Inequality in Baltimore : Text Analysis

## Introduction 

## Gathering Data

In [5]:
# Loads the Pandas library 
import pandas as pd

# Creates data frame named df by reading in the Baltimore csv
df = pd.read_csv("AD_Data_BaltimoreProject.csv")
df.head(n=3)

Unnamed: 0,Form,State,City,Security_Grade,Area_Number,Terrain_Description,Favorable_Influences,Detrimental_Influences,INHABITANTS_Type,INHABITANTS_Annual_Income,...,INHABITANTS_Population_Increase,INHABITANTS_Population_Decrease,INHABITANTS_Population_Static,BUILDINGS_Types,BUILDINGS_Construction,BUILDINGS_Age,BUILDINGS_Repair,Ten_Fifteen_Desirability,Remarks,Date
0,NS FORM-8 6-1-37,Maryland,Baltimore,A,2,Rolling,Fairly new suburban area of homogeneous charac...,,Substantial Middle Class,"$3000 - 5,000",...,Fast,,,Detached an row houses,Brick and frame,1 to 10 years,Good,Upward,A recent development with much room for expans...,"May 4,1937"
1,NS FORM-8 6-1-37,Maryland,Baltimore,A,1,Undulating,Very nicely planned residential area of medium...,,"Executives, Professional Men",over $5000,...,Moderately Fast,,,Single family detached,Brick and Stone,12 years,Very good,Upward,Mostly fee properties. A few homes valued at $...,"May 4,1937"
2,NS FORM-8 6-1-37,Maryland,Baltimore,A,3,Rolling,Good residential area. Well planned.,Distance to City,"Executives, Professional Men",3500 - 7000,...,Moderately Fast,,,One family detached,"Brick, Stone, and Frame",1 to 20 years,Good to excellent,Upward,Principally fee property. This section lies in...,"May 4,1937"


In [6]:
df['Favorable_Influences']

0     Fairly new suburban area of homogeneous charac...
1     Very nicely planned residential area of medium...
2                  Good residential area. Well planned.
3                    Well planned development of fairly
4     Desirable residential section. Good quality, m...
5             To be developed as high grade development
6     Desirable old residential section. Principally...
7     Restricted, desirable residential section of m...
8     Good residential section of fast moving property.
9     Good suburban residential area of modest homes...
10    Good residential section of homogeneous charac...
11       Near Druid Hill Park. Close to center of city.
12          Stable residential section of modest homes.
13            Good residential area holding up in value
14    Fairly good residential section with all conve...
15    Desirable residential section, homogeneous as ...
16    Homogeneous development. Near to parks. Transp...
17    Near to Sparrows Point. Desirable resident

In [7]:
# Convert from type object to type string 
df['Favorable_Influences'] = df['Favorable_Influences'].astype(str)
# Case normalization
df['Favorable_Influences'] = df['Favorable_Influences'].str.lower()

In [8]:
df['Favorable_Influences'][0:15]

0     fairly new suburban area of homogeneous charac...
1     very nicely planned residential area of medium...
2                  good residential area. well planned.
3                    well planned development of fairly
4     desirable residential section. good quality, m...
5             to be developed as high grade development
6     desirable old residential section. principally...
7     restricted, desirable residential section of m...
8     good residential section of fast moving property.
9     good suburban residential area of modest homes...
10    good residential section of homogeneous charac...
11       near druid hill park. close to center of city.
12          stable residential section of modest homes.
13            good residential area holding up in value
14    fairly good residential section with all conve...
Name: Favorable_Influences, dtype: object

In [9]:
# remove punctuation 
df['Favorable_Influences'] = df['Favorable_Influences'].str.replace('[^\w\s]','')

In [10]:
df['Favorable_Influences'][0:15]

0     fairly new suburban area of homogeneous charac...
1     very nicely planned residential area of medium...
2                    good residential area well planned
3                    well planned development of fairly
4     desirable residential section good quality med...
5             to be developed as high grade development
6     desirable old residential section principally ...
7     restricted desirable residential section of mo...
8      good residential section of fast moving property
9     good suburban residential area of modest homes...
10    good residential section of homogeneous character
11         near druid hill park close to center of city
12           stable residential section of modest homes
13            good residential area holding up in value
14    fairly good residential section with all conve...
Name: Favorable_Influences, dtype: object

In [11]:
# tokenization - where every work is a token 
df['Favorable_Influences'] = df['Favorable_Influences'].str.split('[\W_]+')

In [12]:
df['Favorable_Influences_clean'] = df['Favorable_Influences']

In [13]:
df['Favorable_Influences_clean'][0:15]

0     [fairly, new, suburban, area, of, homogeneous,...
1     [very, nicely, planned, residential, area, of,...
2              [good, residential, area, well, planned]
3              [well, planned, development, of, fairly]
4     [desirable, residential, section, good, qualit...
5     [to, be, developed, as, high, grade, development]
6     [desirable, old, residential, section, princip...
7     [restricted, desirable, residential, section, ...
8     [good, residential, section, of, fast, moving,...
9     [good, suburban, residential, area, of, modest...
10    [good, residential, section, of, homogeneous, ...
11    [near, druid, hill, park, close, to, center, o...
12    [stable, residential, section, of, modest, homes]
13    [good, residential, area, holding, up, in, value]
14    [fairly, good, residential, section, with, all...
Name: Favorable_Influences_clean, dtype: object

In [14]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
df['Favorable_Influences_clean'].apply(lambda x: [item for item in x if item not in stop_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarahagarrat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0     [fairly, new, suburban, area, homogeneous, cha...
1     [nicely, planned, residential, area, medium, l...
2              [good, residential, area, well, planned]
3                  [well, planned, development, fairly]
4     [desirable, residential, section, good, qualit...
5                 [developed, high, grade, development]
6     [desirable, old, residential, section, princip...
7     [restricted, desirable, residential, section, ...
8     [good, residential, section, fast, moving, pro...
9     [good, suburban, residential, area, modest, ho...
10    [good, residential, section, homogeneous, char...
11       [near, druid, hill, park, close, center, city]
12        [stable, residential, section, modest, homes]
13            [good, residential, area, holding, value]
14    [fairly, good, residential, section, convenien...
15    [desirable, residential, section, homogeneous,...
16    [homogeneous, development, near, parks, transp...
17    [near, sparrows, point, desirable, residen

In [15]:
from collections import Counter
df['word_count'] = df['Favorable_Influences_clean'].apply(Counter)

In [16]:
df['word_count'][0]

Counter({'': 1,
         'area': 2,
         'character': 1,
         'city': 1,
         'close': 1,
         'development': 1,
         'fairly': 2,
         'homogeneous': 1,
         'near': 2,
         'new': 1,
         'of': 1,
         'parks': 1,
         'planned': 1,
         'schools': 1,
         'shopping': 1,
         'suburban': 1,
         'to': 1,
         'well': 1})