##Processing for organization dataset
Authors: Sanjana and Rachel

Needed files: 

*   interest_areas.txt
*   interest_categories.txt
*   org_dataset.csv
*   encoded_org_dataset.csv




####Import libraries

In [455]:
import pandas as pd
import numpy as np
from numpy.random import choice
import copy
import re
import io

### Process age columns into single majority age group designation
Author: Sanjana

In [482]:
df = pd.read_csv('/content/org_dataset.csv')

In [483]:
print(f"df type: {type(df)}")
print(f"df shape: {df.shape}")

df type: <class 'pandas.core.frame.DataFrame'>
df shape: (544, 23)


In [484]:
df.columns

Index(['Organization/ City Agency/ Division Name',
       'Street Address/Mailing Address', 'City', 'State', 'Postcode',
       'Year Surveyed', 'Total Vounteers', 'Youth volunteers',
       'Adult Volunteers', 'Older adult Volunteers', 'Organization Type',
       'Interest Areas', 'Special Populations Served', 'Boroughs  Served',
       'Borough', 'Latitude', 'Longitude', 'Community Board',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA'],
      dtype='object')

Deep copy selected columns 

In [485]:
organization = copy.deepcopy(df['Organization/ City Agency/ Division Name'])
total_vol = copy.deepcopy(df['Total Vounteers'])
youth_vol = copy.deepcopy(df['Youth volunteers'])
adult_vol = copy.deepcopy(df['Adult Volunteers'])
older_vol = copy.deepcopy(df['Older adult Volunteers'])

Clean up volunteer age data 

In [486]:
for i in range(len(total_vol)):
  total_vol[i] = total_vol[i].replace(',', '')
  youth_vol[i] = youth_vol[i].replace(',','')
  youth_vol[i] = youth_vol[i].replace('-','0')
  adult_vol[i] = adult_vol[i].replace(',','')
  adult_vol[i] = adult_vol[i].replace('-','0')
  older_vol[i] = older_vol[i].replace(',','')
  older_vol[i] = older_vol[i].replace('-','0')

Lists

In [487]:
total_vol = [float(i) for i in total_vol]
youth_vol = [float(i) for i in youth_vol]
adult_vol = [float(i) for i in adult_vol]
older_vol = [float(i) for i in older_vol]

organizationZero =[]

Delete all zero entries from lists

Keep track of totall zero entries

In [488]:
total_Zeros = 0

for i in range(len(total_vol)-1, -1, -1):     # iterate backwards
  if youth_vol[i] == 0 and adult_vol[i] == 0 and older_vol[i] == 0: 

    total_Zeros +=1
    organizationZero.append(organization[i])
    

    del(total_vol[i])
    del(youth_vol[i])
    del(adult_vol[i])
    del(older_vol[i])
    del(organization[i])

Correct given age data

In [489]:
dictIn = {"Total volunteers" : total_vol, "Youth volunteers": youth_vol, "Adult volunteers" : adult_vol, "Older volunteers" : older_vol}

In [490]:
df2 = pd.DataFrame(dictIn)
df2_vols = df2.loc[:,df2.columns != 'Total volunteers']

Get max volunteers in age groups for each organization 

In [491]:
maxList = []
maxList = re.findall('[A-Z][^A-Z]*',df2_vols.idxmax(axis=1).sum())

In [492]:
for i in maxList:
  if (i == 'Youth volunteers'):
    maxList[maxList.index(i)] = 0
  
  elif(i == 'Adult volunteers'):
   maxList[maxList.index(i)] = 1

  elif(i == 'Older volunteers'):
    maxList[maxList.index(i)] = 2


Generate Age data for Zero entries with respect to age demographics in NYC

In [493]:
from numpy.random import choice

ageList = [0,1,2]

generatedAgeArray = choice(ageList,total_Zeros, p=[0.10,0.78,0.12]) 

Join given and generated data  

In [494]:
for i in generatedAgeArray:
  maxList.append(generatedAgeArray[i])


In [495]:
organizationList = []
for i in organization:
  organizationList.append(str(i))

for i in organizationZero:
  organizationList.append(i)

In [496]:
dictAge = {"Organization/ City Agency/ Division Name" : organizationList, "Most volunteers in age group" : maxList}

In [497]:
df_finalAge = pd.DataFrame(dictAge)

Final age dataset 

In [498]:
df_finalAge

Unnamed: 0,Organization/ City Agency/ Division Name,Most volunteers in age group
0,"20/20 Vision for Schools, Inc. d/b/a Thrive Co...",1
1,826NYC,1
2,9/11 Memorial & Museum,0
3,A Free Bird,1
4,Abbott House,1
...,...,...
539,Behind the Book,1
540,Avenues: The World School,1
541,Association to Benefit Children,1
542,AHRCNYC,1


### Process interests into categories
Author: Rachel

In [499]:
# open files
areas_file = open('/content/interest_areas.txt', 'r')   # list of all interests w/ category designations
cat_file = open('/content/interest_categories.txt', 'r')    # list of all categories w/ numbers

####Process interest areas list

In [500]:
# for areas file
lines = areas_file.read()
interests_list = lines.split('\n')

In [501]:
for i in range(len(interests_list)):
  interests_list[i] = interests_list[i].split('=')
  interests_list[i][1] = interests_list[i][1].split(',')

interests_list

[['Arts and culture', ['7', '9', '5']],
 ['Children and youth related', ['10', '9']],
 ['Education and literacy', ['9', '5']],
 ['Memorial', ['2']],
 ['Employment and workforce development', ['2', '9']],
 ['Health and medicine', ['1']],
 ['Immigrants and/or refugees', ['3']],
 ['LGBTQIA+', ['3', '9']],
 ['People with disabilities', ['1', '2']],
 ['Advocacy and organizing', ['7', '3']],
 ['Civic engagement (voting, voter registration)', ['2', '3']],
 ['Human services', ['2']],
 ['Senior Services', ['2', '1']],
 ['Mental Health Services', ['1', '2']],
 ['Emergency management and disaster response', ['9']],
 ['Justice-related', ['3']],
 ['Legal services', ['3']],
 ['Womens issues', ['2', '3']],
 ['Environment and sustainability', ['6']],
 ['Homeless and shelter', ['2', '3']],
 ['Human rights', ['3']],
 ['Care Management', ['1', '2']],
 ['Animals', ['3']],
 ['Faith-based', ['9']],
 ['Emergency Food Program/Counseling', ['9', '2']],
 ['Introducing travelers to New York City', ['9']],
 ['San

In [502]:
# create list with all interests for each category
categories_list = []

for i in range(10):
  temp_list = []
  for j in range(len(interests_list)):
    if str(i+1) in interests_list[j][1]:
      temp_list.append(interests_list[j][0])
  categories_list.append(temp_list)

categories_list

[['Health and medicine',
  'People with disabilities',
  'Senior Services',
  'Mental Health Services',
  'Care Management',
  'Senior Services',
  'Wellness And Fitness',
  'HIV+ Individuals',
  'Home Care',
  'Healthy Eating & Exerciseg',
  'providing medically tailored meals to people living with serious illnesses',
  'Emergency food',
  'Senior Services',
  'telephone reassurance',
  'nutrition education',
  'Mental Health',
  'mental health',
  'Public Health',
  'Recreation and Mental Health',
  'Sexual',
  'mental health',
  'substance misuse',
  'relationship building',
  'reflection',
  'Family Caregivers',
  'LGBTQIA+ affirming peer based mental health support',
  'Behavioral Health / Substance Abuse',
  'Strengthening isolated families',
  'Victim Services',
  'Holocaust Survivors',
  'Caregiver services',
  'HOLISTIC WELLNESS',
  'Support active and retired military',
  'Community Based Mental Health',
  'Health & Wellness'],
 ['Memorial',
  'Employment and workforce develo

####Process category file

In [503]:
lines = cat_file.read()
short_cat_list = lines.split('\n')

for i in range(len(short_cat_list)):
  short_cat_list[i] = short_cat_list[i].split('*')

short_cat_list

[['1', 'Health services'],
 ['2', 'Social services'],
 ['3', 'Advocacy and Justice'],
 ['4', 'Food and Agriculture'],
 ['5', 'Education'],
 ['6', 'Environment and Climate change'],
 ['7', 'Recreation and Leisure'],
 ['8', 'Technology and Innovation'],
 ['9', 'Outreach and Advertising'],
 ['10', 'Child services']]

####Modify dataset

In [504]:
org_df = pd.read_csv('/content/org_dataset.csv')

In [505]:
interest_areas = org_df['Interest Areas']
cat_col = []

for i in range(len(interest_areas)):
  temp_list = []
  interests = ''
  for entry in categories_list:   # go through each entry in the interests listed by category
    for item in entry:    # go through every interest area in each category
      if item in repr(interest_areas[i]):
        temp_list.append(short_cat_list[categories_list.index(entry)][1])

  # convert to dictionary and back to remove duplicates     
  temp_dict = dict.fromkeys(temp_list)
  temp_list = temp_dict.keys()
  
  # put into string separated by commas
  for interest in temp_list:
    interests = interests + interest + ';'
  cat_col.append(interests.rstrip(';'))

# list type
cat_col

['Education;Recreation and Leisure;Outreach and Advertising;Child services',
 'Education;Outreach and Advertising;Child services',
 'Social services;Education;Recreation and Leisure;Outreach and Advertising',
 'Education;Recreation and Leisure;Outreach and Advertising;Child services',
 'Health services;Social services;Advocacy and Justice;Outreach and Advertising;Child services',
 'Health services;Social services;Advocacy and Justice;Education;Recreation and Leisure;Outreach and Advertising',
 'Health services;Social services;Advocacy and Justice;Education;Outreach and Advertising;Child services',
 'Social services;Advocacy and Justice;Education;Recreation and Leisure;Outreach and Advertising;Child services',
 'Health services;Social services;Advocacy and Justice;Education;Recreation and Leisure;Outreach and Advertising;Child services',
 'Social services;Advocacy and Justice;Environment and Climate change;Recreation and Leisure;Outreach and Advertising;Child services',
 'Health service

In [506]:
org_df['Interest Categories'] = cat_col
org_df.head()

Unnamed: 0,Organization/ City Agency/ Division Name,Street Address/Mailing Address,City,State,Postcode,Year Surveyed,Total Vounteers,Youth volunteers,Adult Volunteers,Older adult Volunteers,...,Borough,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA,Interest Categories
0,"20/20 Vision for Schools, Inc. d/b/a Thrive Co...",8225 5th Avenue #323,Brooklyn,NY,11209,2019,500,200,260,40,...,BROOKLYN,40.624082,-74.024838,310.0,43.0,142.0,3152153.0,3060090000.0,Bay Ridge,Education;Recreation and Leisure;Outreach and ...
1,826NYC,372 Fifth Ave,Brooklyn,NY,11215,2019,803,0,759,44,...,BROOKLYN,40.671365,-73.984492,306.0,39.0,137.0,3021351.0,3009870000.0,Park Slope-Gowanus,Education;Outreach and Advertising;Child services
2,9/11 Memorial & Museum,"200 Liberty St, 16th floor",New York,NY,10281,2019,774,38,0,0,...,MANHATTAN,40.711425,-74.015442,101.0,1.0,31704.0,1000057.0,1000160000.0,Battery Park City-Lower Manhattan,Social services;Education;Recreation and Leisu...
3,A Free Bird,146 Smith Street,Brooklyn,NY,11201,2019,58,0,58,0,...,BROOKLYN,40.686827,-73.990412,302.0,33.0,43.0,3001229.0,3001930000.0,DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill,Education;Recreation and Leisure;Outreach and ...
4,Abbott House,1775 Grandconcourse,The Bronx,NY,10453,2019,2,0,2,0,...,BRONX,40.846089,-73.909879,205.0,14.0,22701.0,2007858.0,2028228000.0,Mount Hope,Health services;Social services;Advocacy and J...


In [507]:
areas_file.close()
cat_file.close()

### Combining dataframes for final organization dataset
Authors: Sanjana and Rachel

#### Merge dataframes

In [508]:
df4 = pd.merge(org_df,df_finalAge, on="Organization/ City Agency/ Division Name")

df4.head()

Unnamed: 0,Organization/ City Agency/ Division Name,Street Address/Mailing Address,City,State,Postcode,Year Surveyed,Total Vounteers,Youth volunteers,Adult Volunteers,Older adult Volunteers,...,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA,Interest Categories,Most volunteers in age group
0,"20/20 Vision for Schools, Inc. d/b/a Thrive Co...",8225 5th Avenue #323,Brooklyn,NY,11209,2019,500,200,260,40,...,40.624082,-74.024838,310.0,43.0,142.0,3152153.0,3060090000.0,Bay Ridge,Education;Recreation and Leisure;Outreach and ...,1
1,826NYC,372 Fifth Ave,Brooklyn,NY,11215,2019,803,0,759,44,...,40.671365,-73.984492,306.0,39.0,137.0,3021351.0,3009870000.0,Park Slope-Gowanus,Education;Outreach and Advertising;Child services,1
2,9/11 Memorial & Museum,"200 Liberty St, 16th floor",New York,NY,10281,2019,774,38,0,0,...,40.711425,-74.015442,101.0,1.0,31704.0,1000057.0,1000160000.0,Battery Park City-Lower Manhattan,Social services;Education;Recreation and Leisu...,0
3,A Free Bird,146 Smith Street,Brooklyn,NY,11201,2019,58,0,58,0,...,40.686827,-73.990412,302.0,33.0,43.0,3001229.0,3001930000.0,DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill,Education;Recreation and Leisure;Outreach and ...,1
4,Abbott House,1775 Grandconcourse,The Bronx,NY,10453,2019,2,0,2,0,...,40.846089,-73.909879,205.0,14.0,22701.0,2007858.0,2028228000.0,Mount Hope,Health services;Social services;Advocacy and J...,1


In [509]:
df4.columns

Index(['Organization/ City Agency/ Division Name',
       'Street Address/Mailing Address', 'City', 'State', 'Postcode',
       'Year Surveyed', 'Total Vounteers', 'Youth volunteers',
       'Adult Volunteers', 'Older adult Volunteers', 'Organization Type',
       'Interest Areas', 'Special Populations Served', 'Boroughs  Served',
       'Borough', 'Latitude', 'Longitude', 'Community Board',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA',
       'Interest Categories', 'Most volunteers in age group'],
      dtype='object')

In [510]:
df4.drop(['Street Address/Mailing Address', 'City', 'State', 'Postcode',
       'Year Surveyed', 'Total Vounteers', 'Youth volunteers',
       'Adult Volunteers', 'Older adult Volunteers', 'Interest Areas', 'Special Populations Served', 'Boroughs  Served',
       'Latitude', 'Longitude', 'Community Board',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA'], axis=1, inplace=True)

Note: column 'Organization Type" could be useful for displaying in front-end

In [511]:
final_df = df4.loc[:,df4.columns != 'Organization Type']

####Handling missing values

In [512]:
final_df.isna().sum()

Organization/ City Agency/ Division Name     0
Borough                                     44
Interest Categories                          0
Most volunteers in age group                 0
dtype: int64

In [513]:
borough = copy.deepcopy(final_df['Borough'])

Boroughs of missing values with respect to locations of organizations 

In [514]:
generatedBoroughList = [ "MANHATTAN", "BROOKLYN", "QUEENS", "MANHATTAN", "BRONX", "BROOKLYN", "BROOKLYN", "BRONX", "QUEENS", "BROOKLYN", "QUEENS", "MANHATTAN","MANHATTAN","MANHATTAN","MANHATTAN","MANHATTAN","MANHATTAN","BRONX", "QUEENS", "QUEENS", "BROOKLYN", "MANHATTAN","BRONX","MANHATTAN","MANHATTAN","MANHATTAN","MANHATTAN","MANHATTAN","BRONX", "QUEENS", "MANHATTAN", "BROOKLYN", "MANHATTAN","MANHATTAN", "QUEENS", "QUEENS","QUEENS","QUEENS","BRONX","MANHATTAN","MANHATTAN", "MANHATTAN","QUEENS", "MANHATTAN"]
                        

In [515]:
naNIndex = []
for i in range(len(borough)):
  if not (isinstance(borough[i],str)):
    naNIndex.append(i)


Insert researched boroughs into borough list

In [516]:
count = 0
for i in naNIndex:
  if count < 44:
    borough[i] = generatedBoroughList[count]
    count +=1

In [517]:
boroughList = []
for i in range(len(borough)):
    boroughList.append(borough[i])

Remove previous Borough column with missing values

In [518]:
final_df.drop('Borough', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Insert new Borough column 

In [519]:
final_df.insert(1,'Borough',boroughList)

Check

In [520]:
final_df.isna().sum()

Organization/ City Agency/ Division Name    0
Borough                                     0
Interest Categories                         0
Most volunteers in age group                0
dtype: int64

Final pre encoded organization dataset

In [521]:
final_df

Unnamed: 0,Organization/ City Agency/ Division Name,Borough,Interest Categories,Most volunteers in age group
0,"20/20 Vision for Schools, Inc. d/b/a Thrive Co...",BROOKLYN,Education;Recreation and Leisure;Outreach and ...,1
1,826NYC,BROOKLYN,Education;Outreach and Advertising;Child services,1
2,9/11 Memorial & Museum,MANHATTAN,Social services;Education;Recreation and Leisu...,0
3,A Free Bird,BROOKLYN,Education;Recreation and Leisure;Outreach and ...,1
4,Abbott House,BRONX,Health services;Social services;Advocacy and J...,1
...,...,...,...,...
541,Young Israel Senior Services of Midwood,BROOKLYN,Health services;Social services,2
542,"Young Men Strong, LLC",QUEENS,Social services;Advocacy and Justice;Education...,1
543,Youth Action YouthBuild East Harlem,MANHATTAN,Social services;Education;Outreach and Adverti...,1
544,"Youth, Inc.",MANHATTAN,Outreach and Advertising;Child services,1


#### Save to CSV file

In [522]:
#final_df.to_csv("/content/Preencoding_final_dataframe.csv")

In [523]:
#final_df = pd.read_csv("/content/Preencoding_final_dataframe.csv")

####One-hot Encoding

Encoding boroughs

In [524]:
one_hotBoroughs = pd.get_dummies(final_df['Borough'])

In [525]:
final_df = final_df.join(one_hotBoroughs)

In [526]:
final_df = final_df.loc[:,final_df.columns != 'Unnamed']

Encoding interests

In [527]:
interests = final_df['Interest Categories']

zero_column = []
for i in range(len(interests)):
  zero_column.append(0)

one_hot_list = []
for i in range(10):
  one_hot_list.append(copy.deepcopy(zero_column))

categories = []
for item in short_cat_list:
  categories.append(item[1])


In [528]:
for i in range(len(interests)):
  interests_list = interests[i].split(';')
  for item in interests_list:
    if item != '':
      one_hot_list[categories.index(item)][i] = 1

interests_dict = {'Health services': one_hot_list[0],
                  'Social services': one_hot_list[1],
                  'Advocacy and Justice': one_hot_list[2],
                  'Food and Agriculture': one_hot_list[3],
                  'Education': one_hot_list[4],
                  'Environment and Climate change': one_hot_list[5],
                  'Recreation and Leisure': one_hot_list[6],
                  'Technology and Innovation': one_hot_list[7],
                  'Outreach and Advertising': one_hot_list[8],
                  'Child services': one_hot_list[9]}

In [529]:
interests_df = pd.DataFrame.from_dict(interests_dict) 

In [530]:
interests_df['Organization/ City Agency/ Division Name'] = final_df['Organization/ City Agency/ Division Name']

In [531]:
encoded_df = pd.merge(interests_df, final_df, on='Organization/ City Agency/ Division Name')

In [532]:
encoded_df

Unnamed: 0,Health services,Social services,Advocacy and Justice,Food and Agriculture,Education,Environment and Climate change,Recreation and Leisure,Technology and Innovation,Outreach and Advertising,Child services,Organization/ City Agency/ Division Name,Borough,Interest Categories,Most volunteers in age group,BRONX,BROOKLYN,MANHATTAN,QUEENS,STATEN IS
0,0,0,0,0,1,0,1,0,1,1,"20/20 Vision for Schools, Inc. d/b/a Thrive Co...",BROOKLYN,Education;Recreation and Leisure;Outreach and ...,1,0,1,0,0,0
1,0,0,0,0,1,0,0,0,1,1,826NYC,BROOKLYN,Education;Outreach and Advertising;Child services,1,0,1,0,0,0
2,0,1,0,0,1,0,1,0,1,0,9/11 Memorial & Museum,MANHATTAN,Social services;Education;Recreation and Leisu...,0,0,0,1,0,0
3,0,0,0,0,1,0,1,0,1,1,A Free Bird,BROOKLYN,Education;Recreation and Leisure;Outreach and ...,1,0,1,0,0,0
4,1,1,1,0,0,0,0,0,1,1,Abbott House,BRONX,Health services;Social services;Advocacy and J...,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553,1,1,0,0,0,0,0,0,0,0,Young Israel Senior Services of Midwood,BROOKLYN,Health services;Social services,2,0,1,0,0,0
554,0,1,1,0,1,0,1,0,1,1,"Young Men Strong, LLC",QUEENS,Social services;Advocacy and Justice;Education...,1,0,0,0,1,0
555,0,1,0,0,1,0,0,0,1,0,Youth Action YouthBuild East Harlem,MANHATTAN,Social services;Education;Outreach and Adverti...,1,0,0,1,0,0
556,0,0,0,0,0,0,0,0,1,1,"Youth, Inc.",MANHATTAN,Outreach and Advertising;Child services,1,0,0,1,0,0


Save encoded dataframe to CSV file

In [533]:
#encoded_df.to_csv('/content/encoded_org_final_dataset.csv')