# Analysis of Student Data for Union College Math Majors

## Part 5a - Geographic data

* Converting student hometown addresses (city and state) to latitude and longitude coordinates for mapping in Part 5b

### Set up the notebook environment

In [1]:
# Imports
import geocoder as geo
import pandas as pd
import time

In [2]:
# Settings
#   Use the fully-qualified option names: in newer pandas releases the bare
#   'max_rows' / 'max_columns' patterns also match the styler.render.*
#   options and raise an OptionError for being ambiguous.
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 50)

### Geocoding 

In [3]:
# Load student info data from excel file
#   Only columns 0, 13 and 14 of the 'info' sheet are read; the cells below
#   refer to them as id, home_city and home_state (presumably their header
#   names in the workbook -- verify if the sheet layout changes).
info = pd.read_excel('data/union_students_818_cleaned.xlsx',
                     sheet_name='info',
                     usecols=[0, 13, 14]
                    )

# Format for geocoding
#   Replace Na with empty strings (assignment instead of inplace=True so the
#   cell does not mutate a frame that other names may still point at)
info = info.fillna('')
#   Exclude numbers from the end of home_city
#   regex=True must be explicit: since pandas 2.0 str.replace treats the
#   pattern as a literal string by default, which would leave the digits in.
info['home_city'] = info['home_city'].str.replace('[0-9]+$', '', regex=True)

In [4]:
# Reuse coordinates saved by earlier runs so they are not requested again
try:
    known = pd.read_csv('data/map_data.csv')
except FileNotFoundError:
    # No cache file yet: start from an empty table with the expected columns
    known = pd.DataFrame(columns=['id', 'lat', 'lng'])

# Left join keeps every student; ids without cached coordinates get NaN
info = pd.merge(info, known, how='left', on='id')

In [5]:
# Students still missing at least one coordinate need to be geocoded
unknowns = info[info[['lat', 'lng']].isnull().any(axis=1)]

In [6]:
# Get latitude and longitude for home city and state by student id

# Pause (in seconds) before each geocoding request. The public OpenStreetMap
# Nominatim service allows at most one request per second.
GEOCODE_DELAY = 1.0

# Successful lookups from this session, keyed by the address string, so that
# students sharing a home town only cost one request.
_geocode_cache = {}


def home_latlng(row):
    ''' Returns a copy of row with lat and lng filled in by geocoding the
        row's home_city and home_state.

        row -- a data frame row (Series) with home_city and home_state entries

        The lookup is skipped when both fields are blank, in which case the
        row is returned with lat and lng untouched. Addresses already resolved
        in this session are answered from a cache without a new request.
        When the geocoder finds nothing, lat and lng are set to None. '''

    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # not guaranteed to be safe.
    row = row.copy()
    address = ' '.join([str(row.home_city), str(row.home_state)])

    # Nothing to look up for students without any address information
    if not address.strip():
        return row

    if address in _geocode_cache:
        lat, lng = _geocode_cache[address]
    else:
        time.sleep(GEOCODE_DELAY)
        g = geo.osm(address)
        lat, lng = g.lat, g.lng
        # Only remember hits, so a failed lookup is retried on the next run
        if lat is not None and lng is not None:
            _geocode_cache[address] = (lat, lng)

    row['lat'] = lat
    row['lng'] = lng
    return row


# On an empty frame, apply() would still call home_latlng once with a dummy
# all-NaN row (a pointless "nan nan" request), so skip it entirely.
if unknowns.empty:
    new_codes = unknowns.copy()
else:
    new_codes = unknowns.apply(home_latlng, axis=1)

# update() aligns on the index and only copies non-null values into info
info.update(new_codes)

In [7]:
# Show the students whose address could not be resolved (empty if all found)
info[info[['lat', 'lng']].isnull().any(axis=1)]


Unnamed: 0,id,home_city,home_state,lat,lng


In [8]:
# Write the id / coordinate table to the csv file that the
# "previously identified coordinates" cell reads back on the next run
info.to_csv('data/map_data.csv', columns=['id', 'lat', 'lng'], index=False)