Permalink
Browse files

feat: Add new structure and save file for later purpose

- Branch-files are stored in subfolder for better handling
- Master file is saved at end of notebook
  • Loading branch information...
andirs committed Oct 26, 2017
1 parent 31ce2c9 commit 41fa1e2bf836d7137f5578088cde3d39def163c1
@@ -21,21 +21,30 @@
"pd.set_option(\"display.max_columns\", 100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading Master Dataframe"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 46,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"DATA_URL = 'data/'\n",
"FILE_NAME = 'masterdf_20170920.csv'"
"# I renamed the Google Drive folder to /data/ in my repo; access will change once we move to a database\n",
"DATA_URL = '../data/' \n",
"FILE_NAME = 'masterdf_20170920.csv'\n",
"NEW_FILE_NAME = 'masterdf_20171026_andirs.csv'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 8,
"metadata": {
"collapsed": false
},
@@ -46,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 9,
"metadata": {
"collapsed": false
},
@@ -340,7 +349,7 @@
"4 0.0 "
]
},
"execution_count": 4,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -351,7 +360,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 10,
"metadata": {
"collapsed": false
},
@@ -362,7 +371,7 @@
"195308"
]
},
"execution_count": 5,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -391,7 +400,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 11,
"metadata": {
"collapsed": true
},
@@ -404,7 +413,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 12,
"metadata": {
"collapsed": true
},
@@ -428,7 +437,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 13,
"metadata": {
"collapsed": false
},
@@ -439,7 +448,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 14,
"metadata": {
"collapsed": false
},
@@ -454,7 +463,7 @@
" '1301 TURK, San Francisco']"
]
},
"execution_count": 9,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -472,7 +481,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 15,
"metadata": {
"collapsed": false
},
@@ -483,7 +492,7 @@
"45"
]
},
"execution_count": 10,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -502,7 +511,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 16,
"metadata": {
"collapsed": false
},
@@ -570,31 +579,23 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"refetch = False # indicates whether location data should be fetched or stored should be used"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"45 addresses in the queue (Iteration 1)\n",
"23 addresses in the queue (Iteration 2)\n",
"12 addresses in the queue (Iteration 3)\n",
"7 addresses in the queue (Iteration 4)\n",
"4 addresses in the queue (Iteration 5)\n",
"2 addresses in the queue (Iteration 6)\n",
"2 addresses in the queue (Iteration 7)\n",
"2 addresses in the queue (Iteration 8)\n",
"2 addresses in the queue (Iteration 9)\n",
"2 addresses in the queue (Iteration 10)\n",
"Termination: 2 addresses couldn't be found\n"
]
}
],
"outputs": [],
"source": [
"refetch = False\n",
"if refetch:\n",
" geopy_address_dict = fetch_address_info(fire_station_adds, service='geopy', verbose=False)"
]
@@ -608,33 +609,19 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"45 addresses in the queue (Iteration 1)\n",
"24 addresses in the queue (Iteration 2)\n",
"13 addresses in the queue (Iteration 3)\n",
"6 addresses in the queue (Iteration 4)\n",
"3 addresses in the queue (Iteration 5)\n",
"2 addresses in the queue (Iteration 6)\n",
"1 addresses in the queue (Iteration 7)\n"
]
}
],
"outputs": [],
"source": [
"if refetch:\n",
" google_address_dict = fetch_address_info(fire_station_adds, service='google', verbose=False, timer=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"metadata": {
"collapsed": true
},
@@ -664,7 +651,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 21,
"metadata": {
"collapsed": false
},
@@ -680,7 +667,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 22,
"metadata": {
"collapsed": false
},
@@ -735,7 +722,7 @@
" '935 FOLSOM, San Francisco': (37.7794237, -122.4041048)}"
]
},
"execution_count": 16,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@@ -746,25 +733,31 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"addytest = get_lat_long(google_address_dict) # additional step for google results to get latitude and longitude values"
"# additional step for google results to get latitude and longitude values\n",
"google_address_dict = get_lat_long(google_address_dict)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate Haversine Distance"
"### Calculate Haversine Distance\n",
"In the next step we'll use a simple distance formula to calculate the distance between the retrieved points. Since we need to compute the distance from every point to every fire station, this takes a few moments to complete.\n",
"\n",
"> \"The haversine [distance] determines the great-circle distance between two points on a sphere given their longitudes and latitudes.\" \n",
"\n",
"Note that this is not the driving distance; that will be computed using an API offered by one of the big navigation providers."
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 26,
"metadata": {
"collapsed": true
},
@@ -792,43 +785,37 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"pos1 = addytest[addytest.keys()[1]]\n",
"pos2 = addytest[addytest.keys()[2]]"
"from ast import literal_eval\n",
"def hav_all(row):\n",
" pos1 = literal_eval(row) # make sure the entry is a well formed tuple\n",
" min_distance = 12742.0 # diameter of earth in km as maximum distance\n",
" for fire_station in google_address_dict:\n",
" distance = haversine(pos1, google_address_dict[fire_station])\n",
" if distance < min_distance:\n",
" min_distance = distance\n",
" return min_distance"
]
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 44,
"metadata": {
"collapsed": false
"collapsed": true
},
"outputs": [],
"source": [
"def hav_all(row):\n",
" first_second = row.split(\",\")\n",
" first = float(first_second[0].split('(')[1])\n",
" second = float(first_second[1].split(')')[0])\n",
" pos1 = (first, second)\n",
" \n",
" min_distance = 12742.0 # diameter of earth in km\n",
" for fire_station in addytest:\n",
" distance = haversine(pos1, addytest[fire_station])\n",
" if distance < min_distance:\n",
" min_distance = distance\n",
" return min_distance\n",
" \n",
"df[\"distance_next_fire_dpt_hav\"] = df[\"Location_y\"].apply(hav_all)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 45,
"metadata": {
"collapsed": false
},
@@ -847,7 +834,7 @@
"Name: distance_next_fire_dpt_hav, dtype: float64"
]
},
"execution_count": 25,
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
@@ -856,6 +843,17 @@
"df[\"distance_next_fire_dpt_hav\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.to_csv(os.path.join(DATA_URL, NEW_FILE_NAME))"
]
},
{
"cell_type": "markdown",
"metadata": {

0 comments on commit 41fa1e2

Please sign in to comment.