## Generate UFO dataset 

### This notebook takes the raw UFO sighting data from https://github.com/planetsig/ufo-reports and cleans it up. It then exports the cleaned data to a JSON file to be used by the UFO app.

### The data was already scrubbed and clean to begin with; this notebook further organises it to best fit this specific project.

In [59]:
import pandas as pd
import pymongo


In [37]:
# Load the raw sightings file. Every column is read as text (dtype=str) so the
# values are carried through exactly as they appear in the CSV, and
# index_col=False stops pandas from using the first column as the index.
# on_bad_lines="warn" skips malformed rows while reporting them; it replaces
# error_bad_lines=False, which was deprecated in pandas 1.3 and removed in 2.0.
df = pd.read_csv(
    "data/scrubbed.csv",
    sep=",",
    index_col=False,
    dtype=str,
    on_bad_lines="warn",
)

In [38]:
# Preview the first five raw rows
df.head(n=5)

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.9411111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.6458333
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.8036111


In [39]:
# Normalise text casing: title case for city and shape names,
# upper case for the state and country codes.

for title_cased in ("city", "shape"):
    df[title_cased] = df[title_cased].str.upper().str.title()

for upper_cased in ("state", "country"):
    df[upper_cased] = df[upper_cased].str.upper()

In [40]:
# Preview the first five rows again to check the new casing
df.head(n=5)

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,San Marcos,TX,US,Cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.9411111
1,10/10/1949 21:00,Lackland Afb,TX,,Light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,Chester (Uk/England),,GB,Circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,Edna,TX,US,Circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.6458333
4,10/10/1960 20:00,Kaneohe,HI,US,Light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.8036111


In [41]:
# Break "datetime" apart at its whitespace into separate "date" and "time" columns
date_time_parts = df["datetime"].str.split(expand=True)
df[["date", "time"]] = date_time_parts

In [42]:
# Discard the free-text duration and the posting date
unused_columns = ['duration (hours/min)', 'date posted']
df = df.drop(unused_columns, axis="columns")

In [55]:
# Keep only the sightings whose country code is "US".
# Rows with no country value are dropped as well, even if their state is a US state.
is_us = df["country"].eq("US")
df = df[is_us]

In [56]:
# Arrange into a list of dictionaries, one per sighting.
# Missing cells come back from pandas as float NaN. json.dump would write those
# as the bare token NaN, which is not valid JSON and is rejected by JavaScript's
# JSON.parse, so swap them for None; they are then exported as null.
df_dict = [
    {column: (None if pd.isna(value) else value) for column, value in record.items()}
    for record in df.to_dict('records')
]

In [57]:
# Write the records to disk as JSON, to be used by the javascript script
import json

with open('data/ufo_data.json', mode='w') as json_file:
    json.dump(obj=df_dict, fp=json_file)

In [66]:
# Show the first five records
df_dict[:5]

[{'datetime': '10/10/1949 20:30',
  'city': 'San Marcos',
  'state': 'TX',
  'country': 'US',
  'shape': 'Cylinder',
  'duration (seconds)': '2700',
  'comments': 'This event took place in early fall around 1949-50. It occurred after a Boy Scout meeting in the Baptist Church. The Baptist Church sit',
  'latitude': '29.8830556',
  'longitude ': '-97.9411111',
  'date': '10/10/1949',
  'time': '20:30',
  '_id': ObjectId('5d896c27e6d7c94cdb8a36bc')},
 {'datetime': '10/10/1956 21:00',
  'city': 'Edna',
  'state': 'TX',
  'country': 'US',
  'shape': 'Circle',
  'duration (seconds)': '20',
  'comments': 'My older brother and twin sister were leaving the only Edna theater at about 9 PM&#44...we had our bikes and I took a different route home',
  'latitude': '28.9783333',
  'longitude ': '-96.6458333',
  'date': '10/10/1956',
  'time': '21:00',
  '_id': ObjectId('5d896c27e6d7c94cdb8a36bd')},
 {'datetime': '10/10/1960 20:00',
  'city': 'Kaneohe',
  'state': 'HI',
  'country': 'US',
  'shape': '

In [67]:
# Connect to the MongoDB server on localhost, port 27017
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(host=conn)

In [68]:
# Select the "ufo" database and drop any existing "ufo_data" collection,
# so the data is loaded into an empty collection
db = client["ufo"]
db["ufo_data"].drop()

In [69]:
# Bulk-insert every record into the "ufo_data" collection.
# Note: insert_many adds an "_id" key to each dict in df_dict in place.
db["ufo_data"].insert_many(df_dict)

<pymongo.results.InsertManyResult at 0x12ca6a048>

In [70]:
from pprint import pprint

In [71]:
# Fetch the first 10 documents from the UFO mongo collection and pretty-print them
data = list(db.ufo_data.find(limit=10))
pprint(data)

[{'_id': ObjectId('5d896c27e6d7c94cdb8a36bc'),
  'city': 'San Marcos',
  'comments': 'This event took place in early fall around 1949-50. It occurred '
              'after a Boy Scout meeting in the Baptist Church. The Baptist '
              'Church sit',
  'country': 'US',
  'date': '10/10/1949',
  'datetime': '10/10/1949 20:30',
  'duration (seconds)': '2700',
  'latitude': '29.8830556',
  'longitude ': '-97.9411111',
  'shape': 'Cylinder',
  'state': 'TX',
  'time': '20:30'},
 {'_id': ObjectId('5d896c27e6d7c94cdb8a36bd'),
  'city': 'Edna',
  'comments': 'My older brother and twin sister were leaving the only Edna '
              'theater at about 9 PM&#44...we had our bikes and I took a '
              'different route home',
  'country': 'US',
  'date': '10/10/1956',
  'datetime': '10/10/1956 21:00',
  'duration (seconds)': '20',
  'latitude': '28.9783333',
  'longitude ': '-96.6458333',
  'shape': 'Circle',
  'state': 'TX',
  'time': '21:00'},
 {'_id': ObjectId('5d896c27e6d7c94c

In [85]:
# Count the documents in the collection and verify that the count matches the
# number of rows in the pandas DataFrame, failing loudly if it does not.
document_count = db.ufo_data.count_documents({})
assert document_count == len(df), (
    f"Mongo holds {document_count} documents but the DataFrame has {len(df)} rows"
)
document_count

65114

In [76]:
# List the collections that exist in the "ufo" database
db.list_collection_names()

['ufo_data']