## Looking at the World UFO sightings
I was keen to look at a data set that had

In [None]:
#For Go charts
!pip install chart_studio

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

import geopandas as gpd # not used?

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import folium
from folium import Choropleth
from folium.plugins import HeatMap

#plotly graphing
# plotly
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

Loading the dataset

In [None]:
# Two columns hold mixed-type values although they should be numeric.
# Reading them as strings here keeps the parser from emitting dtype warnings.
csv_path = '/kaggle/input/ufo-sightings-around-the-world/ufo_sighting_data.csv'
string_columns = {"length_of_encounter_seconds": "string", "latitude": "string"}
df = pd.read_csv(csv_path, dtype=string_columns)

print('World UFO sightings \n')
print(df.columns.values)
df.head()

The dataset contains the reported UFO sightings across the world.
Each record contains the time of the sighting, the state, the country, the duration of the event and a description.  The records are also categorised by the shape of the object sighted.

### Data cleanup
The latitude and sighting duration columns are converted to numeric values, as there are instances where the values are not numeric.

In [None]:
# Coerce the two string-typed columns to numbers; values that cannot be parsed become NaN.
for numeric_column in ("latitude", "length_of_encounter_seconds"):
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors="coerce")

# Discard every row that has a missing value in any column.
df.dropna(axis=0, inplace=True)

# Display floats with thousands separators and two decimals.
pd.options.display.float_format = '{:20,.2f}'.format
df.describe()

An initial analysis of the numerical data shows some of the sightings to be very long indeed, the longest being 3 years.  We can see the duration has a very large spread and 75% of the durations are less than 600 seconds.

## Dates of the sightings
To start the date analysis we need to convert the date values to date objects.  We also need to correct a few values, e.g. the time 24:00, which we will take as 00:00.

In [None]:
# Fix dates
# Split "Date_time" on whitespace into its date part and its time part.
date_time_parts = df['Date_time'].str.split(expand=True)
date_time_parts.columns = ['date', 'time']

# The time is added as a timedelta instead of being parsed as a clock time,
# so "24:00" is accepted: it becomes a 24-hour offset from midnight of the date.
calendar_day = pd.to_datetime(date_time_parts['date'], format='%m/%d/%Y')
time_offset = pd.to_timedelta(date_time_parts['time'] + ':00')
df['datetime'] = calendar_day + time_offset
df['year'] = df['datetime'].dt.year


## Shapes in the sky
What were the shapes recorded?  There is a variety of descriptions of the shapes.  The shapes are most frequently described as lights, circles or triangles.

In [None]:
# Number of sightings per reported shape.
# The rows are sorted by shape first, presumably so the bars appear in sorted order.
all_by_shape = df.sort_values("UFO_shape")
sns.catplot(data=all_by_shape, y="UFO_shape", kind="count",
            palette="pastel", edgecolor=".6")

We can probably group some of these descriptions together to simplify the categories.

In [None]:
# Collapse near-synonymous shape labels into a smaller set of categories.
# Each key is the broader category; its list holds the raw labels folded into it.
shape_groups = {
    'disk': ['circle', 'oval', 'round'],
    'cylinder': ['cigar', 'cone', 'crescent'],
    'ball': ['sphere', 'fireball'],
    'egg': ['teardrop'],
    'geometric': ['rectangle', 'chevron', 'triangle', 'diamond', 'cross',
                  'delta', 'hexagon', 'pyramid'],
    'changing': ['changed', 'formation'],
    'other': ['unknown'],
    'light': ['flash', 'flare'],
}

# Invert to a flat {raw label: category} lookup.
shape_lookup = {
    raw_label: category
    for category, raw_labels in shape_groups.items()
    for raw_label in raw_labels
}

# A single exact-match replacement; labels that are not in the lookup stay unchanged.
df['UFO_shape'] = df['UFO_shape'].replace(shape_lookup)

# Second name for the frame as it stands here; `df` is rebound to a
# duration-filtered subset later in the notebook while `df_full` is not.
df_full = df

Lights are the most common description, followed by disks and balls.

In [None]:
# Number of sightings per shape category after the grouping step.
grouped_by_shape = df.sort_values("UFO_shape")
sns.catplot(data=grouped_by_shape, y="UFO_shape", kind="count",
            palette="pastel", edgecolor=".6")

The earlier numerical analysis shows that the majority of sightings are shorter than 600 seconds.  To see a usable distribution we need to cut the tail off the duration; through a little bit of trial and error a 4000 second threshold was chosen, which classifies 2529 of the 65,000 sightings as anomalously long.

In [None]:
# Sightings lasting longer than 4000 seconds are set aside as anomalies.
is_long_sighting = df["length_of_encounter_seconds"] > 4000
df_anomaly = df[is_long_sighting]
df_anomaly.count()

The table below shows some of the descriptions of the anomalously long sightings.

In [None]:
# Peek at the first five of the anomalously long sightings.
df_anomaly.head(n=5)

In [None]:
# Continue with only the sightings that lasted at most 4000 seconds.
within_threshold = df["length_of_encounter_seconds"] <= 4000
df = df[within_threshold]
df.describe()

## Durations of the sighting in seconds

In [None]:
# Histogram of the sighting durations (in seconds) using 20 bins.
encounter_seconds = df["length_of_encounter_seconds"]
hist = encounter_seconds.hist(bins=20)

## Is there a difference between long and short sightings?
The spread of the shapes between the long sightings and the rest of the data is very similar, with light being the most common description.

In [None]:
# Shape counts for the sightings kept in `df` (the duration-filtered set).
short_by_shape = df.sort_values("UFO_shape")
sns.catplot(data=short_by_shape, y="UFO_shape", kind="count",
            palette="pastel", edgecolor=".6")

In [None]:
# Shape counts for the anomalously long sightings in `df_anomaly`.
long_by_shape = df_anomaly.sort_values("UFO_shape")
sns.catplot(data=long_by_shape, y="UFO_shape", kind="count",
            palette="pastel", edgecolor=".6")

## Sightings over the years
I struggled on this graph..  my tip to my future self, put the sighting in order.

I borrowed from Abigail Larion's notebook 
[UFO Reports in United States](https://www.kaggle.com/abigaillarion/ufo-reports-in-united-states)

There's another [great notebook by Jonathan Bouchet](https://www.kaggle.com/jonathanbouchet/e-t-phone-home-but-mostly-after-8-00pm) that points out where shows such as X-Files started airing to the world..  Of course it might be that the show is the result of lots of sightings..  or just possibly that the sightings are the result of the shows?

In [None]:
# UFO sightings per year
# Adapted from https://www.kaggle.com/abigaillarion/ufo-reports-in-united-states
df_full = df_full.sort_values(['year'])

# Take the years and their counts from one grouped result so the x and y values
# of the trace are guaranteed to line up (groupby returns the years in sorted order).
reports_by_year = df_full[df_full.year > 0].groupby('year').year.count()
ufo_years = reports_by_year.index.to_numpy()
# Explicit copy: the last element is modified in place below, which must not
# depend on whether pandas hands out a writable view.
ufo_peryear = reports_by_year.to_numpy(copy=True)

# UFO sightings in 2014 estimated, data published in June 2014
ufo_peryear[-1] = ufo_peryear[-1] * 3

trace = [go.Scatter(
         x = ufo_years,
         y = ufo_peryear,
         mode = 'lines',
         line = dict(
             color = 'rgb(0, 163, 81)',
             width = 3)
         )]

layout = go.Layout(
         # `df_full` is not restricted to one country, and the year range is
         # read from the data instead of being hard-coded.
         title = f'UFO Reports by Year Worldwide ({ufo_years[0]}-{ufo_years[-1]})',
         xaxis = dict(
             rangeslider = dict(thickness = 0.05),
             showline = True,
             showgrid = False
         ),
         yaxis = dict(
             # NOTE(review): fixed upper limit; confirm no yearly count exceeds it.
             range = [0, 7000],
             showline = True,
             showgrid = False)
         )

figure = dict(data = trace, layout = layout)
iplot(figure)

In [None]:
# US specific from - https://www.kaggle.com/abigaillarion/ufo-reports-in-united-states
# Two-letter codes for the 50 states plus DC, in alphabetical order of the code.
us_states = np.asarray(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
                        'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
                        'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
                        'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
                        'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'])

# UFO sightings in United States only
# Upper-case the state codes so they can be matched against `us_states`.
df_full['state/province'] = df_full['state/province'].str.upper()
ufo_data = df_full[df_full['state/province'].isin(us_states)].sort_values('year')

# Keep points north of latitude 15 and west of longitude -65.
ufo_data = ufo_data[(ufo_data.latitude > 15) & (ufo_data.longitude < -65)]

# Drop points that are both north of latitude 50 and east of longitude -125.
ufo_data = ufo_data[~((ufo_data.latitude > 50) & (ufo_data.longitude > -125))]

# Drop rows whose city text is tagged "(Canada)" or "(Mexico)".  The pattern is a
# raw string so that `\(` reaches the regex engine as an escaped parenthesis
# instead of being an invalid Python string escape; na=False keeps the mask
# strictly boolean.
in_neighbouring_country = ufo_data['city'].str.contains(r'\(Canada\)|\(Mexico\)', na=False)
ufo_data = ufo_data[~in_neighbouring_country]

ufo_data = ufo_data.rename(columns={'state/province': 'state'})

## Sightings by population
A little bit more from Abigail Larion's notebook 
[UFO Reports in United States](https://www.kaggle.com/abigaillarion/ufo-reports-in-united-states) showing the number of sighting by state in the US and by population.

In [None]:
# UFO sightings per state https://www.kaggle.com/abigaillarion/ufo-reports-in-united-states
# The counts are paired with `us_states` by position in the choropleth below, so
# they are reindexed on `us_states`: a state without any sighting gets 0 instead
# of silently shifting every later value onto the wrong state.
ufo_perstate = np.asarray(
    ufo_data.groupby('state').state.count().reindex(us_states, fill_value=0)
)

# Two-stop colour scale from pale green (lowest) to green (highest).
ufo_scale = [[0, 'rgb(229, 249, 239)'], [1, 'rgb(0, 163, 81)']]

data = [dict(
        type = 'choropleth',
        autocolorscale = False,
        colorscale = ufo_scale,
        showscale = False,
        locations = us_states,
        locationmode = 'USA-states',
        z = ufo_perstate,
        marker = dict(
            line = dict(
                color = 'rgb(255, 255, 255)',
                width = 2)
            )
        )]

layout = dict(
         title = 'UFO Reports by State in United States (1910-2014)',
         geo = dict(
             scope = 'usa',
             projection = dict(type = 'albers usa'),
             countrycolor = 'rgb(255, 255, 255)',
             showlakes = True,
             lakecolor = 'rgb(255, 255, 255)')
        )

figure = dict(data = data, layout = layout)
iplot(figure)

In [None]:
# Adapted from https://www.kaggle.com/abigaillarion/ufo-reports-in-united-states
# State population estimates for July 2015 from the US Census Bureau.
# The division below pairs these values with `ufo_perstate` by position.
state_population = np.asarray([
    738432, 4858979, 2978204, 6828065, 39144818, 5456574,
    3590886, 672228, 945934, 20271272, 10214860, 1431603,
    3123899, 1654930, 12859995, 6619680, 2911641, 4425092,
    4670724, 6794422, 6006401, 1329328, 9922576, 5489594,
    6083672, 2992333, 1032949, 10042802, 756927, 1896190,
    1330608, 8958013, 2085109, 2890845, 19795791, 11613423,
    3911338, 4028977, 12802503, 1056298, 4896146, 858469,
    6600299, 27469114, 2995919, 8382993, 626042, 7170351,
    5771337, 1844128, 586107,
])

# UFO sightings per 100,000 people in each state, rounded to two decimals.
ufo_percapita = (ufo_perstate / state_population * 100000).round(2)

white = 'rgb(255, 255, 255)'

data = [{
    'type': 'choropleth',
    'autocolorscale': False,
    'colorscale': ufo_scale,
    'showscale': False,
    'locations': us_states,
    'locationmode': 'USA-states',
    'z': ufo_percapita,
    'marker': {'line': {'color': white, 'width': 2}},
}]

layout = {
    'title': 'UFO Reports per 100,000 People in United States (1910-2014)',
    'geo': {
        'scope': 'usa',
        'projection': {'type': 'albers usa'},
        'countrycolor': white,
        'showlakes': True,
        'lakecolor': white,
    },
}

figure = {'data': data, 'layout': layout}
iplot(figure)

In [None]:
# Split the encounter durations into four equal-width bands between the shortest
# and the longest duration, and label each band with a marker colour.
durations = df['length_of_encounter_seconds']
bins = np.linspace(durations.min(), durations.max(), 5)

# include_lowest=True closes the first interval on the left; without it the
# shortest duration sits exactly on the first bin edge, falls outside every
# bin and is labelled NaN.
marker_colors = pd.cut(durations, bins, include_lowest=True,
                       labels=['yellow', 'orange', 'red', 'black'])

# assign() returns a new frame, so the column is not written into a frame that
# was itself produced by filtering another one (avoids SettingWithCopyWarning).
df = df.assign(marker_color=marker_colors)

In [None]:
def embed_map(m, file_name):
    """Save a map to a file and return an IFrame that displays that file.

    Parameters:
        m: object exposing ``save(path)``; the maps in this notebook are
            ``folium.Map`` instances.
        file_name: path the map is written to and the IFrame points at.

    Returns:
        An ``IPython.display.IFrame`` of full width and 500px height.
    """
    from IPython.display import IFrame

    m.save(file_name)
    frame = IFrame(file_name, width='100%', height='500px')
    return frame

## Heatmaps of the sightings
Note you can zoom out to see other parts of the world

In [None]:
# Heat map of the coordinates of every sighting currently held in `df`.
map_center = [40, -97.941111]
m_1 = folium.Map(location=map_center, zoom_start=4)

sighting_coords = df[['latitude', 'longitude']]
HeatMap(data=sighting_coords, radius=10).add_to(m_1)

# Show the map
m_1

## Heatmaps of the sightings - Long durations

In [None]:
# Heat map of the anomalously long sightings held in `df_anomaly`.
map_center = [40, -97.941111]
m_2 = folium.Map(location=map_center, zoom_start=4)

anomaly_coords = df_anomaly[['latitude', 'longitude']]
HeatMap(data=anomaly_coords, radius=10).add_to(m_2)

# Show the map
m_2

## Heatmaps of the sightings - described as light

In [None]:
# Heat map of the sightings in `df` whose shape is 'light'.
is_light = df['UFO_shape'] == 'light'
df_light = df[is_light]

map_center = [40, -97.941111]
m_3 = folium.Map(location=map_center, zoom_start=4)
light_coords = df_light[['latitude', 'longitude']]
HeatMap(data=light_coords, radius=10).add_to(m_3)

# Show the map
m_3

## Heatmaps of the sightings - described as a disc

In [None]:
# Heat map of the sightings in `df` whose shape is 'disk'.
is_disk = df['UFO_shape'] == 'disk'
df_disk = df[is_disk]

map_center = [40, -97.941111]
m_4 = folium.Map(location=map_center, zoom_start=4)
disk_coords = df_disk[['latitude', 'longitude']]
HeatMap(data=disk_coords, radius=10).add_to(m_4)

# Show the map
m_4

## Heatmaps of the sightings - described as a geometric object

In [None]:
# Heat map of the sightings in `df` whose shape is 'geometric'.
is_geometric = df['UFO_shape'] == 'geometric'
df_geometric = df[is_geometric]

map_center = [40, -97.941111]
m_5 = folium.Map(location=map_center, zoom_start=4)
geometric_coords = df_geometric[['latitude', 'longitude']]
HeatMap(data=geometric_coords, radius=10).add_to(m_5)

# Show the map
m_5