# The X-Files: UFO Sightings EDA and Interactive Maps

This notebook takes a few seconds to load completely, so please be patient...

# Table of Contents
* [Data preparation](#1)
* [Explore time features](#2)
* [Explore categorical features](#3)
* [Wordclouds](#4)
* [Geospatial Visualizations](#5)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plot
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# map
import folium
from folium.plugins import HeatMap

# NLP
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

<a id='1'></a>
# Data preparation

In [None]:
# load the NUFORC sighting reports from the Kaggle input directory
csv_path = '../input/ufo-sightings-1969-to-2019/nuforc_reports.csv'
df = pd.read_csv(csv_path)
# preview the first rows of the table
df.head()

In [None]:
# number of rows in the raw table; reused below as denominator for percentages
n_row = len(df)
print('Number of observations:', n_row)

In [None]:
# print every column label of the table as a plain list
print(df.columns.tolist())

In [None]:
# parse both timestamp columns into pandas datetimes
for date_col in ('date_time', 'posted'):
    df[date_col] = pd.to_datetime(df[date_col])

# derive calendar year and month of the sighting from its timestamp
sighting_time = df['date_time']
df['year'] = sighting_time.dt.year
df['month'] = sighting_time.dt.month

In [None]:
# inspect the raw content of the stats column for the row labelled 0
df.loc[0, 'stats']


OK, this seems to be somewhat redundant. The only additional information is the date on which the sighting was reported.

In [None]:
# same check for the row labelled 2, which has quite a few NaNs in its other columns
df.loc[2, 'stats']

In [None]:
# all columns of the third row (position 2), to compare against its stats text
df.iloc[2]

Well, there seems to be some information that is not transferred to the individual features, e.g. the location is in the stats but not in the corresponding column. Nevertheless, for the sake of simplicity, we will ignore the stats column in the following.

<a id='2'></a>
# Explore time features

In [None]:
# Year
# Count the missing years with the vectorized Series.sum() instead of iterating
# the Series element by element with the builtin sum().
n_missing_year = df.year.isna().sum()
print('Year - Missing: ', np.round(100 * n_missing_year / n_row, 2), '%')
for_count = df.year.dropna()  # remove missings for count (reused by the next cell)
plt.figure(figsize=(14, 5))
# value_counts() sorts by descending frequency, which gives the bar order
for_count.astype(int).value_counts().plot(kind='bar')
plt.title('Year - Sorted by Frequency')
plt.grid()
plt.show()

In [None]:
# Year - same counts as before, but with the bars ordered chronologically
plt.figure(figsize=(12, 5))
year_counts = for_count.astype(int).value_counts()
year_counts.sort_index().plot(kind='bar')
plt.title('Year')
plt.grid()
plt.show()

Statistics before 2006 are quite sparse...

In [None]:
# Month
# Count the missing months with the vectorized Series.sum() instead of iterating
# the Series element by element with the builtin sum().
n_missing_month = df.month.isna().sum()
print('Month - Missing: ', np.round(100 * n_missing_month / n_row, 2), '%')
for_count = df.month.dropna()  # remove missings for count (reused by the next cell)
plt.figure(figsize=(12, 5))
# value_counts() sorts by descending frequency, which gives the bar order
for_count.astype(int).value_counts().plot(kind='bar')
plt.title('Month - Sorted by Frequency')
plt.grid()
plt.show()

In [None]:
# Month - same counts as before, but with the bars ordered by month number
plt.figure(figsize=(12, 5))
month_counts = for_count.astype(int).value_counts()
month_counts.sort_index().plot(kind='bar')
plt.title('Month')
plt.grid()
plt.show()

We see more sightings in the warmer months. Not so surprising.

In [None]:
# impute missing values: 0 stands for "unknown", then store both columns as integers
for time_col in ('year', 'month'):
    df[time_col] = df[time_col].fillna(0).astype(int)

In [None]:
# add year+month combo as a "<year>_<month>" string label
year_label = df['year'].astype(str)
month_label = df['month'].astype(str)
df['year_month'] = year_label + '_' + month_label

In [None]:
# the 20 most frequent year/month labels (value_counts sorts by descending count)
plt.figure(figsize=(12, 5))
top_combos = df['year_month'].value_counts().head(20)
top_combos.plot(kind='bar')
plt.title('Most frequent year/month combinations (0_0 ~ missing values)')
plt.grid()
plt.show()

July 2014 was the month with the most observations.

<a id='3'></a>
# Explore categorical features

In [None]:
# categorical columns to be summarised with a frequency bar chart each
features_cat = [
    'city',
    'state',
    'shape',
    'duration',
]

In [None]:
# one bar chart per categorical feature, showing its 20 most frequent values
for f in features_cat:
    top_values = df[f].value_counts().head(20)
    plt.figure(figsize=(12, 5))
    top_values.plot(kind='bar')
    plt.title(f)
    plt.grid()
    plt.show()

<a id='4'></a>
# Wordclouds

### Wordcloud for "summary" column

In [None]:
# keep only the rows that actually have a summary (reused by the wordcloud cell)
my_text = df.summary.dropna()
# example: first available summary. Use positional access, because the label 0
# no longer exists in the filtered Series if the first row has no summary.
my_text.iloc[0]

In [None]:
# start from the stopword collection shipped with the wordcloud package
stopwords = set(STOPWORDS)

# add more context specific stopwords
# stopwords.update({'www','href','http','https'})

# concatenate all documents into one long string for the wordcloud
text = " ".join(my_text)

cloud_settings = dict(stopwords=stopwords, max_font_size=50, max_words=500,
                      width=600, height=400,
                      background_color="white")

# render wordcloud and measure how long construction + generation take
t1 = time.time()
wordcloud = WordCloud(**cloud_settings).generate(text)
t2 = time.time()

plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

print('Elapsed time: ', np.round(t2 - t1, 2), 's')

### Wordcloud for "text" column

In [None]:
# keep only the rows that actually have a report text (reused by the wordcloud cell)
my_text = df.text.dropna()
# example: first available text. Use positional access, because the label 0
# no longer exists in the filtered Series if the first row has no text.
my_text.iloc[0]

In [None]:
# start from the stopword collection shipped with the wordcloud package
stopwords = set(STOPWORDS)

# add more context specific stopwords
# stopwords.update({'www','href','http','https'})

# concatenate all documents into one long string for the wordcloud
text = " ".join(my_text)

# render wordcloud and measure how long construction + generation take
t1 = time.time()
cloud_builder = WordCloud(
    stopwords=stopwords,
    max_font_size=50,
    max_words=500,
    width=600,
    height=400,
    background_color="white",
)
wordcloud = cloud_builder.generate(text)
t2 = time.time()

plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

print('Elapsed time: ', np.round(t2 - t1, 2), 's')

<a id='5'></a>
# Geospatial Visualizations

In [None]:
# check coordinates first
plt.scatter(df.city_longitude, df.city_latitude)
plt.grid()
plt.show()

This looks reasonable. In particular, we do not have any (0,0) locations (which typically represent missing values).

However, there are a few locations clearly outside of the US. Let's check:

In [None]:
# select the points with latitude below 15 or longitude above -40
low_latitude = df['city_latitude'] < 15
high_longitude = df['city_longitude'] > -40
df_strange = df[low_latitude | high_longitude]
df_strange

The first seven rows seem to be geocoded incorrectly (you can check the details by following the report link). Only the last one, in Australia, seems to be correct; the website provides the following details:

Sighting Report
* Occurred : 12/21/2018 02:20 (Entered as : 21/12/2018 02:20)
* Reported: 12/20/2018 2:17:12 PM 14:17
* Posted: 1/4/2019
* Location: Perth (Western Australia), WA
* Shape: Changing
* Duration: 5.12 minutes

In [None]:
# stats text of the eighth suspicious row (position 7)
df_strange.iloc[7]['stats']

In [None]:
# let's remove the strange entries for the following
# (alternatively we could fix the coordinates manually)
# report links of the first seven suspicious rows - we keep the last one (Australia)!
list_strange = df_strange['report_link'].iloc[:7].tolist()
is_strange = df['report_link'].isin(list_strange)
df = df[~is_strange]

In [None]:
# missings
# Relate the counts to the current number of rows: n_row was taken before the
# mis-geocoded rows were dropped from df, so it is a stale denominator here.
n_current = len(df)
# vectorized counts instead of iterating the Series with the builtin sum()
miss_lat = df.city_latitude.isna().sum()
miss_lon = df.city_longitude.isna().sum()

print('Missing Latitude:  ', miss_lat, ' ~ ', round(100 * miss_lat / n_current, 2), '%')
print('Missing Longitude: ', miss_lon, ' ~ ', round(100 * miss_lon / n_current, 2), '%')

In [None]:
# for plot we need to remove the rows with missing coordinates
# A point is unusable when either coordinate is missing, so check both columns
# rather than latitude alone.
df_plot = df.dropna(subset=['city_latitude', 'city_longitude'])
df_plot.shape

### Heatmap

In [None]:
# heatmap of locations for first overview
zoom_factor = 2  # initial zoom level passed to the map
my_map_1 = folium.Map(location=[0, 0], zoom_start=zoom_factor)
# the heat layer is fed with the (latitude, longitude) columns, in that order
heat_points = df_plot[['city_latitude', 'city_longitude']]
heat_layer = HeatMap(data=heat_points, radius=10)
heat_layer.add_to(my_map_1)
my_map_1  # display

### Individual locations (year 2019)

For an interactive detailed display of the locations let's first select **year 2019 only** so we do not have too many points to display.

In [None]:
# restrict the plottable rows to sightings from the year 2019
is_2019 = df_plot['year'] == 2019
df_plot_2019 = df_plot[is_2019]
df_plot_2019.shape

In [None]:
# interactive map of individual locations (year 2019)
zoom_factor = 5
my_map_2 = folium.Map(location=[36, -100], zoom_start=zoom_factor)

# Walk the needed columns in lockstep instead of indexing with range(n) and
# building a full row Series via .iloc[i] several times per point.
points_2019 = zip(
    df_plot_2019['city_latitude'],
    df_plot_2019['city_longitude'],
    df_plot_2019['report_link'],
    df_plot_2019['date_time'],
)
for lat, lon, link, when in points_2019:
    folium.Circle(
        location=[lat, lon],
        radius=10000,
        color='red',
        weight=1,  # stroke width in pixels
        # the f-string also tolerates a missing (NaN) report link
        popup=f'Report Link: {link} - Time: {when}',
        fill=True,
        fill_color='red',
    ).add_to(my_map_2)
my_map_2  # display

### Individual locations + color encoding of month (year 2019)

In [None]:
# interactive map of individual locations (year 2019), fill color encodes the month
zoom_factor = 5
my_map_3 = folium.Map(location=[36, -100], zoom_start=zoom_factor)

v_min = np.min(df_plot_2019.month)
v_max = np.max(df_plot_2019.month)
v_span = v_max - v_min  # zero when the subset covers a single month only

# Walk the needed columns in lockstep instead of indexing with range(n) and
# building a full row Series via .iloc[i] several times per point.
points_2019 = zip(
    df_plot_2019['city_latitude'],
    df_plot_2019['city_longitude'],
    df_plot_2019['report_link'],
    df_plot_2019['date_time'],
    df_plot_2019['month'],
)
for lat, lon, link, when, month in points_2019:
    # scale the month to [0, 1]; guard the single-month case against division by zero
    v_norm = (month - v_min) / v_span if v_span else 0.0
    # built-in matplotlib colormaps use 256 entries by default (indices 0..255),
    # so clip the top end explicitly instead of passing the out-of-range index 256
    i_col = min(int(v_norm * 256), 255)
    current_color = matplotlib.colors.to_hex(matplotlib.cm.rainbow(i_col))
    folium.Circle(
        location=[lat, lon],
        radius=10000,
        color='black',
        weight=1,  # stroke width in pixels
        # the f-string also tolerates a missing (NaN) report link
        popup=f'Report Link: {link} - Time: {when} - Month: {month}',
        fill=True,
        fill_opacity=0.25,
        fill_color=current_color,
    ).add_to(my_map_3)
my_map_3  # display

### Individual locations + color encoding of shape (year 2019)

In [None]:
# interactive map of individual locations (year 2019), fill color encodes the shape
zoom_factor = 5
my_map_4 = folium.Map(location=[36, -100], zoom_start=zoom_factor)

# create numeric version of shape column (category codes; missing shapes get -1)
shape_as_number = df_plot_2019['shape'].astype('category').cat.codes
# shift by one so the codes start at 0, and widen the small integer code dtype
# before doing arithmetic on it
shape_index = shape_as_number.astype(int) + 1

v_min = np.min(shape_index)
v_max = np.max(shape_index)
v_span = v_max - v_min  # zero when only one distinct shape code is present

# Walk the needed columns in lockstep instead of indexing with range(n) and
# building a full row Series via .iloc[i] several times per point.
points_2019 = zip(
    df_plot_2019['city_latitude'],
    df_plot_2019['city_longitude'],
    df_plot_2019['report_link'],
    df_plot_2019['date_time'],
    df_plot_2019['shape'],
    shape_index,
)
for lat, lon, link, when, shape, v in points_2019:
    # scale the code to [0, 1]; guard the single-code case against division by zero
    v_norm = (v - v_min) / v_span if v_span else 0.0
    # built-in matplotlib colormaps use 256 entries by default (indices 0..255),
    # so clip the top end explicitly instead of passing the out-of-range index 256
    i_col = min(int(v_norm * 256), 255)
    current_color = matplotlib.colors.to_hex(matplotlib.cm.rainbow(i_col))
    folium.Circle(
        location=[lat, lon],
        radius=10000,
        color='black',
        weight=1,  # stroke width in pixels
        # the f-string also tolerates a missing (NaN) report link
        popup=f'Report Link: {link} - Time: {when} - Shape: {shape}',
        fill=True,
        fill_opacity=0.25,
        fill_color=current_color,
    ).add_to(my_map_4)
my_map_4  # display

#### There does not seem to be a real pattern... shapes seem to occur more or less randomly?