In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the UFO sightings CSV from the local data folder
ufo_csv_path = Path("data/UFO_data.csv")
df_UFOs = pd.read_csv(ufo_csv_path)

# Preview the first rows of the loaded data
df_UFOs.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,4/27/2004,29.883056,-97.941111
1,10/10/1956 21:00,edna,tx,us,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.978333,-96.645833
2,10/10/1960 20:00,kaneohe,hi,us,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.418056,-157.803611
3,10/10/1961 19:00,bristol,tn,us,sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.595,-82.188889
4,10/10/1965 23:45,norwalk,ct,us,disk,1200.0,20 minutes,A bright orange color changing to reddish colo...,10/2/1999,41.1175,-73.408333


In [3]:
# Dimensions of the loaded DataFrame as (rows, columns)
df_UFOs.shape

(65229, 11)

In [4]:
# Parse the raw 'datetime' strings. errors='coerce' turns any value that
# cannot be parsed into NaT instead of raising.
parsed_datetime = pd.to_datetime(df_UFOs['datetime'], errors='coerce')

# Those rows are removed below, so report how many are affected rather than
# discarding them silently.
print(f"Dropping {parsed_datetime.isna().sum()} row(s) with a missing or unparseable 'datetime'")

# Build the cleaned frame in a single chain: swap in the parsed timestamps,
# drop the rows without a valid timestamp, then derive the year and month of
# each sighting. Nothing is modified in place, so the frame is never left
# half-converted if a step fails.
df_UFOs = (
    df_UFOs
    .assign(datetime=parsed_datetime)
    .dropna(subset=['datetime'])
    .assign(**{
        'year seen': lambda frame: frame['datetime'].dt.year,
        'month seen': lambda frame: frame['datetime'].dt.month,
    })
)

# Check the DataFrame to see the new columns
df_UFOs.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,year seen,month seen
0,1949-10-10 20:30:00,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,4/27/2004,29.883056,-97.941111,1949,10
1,1956-10-10 21:00:00,edna,tx,us,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.978333,-96.645833,1956,10
2,1960-10-10 20:00:00,kaneohe,hi,us,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.418056,-157.803611,1960,10
3,1961-10-10 19:00:00,bristol,tn,us,sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.595,-82.188889,1961,10
4,1965-10-10 23:45:00,norwalk,ct,us,disk,1200.0,20 minutes,A bright orange color changing to reddish colo...,10/2/1999,41.1175,-73.408333,1965,10


In [5]:
# List the dtype of every column, including the derived 'year seen' and 'month seen'
df_UFOs.dtypes

datetime                datetime64[ns]
city                            object
state                           object
country                         object
shape                           object
duration (seconds)             float64
duration (hours/min)            object
comments                        object
date posted                     object
latitude                       float64
longitude                      float64
year seen                        int32
month seen                       int32
dtype: object

In [6]:
# List the current column labels
df_UFOs.columns

Index(['datetime', 'city', 'state', 'country', 'shape', 'duration (seconds)',
       'duration (hours/min)', 'comments', 'date posted', 'latitude',
       'longitude', 'year seen', 'month seen'],
      dtype='object')

In [7]:
# Keep only the numeric columns used as clustering features. These are the
# five columns that remain once the timestamp, text and categorical columns
# are removed. Selecting them by name (instead of dropping the others in
# place) keeps this cell re-runnable: a second run of an in-place drop raises
# KeyError because the dropped columns no longer exist.
feature_columns = ['duration (seconds)', 'latitude', 'longitude', 'year seen', 'month seen']
df_UFOs = df_UFOs[feature_columns].copy()
df_UFOs.head()

Unnamed: 0,duration (seconds),latitude,longitude,year seen,month seen
0,2700.0,29.883056,-97.941111,1949,10
1,20.0,28.978333,-96.645833,1956,10
2,900.0,21.418056,-157.803611,1960,10
3,300.0,36.595,-82.188889,1961,10
4,1200.0,41.1175,-73.408333,1965,10


In [8]:
# Candidate cluster counts (1 through 10) and an empty list that will collect
# the inertia measured for each of them
k = [*range(1, 11)]
inertia = list()

In [9]:
# Fit a K-means model for every candidate cluster count in `k`, using the
# df_UFOs feature DataFrame, and record each model's `inertia_` attribute.
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second set of values, which would make `inertia` longer than `k`
# and break the elbow DataFrame built from the two lists.
# NOTE(review): the features are passed in their raw units; StandardScaler is
# imported at the top of the notebook but not applied here, so
# 'duration (seconds)' probably dominates the distances - confirm this is intended.
inertia = []
for n_clusters in k:
    k_model = KMeans(n_clusters=n_clusters, random_state=1)
    k_model.fit(df_UFOs)
    inertia.append(k_model.inertia_)

In [10]:
# Pair each candidate k with its measured inertia and tabulate the result
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows of the elbow table
df_elbow.head()

Unnamed: 0,k,inertia
0,1,87522390000.0
1,2,25173360000.0
2,3,10812340000.0
3,4,4611885000.0
4,5,2820810000.0


In [11]:
# Draw inertia against k as a line chart, with one x tick per candidate k
elbow_plot = df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)
elbow_plot

In [12]:
# K-means model with 4 clusters and a fixed seed
# NOTE(review): the features are used in their raw units; StandardScaler is
# imported at the top of the notebook but not applied - confirm this is intended.
model = KMeans(n_clusters=4, random_state=1)

# Fit on the feature DataFrame, then assign each row to its cluster
k_4 = model.fit(df_UFOs).predict(df_UFOs)

# New DataFrame: the features plus the cluster label of every row
# (df_UFOs itself is left unchanged)
UFOs_predictions_df = df_UFOs.assign(sighting_segment=k_4)

In [13]:
# Plot sighting duration against month, coloured by K-means segment.
# A title is set so the figure can be read on its own, matching the titled
# latitude/longitude plot in this notebook.
UFOs_predictions_df.hvplot.scatter(
    x="month seen",
    y="duration (seconds)",
    by="sighting_segment",
    title="UFO Sightings by Month and Duration"
)

In [14]:
# Plot sighting duration against year, coloured by K-means segment.
# A title is set so the figure can be read on its own, matching the titled
# latitude/longitude plot in this notebook.
UFOs_predictions_df.hvplot.scatter(
    x="year seen",
    y="duration (seconds)",
    by="sighting_segment",
    title="UFO Sightings by Year and Duration"
)

In [15]:
# Plot year against month of each sighting, coloured by K-means segment.
# A title is set so the figure can be read on its own, matching the titled
# latitude/longitude plot in this notebook.
UFOs_predictions_df.hvplot.scatter(
    x="month seen",
    y="year seen",
    by="sighting_segment",
    title="UFO Sightings by Month and Year"
)

In [20]:
# Scatter the sightings by their coordinates, coloured by K-means segment.
# NOTE(review): latitude is on the x-axis and longitude on the y-axis, which
# is the transpose of the usual map orientation - confirm this is intended.
plot = UFOs_predictions_df.hvplot.scatter(
    x="latitude",
    y="longitude",
    by="sighting_segment",
    title="UFO Sightings by Latitude and Longitude"
)

# Save as html in the 'images' folder. The folder is created first if it is
# missing, so the save does not depend on it already existing in a fresh
# checkout.
output_file = "images/UFO_lat_long_scatterplot.html"
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
hvplot.save(plot, output_file)

plot