In [130]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [166]:
# Load the UFO sightings CSV into a DataFrame (relative path, resolved from the working directory)
df = pd.read_csv(Path('data/UFO_data.csv'))

# Show the first five rows to sanity-check the load
df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,4/27/2004,29.883056,-97.941111
1,10/10/1956 21:00,edna,tx,us,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.978333,-96.645833
2,10/10/1960 20:00,kaneohe,hi,us,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.418056,-157.803611
3,10/10/1961 19:00,bristol,tn,us,sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.595,-82.188889
4,10/10/1965 23:45,norwalk,ct,us,disk,1200.0,20 minutes,A bright orange color changing to reddish colo...,10/2/1999,41.1175,-73.408333


In [167]:
# Collect the distinct values of the 'shape' column so the circular ones
# can be picked out by hand for the target label
unique_shapes = df['shape'].unique()
unique_shapes

array(['cylinder', 'circle', 'light', 'sphere', 'disk', 'fireball',
       'unknown', 'oval', 'other', 'rectangle', 'chevron', 'formation',
       'triangle', 'cigar', 'changing', 'egg', 'diamond', 'flash',
       'teardrop', 'cone', 'cross', 'pyramid', 'delta', 'round', 'flare',
       'hexagon', 'changed'], dtype=object)

In [168]:
# Shape labels counted as "circular"; any other label (for example 'light',
# 'unknown' or 'other') ends up with a 0 in the target column
circular_shapes = ['cylinder', 'circle', 'sphere', 'disk', 'fireball',
                   'oval', 'egg', 'round']

# Work on a copy so the loaded DataFrame stays untouched, then add the binary target
df_learning = df.copy()
is_circular = df_learning['shape'].isin(circular_shapes)
df_learning['circular'] = np.where(is_circular, 1, 0)
df_learning.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,circular
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,4/27/2004,29.883056,-97.941111,1
1,10/10/1956 21:00,edna,tx,us,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.978333,-96.645833,1
2,10/10/1960 20:00,kaneohe,hi,us,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.418056,-157.803611,0
3,10/10/1961 19:00,bristol,tn,us,sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.595,-82.188889,1
4,10/10/1965 23:45,norwalk,ct,us,disk,1200.0,20 minutes,A bright orange color changing to reddish colo...,10/2/1999,41.1175,-73.408333,1


In [180]:
# Build the feature frame by removing the columns that are not used as model inputs.
# The 'circular' target is still part of X after this step.
X = df_learning.drop(
    columns=['datetime', 'city', 'shape', 'comments', 'duration (hours/min)']
)

In [181]:
# Replace 'date posted' with a numeric feature: whole days elapsed since 1970-01-01.
# errors='coerce' turns strings that do not match MM/DD/YYYY into NaT instead of raising.
posted_on = pd.to_datetime(X['date posted'], format='%m/%d/%Y', errors='coerce')
days_since_epoch = (posted_on - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')
X = X.assign(**{'date posted numeric': days_since_epoch}).drop(columns=['date posted'])

In [182]:
# Preview the features data (the 'circular' target is still a column of X here)
X.head()

Unnamed: 0,state,country,duration (seconds),latitude,longitude,circular,date posted numeric
0,tx,us,2700.0,29.883056,-97.941111,1,12535
1,tx,us,20.0,28.978333,-96.645833,1,12434
2,hi,us,900.0,21.418056,-157.803611,0,12439
3,tn,us,300.0,36.595,-82.188889,1,13630
4,ct,us,1200.0,41.1175,-73.408333,1,10866


In [183]:
# Preview the first five entries for the target variable.
# The target still lives in X as the 'circular' column at this point; `y` is only
# assigned in a later cell, so referencing `y` here raises NameError on a fresh
# kernel (Restart & Run All).
X['circular'].head()

0    1
1    1
2    0
3    1
4    1
Name: circular, dtype: int32

In [184]:
# Encode the categorical variables (state, country) as indicator columns using get_dummies;
# the numeric columns, including the 'circular' target, are carried over as they are
X = pd.get_dummies(X)

In [185]:
# Review the encoded features: state_* and country_* indicator columns now
# stand in for the original 'state' and 'country' columns
X.head()

Unnamed: 0,duration (seconds),latitude,longitude,circular,date posted numeric,state_ab,state_ak,state_al,state_ar,state_az,...,state_wa,state_wi,state_wv,state_wy,state_yk,state_yt,country_au,country_ca,country_gb,country_us
0,2700.0,29.883056,-97.941111,1,12535,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,20.0,28.978333,-96.645833,1,12434,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,900.0,21.418056,-157.803611,0,12439,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,300.0,36.595,-82.188889,1,13630,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,1200.0,41.1175,-73.408333,1,10866,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [186]:
# Separate the target variable, y, from the features, X:
# pull out the 'circular' column first, then remove it from the feature frame
y = X.loc[:, 'circular']
X = X.drop(columns=['circular'])

In [187]:
# Split the dataset into training and testing sets using train_test_split.
# No test_size is passed, so the library default proportions apply;
# random_state=1 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [188]:
# Inspect the column dtypes of the feature frame before it is scaled
X.dtypes

duration (seconds)     float64
latitude               float64
longitude              float64
date posted numeric      int64
state_ab                  bool
                        ...   
state_yt                  bool
country_au                bool
country_ca                bool
country_gb                bool
country_us                bool
Length: 75, dtype: object

In [189]:
# Create the scaler and fit it on the training split only, so the test split
# does not influence the fitted scaling parameters
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split first, then to the testing split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [202]:
# Import the KNeighborsClassifier class from sklearn
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 3
knn = KNeighborsClassifier(n_neighbors=3)

In [203]:
# Train the model using the scaled training features and the training labels
knn.fit(X_train_scaled, y_train)

In [204]:
# Create predictions for the scaled testing features
y_pred = knn.predict(X_test_scaled)

In [205]:
# Print the classification report (per-class precision, recall and f1-score)
# comparing the testing labels to the model predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.67      0.65     10078
           1       0.40      0.36      0.38      6230

    accuracy                           0.55     16308
   macro avg       0.52      0.52      0.52     16308
weighted avg       0.54      0.55      0.55     16308

