<a href="https://colab.research.google.com/github/sam505/Machine_Learning/blob/master/UFO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import the necessary Python packages

In [None]:
# importing required packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

## Read the CSV dataset into the notebook and preview it

In [None]:
# Load the county data CSV into a pandas DataFrame with read_csv.
# The r"" prefix marks a raw string literal (backslashes are not treated as escapes);
# it has nothing to do with the mode in which the file is opened.
# NOTE(review): the path looks like a mounted Google Drive folder in Colab — the mount step is not shown here.
dataset_path = r"/content/drive/MyDrive/Data/county-data.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

# A bare DataFrame as the last expression renders a truncated preview (first and last rows)
df

Unnamed: 0,subregion,region,pop,income,ipaddr,ufo2010
0,abbeville,south carolina,25101,34670,30330,2
1,acadia,louisiana,61912,37970,38203,6
2,accomack,virginia,33341,41595,41338,2
3,ada,idaho,409061,55304,1035427,59
4,adair,iowa,7481,47623,3762,0
...,...,...,...,...,...,...
3067,yuma,arizona,200022,41441,99991,53
3068,yuma,colorado,10119,44991,7662,1
3069,zapata,texas,14290,26009,3833,0
3070,zavala,texas,11961,23952,545,1


## Dataset summary

In [None]:
# describe() returns summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

Unnamed: 0,pop,income,ipaddr,ufo2010
count,3072.0,3072.0,3072.0,3072.0
mean,101008.5,45074.703451,387973.1,7.943034
std,322604.2,11551.981523,4669809.0,28.75557
min,71.0,19344.0,0.0,0.0
25%,11214.75,37792.75,5367.0,0.0
50%,26047.0,43332.5,15289.0,2.0
75%,67921.0,50010.0,62594.0,6.0
max,9962789.0,120096.0,223441000.0,815.0


## Check for missing values

In [None]:
# isna() flags every cell as True (missing) or False (present).
# Summing down the rows (axis=0) counts the True values per column,
# i.e. the number of missing values in each column.
df.isna().sum(axis=0)

subregion    0
region       0
pop          0
income       0
ipaddr       0
ufo2010      0
dtype: int64

In [None]:
# Collapse the county rows into one row per region by summing the numeric columns.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in pandas 2.0,
# so groupby is used instead:
#   - sort=False keeps the regions in order of first appearance, as sum(level=...) did
#   - numeric_only=True leaves out the text column `subregion`, which cannot be meaningfully summed
# The result is indexed by `region`.
# NOTE(review): summing `income` across counties gives a total of county values, not a
# regional income figure — confirm this is the intended feature.
new_df = df.groupby("region", sort=False).sum(numeric_only=True)

## Create the `ufo_status` label column

In [None]:
# Add a binary label column, ufo_status, marking regions with substantial UFO appearances:
#   1 -> the region's ufo2010 total is greater than 333 (substantial)
#   0 -> otherwise (not substantial)
# .gt(333) builds the boolean condition and np.where maps True/False to 1/0.
new_df['ufo_status'] = np.where(new_df['ufo2010'].gt(333), 1, 0)

# Show the first five rows to check the newly added column
new_df.head(5)

Unnamed: 0_level_0,index,pop,income,ipaddr,ufo2010,ufo_status
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
south carolina,56255,4723723,1793990,5256663,336,1
louisiana,105855,4601893,2636560,4666033,229,0
virginia,148204,6776755,5394332,48947094,506,1
idaho,50114,1595728,1904783,2148449,161,0
iowa,143826,3074186,4730381,4821849,243,0


In [None]:
# Features: the pop and income columns, one row per region and one column per feature,
# so the first dimension already lines up with the labels and both can be split together.
X = new_df[["pop", "income"]].to_numpy()

# Labels: the ufo_status column as a standalone 1-D array
Y = new_df["ufo_status"].to_numpy(copy=True)

print(X.shape, Y.shape)

(49, 2) (49,)


In [None]:
# train_test_split shuffles the features (X) and labels (Y) together and splits them
# into train and test sets; test_size=0.3 holds out 30% of the rows for testing.
# random_state pins the shuffle so the split — and every accuracy reported below —
# is the same on each Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [None]:
# Sanity check: shapes of the train/test features followed by the train/test labels
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(34, 2) (15, 2) (34,) (15,)


In [None]:
# First model: logistic regression with its default parameters.
# It suits problems where the predicted output is binary.
model1 = LogisticRegression()
model1.fit(X=X_train, y=Y_train)

# fit() returns the fitted estimator itself, so `results` refers to the same object as model1
results = model1
results

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Predict labels for the held-out test features with the trained logistic regression model
Y_Predict = model1.predict(X=X_test)

# Accuracy (%): share of test rows whose predicted label equals the true label
np.mean(Y_Predict == Y_test) * 100

66.66666666666666

In [None]:
# Second model: K-Nearest Neighbors classifier.
# The argument is n_neighbors — the number of nearest training points consulted for each
# prediction — not the number of classes; the classes are taken from the labels in Y_train.
model2 = KNeighborsClassifier(n_neighbors=2)
model2.fit(X=X_train, y=Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [None]:
# predict() takes the test features array and returns an array of predicted labels
Y_Predict = model2.predict(X=X_test)

# Accuracy (%): element-wise comparison of true and predicted labels, averaged over the test set
np.equal(Y_test, Y_Predict).mean() * 100

80.0

In [None]:
# Third model: Decision Tree classifier, which learns a sequence of feature-threshold
# splits that separate the training rows into the unique labels found in Y_train.
# random_state pins the estimator's internal randomness so the fitted tree
# (and its accuracy) is reproducible between runs.
model3 = DecisionTreeClassifier(random_state=42)
model3.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# predict() takes the test features array and returns an array of predicted labels
Y_Predict = model3.predict(X=X_test)

# Accuracy (%): fraction of predictions that match the Y_test labels, scaled to a percentage
is_correct = Y_Predict == Y_test
is_correct.mean() * 100

73.33333333333333