In [1]:
import pandas as pd
import numpy as np
import random as rd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
#load dataset
path = "/content/kaggle.csv"
df = pd.read_csv(path)

In [3]:
#if there is still empty cells, because of the categorical values, it will fill them with the mode of the collumn
for column in df.columns:
    if df[column].isnull().sum() > 0:
        moda = df[column].mode()[0]
        df[column].fillna(moda, inplace=True)

#count the number of empty cells
empty_cell = df.isnull().sum().sum()
print(f"Empty cells: {empty_cell}")

Empty cells: 0


In [4]:
#if there is both omnivore and any other type of diet for the same observation, then it should be changed to just omnivore
df['Diet'] = df['Diet'].replace({
    'carnivore, omnivore': 'omnivore',
    'herbivore, omnivore': 'omnivore',
})


#move the target column to the right
df = df[[col for col in df if col != 'Diet'] + ['Diet']]

In [5]:
#these features are deleted because they are irrelevant for this target.
columns_to_drop = ['Cc', 'Ref Author', 'Ref Pubyr']
df = df.drop(columns=columns_to_drop)

In [6]:
df

Unnamed: 0,Lat,Lng,What Dinosaurs Eat,Accepted Name,Country,Early Interval,Formation,Geological Interval,Geological Time Period,State,Max Ma,Min Ma,Diet
0,42.933300,123.966698,PLANT,Chaoyangsaurus youngi,China,Late Tithonian,Tuchengzi,Tithonian,Jurassic,Liaoning,150.8,132.90,herbivore
1,41.799999,120.733330,PLANT and ANIMAL,Protarchaeopteryx robusta,China,Late Barremian,Yixian,Barremian,Cretaceous,Liaoning,130.0,122.46,omnivore
2,41.799999,120.733330,PLANT and ANIMAL,Caudipteryx zoui,China,Late Barremian,Yixian,Barremian,Cretaceous,Liaoning,130.0,122.46,omnivore
3,50.740726,-111.528732,FLESH,Gorgosaurus libratus,Canada,Late Campanian,Dinosaur Park,Campanian,Cretaceous,Alberta,83.5,70.60,carnivore
4,50.737015,-111.549347,FLESH,Gorgosaurus libratus,Canada,Late Campanian,Dinosaur Park,Campanian,Cretaceous,Alberta,83.5,70.60,carnivore
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2458,49.183334,-98.099998,FISH,Hesperornis chowi,Canada,Early Campanian,Pierre Shale,Campanian,Cretaceous,Manitoba,83.5,70.60,piscivore
2459,49.183334,-98.099998,FISH,Hesperornis macdonaldi,Canada,Early Campanian,Pierre Shale,Campanian,Cretaceous,Manitoba,83.5,70.60,piscivore
2460,49.183334,-98.099998,FISH,Hesperornis macdonaldi,Canada,Early Campanian,Pierre Shale,Campanian,Cretaceous,Manitoba,83.5,70.60,piscivore
2461,49.183334,-98.099998,FISH,Hesperornis chowi,Canada,Early Campanian,Pierre Shale,Campanian,Cretaceous,Manitoba,83.5,70.60,piscivore


In [7]:
#count the number of observations of each class
diet_count = df['Diet'].value_counts()
diet_count

herbivore    1183
carnivore    1085
omnivore      158
piscivore      37
Name: Diet, dtype: int64

In [8]:
#encode the categorical columns, which means to change them to numerical values
label_encoders = {}
categorical_columns = ['What Dinosaurs Eat', 'Country', 'Early Interval', 'Formation', 'Geological Interval',
                       'Geological Time Period', 'State']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [9]:
X = df.drop(columns=['Diet', 'Accepted Name', 'Country','Formation'])  #drop the unncessary features for the train dataset
dino_name = df['Accepted Name']  #the name is not necessary for the training but it is used for printing random predictions
y = df['Diet']  #define the label column, which is going to be diet

In [10]:
X  #training dataset

Unnamed: 0,Lat,Lng,What Dinosaurs Eat,Early Interval,Geological Interval,Geological Time Period,State,Max Ma,Min Ma
0,42.933300,123.966698,2,49,27,1,27,150.8,132.90
1,41.799999,120.733330,3,37,4,0,27,130.0,122.46
2,41.799999,120.733330,3,37,4,0,27,130.0,122.46
3,50.740726,-111.528732,1,40,8,0,2,83.5,70.60
4,50.737015,-111.549347,1,40,8,0,2,83.5,70.60
...,...,...,...,...,...,...,...,...,...
2458,49.183334,-98.099998,0,17,8,0,29,83.5,70.60
2459,49.183334,-98.099998,0,17,8,0,29,83.5,70.60
2460,49.183334,-98.099998,0,17,8,0,29,83.5,70.60
2461,49.183334,-98.099998,0,17,8,0,29,83.5,70.60


In [11]:
#used for scaling features for a better performance of the model
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
#split the data as 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [13]:
#train the Naive Bayes model using the library
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [14]:
#make predictions
y_pred = gnb.predict(X_test)

In [15]:
#calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)

print('Accuracy: ', accuracy)

Accuracy:  1.0


In [20]:
#select a random dinosaur and check if the prediction is correct
random_index = rd.choice(range(len(y_test)))

dinosaur_name = dino_name.iloc[random_index]
predicted_diet = y_pred[random_index]
actual_diet = y_test.iloc[random_index]

print(f"The {dinosaur_name} dinosaur was {actual_diet} and the model predicts it was {predicted_diet}.")

The Silvisaurus condrayi dinosaur was herbivore and the model predicts it was herbivore.
