In [16]:
# Dependencies.

import pandas as pd
import numpy as np

# Language detection and translation libraries.

from deep_translator import GoogleTranslator # Google translate API accessor.

# Language Processing
import nltk
from nltk.tokenize import word_tokenize # Tokenizing the message
from nltk.stem import WordNetLemmatizer # Lemmatization of the message

To-Do:

1. Aniket pointed out that a lot of the words are in Latin. `deep_translator` is a library for language translation, and its `GoogleTranslator` class uses Google Translate's API to do the translating. We want to convert the Latin words into English.

2. A lot of the other words are English words with an extra 'x' or 'z' appended at the end. Removing this suffix brings the word closer to English, while also dropping a garbage character that doesn't add any context or meaning to the word.

In [17]:
# Load the raw dataset; "data.csv" is resolved relative to the notebook's working directory.
df = pd.read_csv("data.csv")

# Display the frame as the cell output.
# NOTE(review): the output shows an "Unnamed: 0" column, which looks like an index that was
# saved into the CSV -- consider read_csv(..., index_col=0) if it is not needed; confirm first.
df

Unnamed: 0,message,fingers,tail,species
0,pluvia arbor aquos,4,no,Aquari
1,cosmix xeno nebuz odbitaz,5,yes,Zorblax
2,solarix glixx novum galaxum quasar,5,yes,Zorblax
3,arbor insectus pesros ekos dootix nimbus,2,yes,Florian
4,mermax drakos lorix epikoz deftax,4,no,Faerix
...,...,...,...,...
495,empathix sadix disgux dredax pridius afgstix e...,2,no,Emotivor
496,quasar ustron nebulax meteorn,4,no,Quixnar
497,astron xeno ceaestar astron kometa,6,yes,Zorblax
498,sporzom nimbus terram terranix aviana ekos nimbub,2,yes,Florian


In [18]:
# Removes 'x' and 'z' from the ending.

def cleanEnding(message):
    """Strip one trailing 'x' or 'z' from every word of ``message``.

    The message is split on whitespace, at most one trailing character is
    removed per word (so "glixx" becomes "glix"), and the words are re-joined
    with single spaces. Only lowercase 'x' / 'z' are stripped.
    """
    return ' '.join(
        token[:-1] if token.endswith(('x', 'z')) else token
        for token in message.split()
    )

# Convert latin words to english.

def englishConversion(message):
    """Translate every word of ``message`` from Latin to English.

    Each whitespace-separated word is sent on its own to Google Translate
    through ``GoogleTranslator`` (one ``translate`` call per word), and the
    results are joined back together with single spaces.

    message : str -- the words to translate, separated by whitespace.
    Returns  : str -- the translated words joined by single spaces.
    """
    translator = GoogleTranslator(source="latin", target="english")  # Initialize the translator.

    translatedMessage = []

    # split() with no argument (rather than split(' ')) discards the empty
    # tokens produced by repeated, leading or trailing spaces, so no empty
    # string is ever sent to the translation service.
    for word in message.split():
        translatedWord = translator.translate(word)
        # Fall back to the untranslated word if nothing usable comes back, so
        # the join below neither fails on None nor silently drops a word.
        # NOTE(review): confirm whether translate() can return None / "" here.
        translatedMessage.append(translatedWord or word)

    return ' '.join(translatedMessage)

def processing(message):
    """Run the full text pipeline on one message and return its lemmas.

    Steps: strip trailing 'x'/'z' (``cleanEnding``), translate to English
    (``englishConversion``), tokenize with ``word_tokenize`` and lemmatize
    every token with a ``WordNetLemmatizer``.

    Returns a list of lemmatized tokens.
    """
    english_text = englishConversion(cleanEnding(message))

    # Lemmatize only after translation, once the text resembles English.
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(tok) for tok in word_tokenize(english_text)]

In [19]:
# Processing the data: every message is cleaned, translated, tokenized and lemmatized;
# the resulting token list is stored in a new 'preprocessed_message' column.
# NOTE(review): englishConversion calls the translator once per word, so this cell is
# network-bound and slow -- consider caching its result before re-running the notebook.
# NOTE(review): word_tokenize / WordNetLemmatizer presumably need NLTK data packages, but the
# only nltk.download call visible in this notebook is for 'stopwords' -- confirm on a fresh kernel.
df['preprocessed_message'] = df['message'].apply(processing)

In [20]:
# Cleaning the data further

def join(message):
    """Concatenate the strings in ``message`` with single spaces between them."""
    return ' '.join(message)

# Stopwords, i.e. garbage words that provide no extra context/meaning to the model.
from nltk.corpus import stopwords

# Fetch the stopwords corpus (the cell output shows it is skipped when already up to date).
nltk.download('stopwords')
# Start from NLTK's English stopword list.
stopwords_list = stopwords.words('english')

# Words I found in the data that are useless. You guys can add more if you want.
# NOTE(review): NLTK's English list appears to contain 'i' and 'a' already -- confirm. The stray
# tokens more likely come from capitalised "I"/"A" passing the case-sensitive membership test
# in remove_stopwords, which lowercases only after the check.
additional_stopwords = ['i','a']
stopwords_list.extend(additional_stopwords)

def remove_stopwords(message):
    """Lowercase the tokens of ``message`` and drop those found in ``stopwords_list``.

    message : iterable of str -- the tokens of one message.
    Returns  : str -- the surviving tokens, lowercased and joined by single spaces.
    """
    res = []

    for word in message:
        # Lowercase BEFORE the membership test: previously the original-case token was
        # compared, so capitalised stopwords such as "I", "A" or "The" passed the filter
        # and were then emitted as 'i', 'a', 'the'.
        lowered = word.lower()
        if lowered not in stopwords_list:
            res.append(lowered)

    return ' '.join(res)

# Turn each token list in 'preprocessed_message' into a lowercased, stopword-free string
# and store it in the new 'processedMessage' column.
df['processedMessage'] = df['preprocessed_message'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suhan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Now we need to handle the multimodal distribution of the number of fingers for the "Faerix" and "Emotivor" species. Looking at the KDE graph, the slope is 0 at x = 3.5 for Faerix and at x = 1.5 for Emotivor, so we use those values as thresholds and split each of these two species into two groups.

In [24]:
def split_species(row):
    """Return the species label of ``row``, splitting Faerix and Emotivor by finger count.

    Faerix rows are split at 3.5 fingers and Emotivor rows at 1.5 fingers:
    values at or below the threshold go to "<species>_Group1", the rest to
    "<species>_Group2". Every other species is returned unchanged.
    """
    species = row["species"]

    if species == "Faerix":
        return "Faerix_Group1" if row["fingers"] <= 3.5 else "Faerix_Group2"

    if species == "Emotivor":
        return "Emotivor_Group1" if row["fingers"] <= 1.5 else "Emotivor_Group2"

    return species

# Label every row (axis=1 passes one row at a time) with its species group.
df["species_group"] = df.apply(split_species, axis=1)

# Show the distinct group labels that were produced.
df['species_group'].unique()

array(['Aquari', 'Zorblax', 'Florian', 'Faerix_Group2', 'Nexoon',
       'Mythron', 'Emotivor_Group2', 'Sentire', 'Quixnar', 'Cybex',
       'Emotivor_Group1', 'Faerix_Group1'], dtype=object)

In [22]:
# Storing the data.
# index=False: df already carries the old index as its "Unnamed: 0" column, so writing the
# default RangeIndex as well would add a second redundant unnamed column on the next read.
# NOTE(review): the execution counts show this cell (In [22]) last ran before the
# species_group cell (In [24]); re-run it so the saved file includes that column.
df.to_csv("processedData.csv", index=False)

Thoughts :

1. I am running every word of every message through the Latin-English translator. If the translator recognizes a Latin word, it converts it, and it leaves anything it doesn't recognize unchanged. However, this isn't always true: sometimes it makes an error and "translates" a non-Latin word, which usually returns the same word slightly garbled. Example: "pollex" -> "pull". All the language detection libraries I've seen aren't very reliable, so this is something we need to make allowances for. Think up ideas to get past this, or language detection frameworks we can use. Building a Latin/non-Latin classification model might work.

2. I added 'i' and 'a' to the stopwords list by looking through the processed data and seeing what stood out. If you have time, do the same, it would help a lot.