In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd

In [2]:
# Read the training and test splits from their CSV files (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

# Print the "labels" column of the training split.
print(train_data["labels"])

0        pt
1        bg
2        zh
3        th
4        ru
         ..
69995    ja
69996    el
69997    ur
69998    es
69999    hi
Name: labels, Length: 70000, dtype: object


In [3]:
# Lookup table from the two-letter language codes stored in the "labels"
# column to the English name of each language.
abreviations_to_full_label_names = {
    "nl": "Dutch",
    "es": "Spanish",
    "it": "Italian",
    "ar": "Arabic",
    "ru": "Russian",
    "tr": "Turkish",
    "bg": "Bulgarian",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "fr": "French",
    "hi": "Hindi",
    "ja": "Japanese",
    "pl": "Polish",
    "pt": "Portuguese",
    "sw": "Swahili",
    "th": "Thai",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "zh": "Chinese",
}

# Spot-check one label: the "labels" value at row label 10 of the test split.
test_data.loc[10, "labels"]

'it'

In [4]:
# Expand the two-letter codes in the test labels to full language names.
# The "labels" column is overwritten in place, so when this cell is re-run the
# column already holds full names; those are passed through unchanged instead
# of raising KeyError the way a plain dictionary lookup would.
known_language_names = set(abreviations_to_full_label_names.values())
full_labels = test_data["labels"].map(
    lambda label: abreviations_to_full_label_names.get(label, label)
)

# Stay fail-loud for anything that is neither a known code nor a known full
# name (str() keeps the sort safe if a missing value sneaks in).
unrecognised_labels = set(full_labels) - known_language_names
if unrecognised_labels:
    raise KeyError(
        f"Unrecognised language labels: {sorted(map(str, unrecognised_labels))}"
    )

test_data["labels"] = full_labels
print(test_data)

          labels                                               text
0          Dutch                    Een man zingt en speelt gitaar.
1          Dutch  De technologisch geplaatste Nasdaq Composite I...
2        Spanish  Es muy resistente la parte trasera rígida y lo...
3        Italian  "In tanti modi diversi, l'abilità artistica de...
4         Arabic  منحدر يواجه العديد من النقاشات المتجهه إزاء ال...
...          ...                                                ...
9995     Chinese                               史料很充分，对岸的很多观点与大陆迥异啊。
9996     Turkish  Örneğin, teşhis Yunanca bir kelimeden alındı (...
9997  Vietnamese  Nếu lite/light chỉ đơn giản là mô tả một đặc t...
9998   Bulgarian  Например, една щатска столица, която посетихме...
9999      Polish                   Mam dla ciebie kilka propozycji:

[10000 rows x 2 columns]


In [5]:
# Expand the two-letter codes in the training labels to full language names.
# The "labels" column is overwritten in place, so when this cell is re-run the
# column already holds full names; those are passed through unchanged instead
# of raising KeyError the way a plain dictionary lookup would.
known_language_names = set(abreviations_to_full_label_names.values())
full_labels = train_data["labels"].map(
    lambda label: abreviations_to_full_label_names.get(label, label)
)

# Stay fail-loud for anything that is neither a known code nor a known full
# name (str() keeps the sort safe if a missing value sneaks in).
unrecognised_labels = set(full_labels) - known_language_names
if unrecognised_labels:
    raise KeyError(
        f"Unrecognised language labels: {sorted(map(str, unrecognised_labels))}"
    )

train_data["labels"] = full_labels
print(train_data)

           labels                                               text
0      Portuguese  os chefes de defesa da estónia, letónia, lituâ...
1       Bulgarian  размерът на хоризонталната мрежа може да бъде ...
2         Chinese  很好，以前从不去评价，不知道浪费了多少积分，现在知道积分可以换钱，就要好好评价了，后来我就把...
3            Thai  สำหรับ ของเก่า ที่ จริงจัง ลอง   honeychurch  ...
4         Russian                             Он увеличил давление .
...           ...                                                ...
69995    Japanese  本格的なゲーミングヘッドホンでした。 今まで使ってた1万円するパナソニックのヘッドホンは何だ...
69996       Greek  Ναι , ξέρω ένα που είναι ακόμα έτσι , αλλά αυτ...
69997        Urdu  اور مجھے اس ملک کے بارے میں معلوم نہیں ہے کہ گ...
69998     Spanish  Se me rompió uno al sacarlo del cargador. Cali...
69999       Hindi  कोसोवो कथा का विवरण जिसमें स ् थानीय राष ् ट ्...

[70000 rows x 2 columns]


In [6]:
# Split each frame into model inputs (the "text" column) and targets (the
# "labels" column).
X_train, y_train = train_data["text"], train_data["labels"]
X_test, y_test = test_data["text"], test_data["labels"]

# Print the training targets.
print(y_train)

0        Portuguese
1         Bulgarian
2           Chinese
3              Thai
4           Russian
            ...    
69995      Japanese
69996         Greek
69997          Urdu
69998       Spanish
69999         Hindi
Name: labels, Length: 70000, dtype: object


In [7]:
from sklearn.feature_extraction.text import CountVectorizer


In [8]:
# Author's note: found out about CountVectorizer through ChatGPT.
# CountVectorizer converts text into count vectors that the classifier can process.
# It is fitted on the training texts only (fit_transform); the test texts are
# then encoded with that already-fitted vectorizer (transform).
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [9]:
# Create a multinomial Naive Bayes classifier with its default settings; it is
# not trained until fit() is called on it.
classifier = MultinomialNB()

In [10]:
# Train the classifier on the training count vectors and their language labels.
# scikit-learn's fit() returns the estimator itself, so rebinding `classifier`
# keeps the same object; the assignment also leaves the cell without a trailing
# expression to display.
classifier = classifier.fit(X_train_counts, y_train)


In [11]:
# Predict a language label for every text in the test split.
y_pred = classifier.predict(X_test_counts)


In [12]:
# Score the predictions against the true test labels and report the result.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9214


In [13]:
def predict_language(sample_text, fitted_vectorizer=None, fitted_classifier=None):
    """Predict the language label for a single piece of text.

    Parameters
    ----------
    sample_text : str
        The word or phrase to classify.
    fitted_vectorizer : optional
        Object exposing ``transform(list_of_texts)``. When omitted, the
        notebook-level ``vectorizer`` is used.
    fitted_classifier : optional
        Object exposing ``predict(features)``. When omitted, the
        notebook-level ``classifier`` is used.

    Returns
    -------
    The first (and only) entry of the classifier's prediction for the text.
    """
    # Fall back to the notebook globals only when no explicit object is given,
    # so the function can also be used (and tested) with other fitted models.
    text_vectorizer = vectorizer if fitted_vectorizer is None else fitted_vectorizer
    language_model = classifier if fitted_classifier is None else fitted_classifier

    # transform() takes a collection of documents, so wrap the single text in a list.
    input_text_counts = text_vectorizer.transform([sample_text])
    predicted_language = language_model.predict(input_text_counts)
    return predicted_language[0]

In [14]:
# Ask the user for a piece of text. input() waits for keyboard input, so this
# cell pauses a "Run All" until something is entered.
text_sample = input("Please enter a word/phrase")

# Classify the entered text and print the predicted language name.
text_sample_predict = predict_language(text_sample)
print(text_sample_predict)

Spanish
