## Package Imports

In [1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
import seaborn as sns

import itertools
import re
import string
import pickle


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import KFold
from sklearn.preprocessing import  LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
#from nltk.tokenize import word_tokenize
#from nltk.stem import WordNetLemmatizer, SnowballStemmer
#from nltk.corpus import stopwords, wordnet 
#from wordcloud import WordCloud
from copy import deepcopy

from IPython.display import (
    Markdown as md,
    Latex,
    HTML,
)
from tqdm.auto import tqdm

# set plot style
sns.set()

## Data

### Load

In [2]:
df_train = pd.read_csv("train_set.csv")
df_test = pd.read_csv("test_set.csv")

### View

In [3]:
df_test

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
...,...,...
5677,5678,You mark your ballot in private.
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ..."
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ..."


In [4]:
df_train

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...
32997,eng,closing date for the submission of completed t...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [5]:
languages = list(df_train["lang_id"].unique())
languages

['xho', 'eng', 'nso', 'ven', 'tsn', 'nbl', 'zul', 'ssw', 'tso', 'sot', 'afr']

## Data Cleaning

In [6]:
# Create duplicate dataframe
df = pd.concat([df_train.copy(), df_test.copy()])

In [7]:
df

Unnamed: 0,lang_id,text,index
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,
2,eng,the province of kwazulu-natal department of tr...,
3,nso,o netefatša gore o ba file dilo ka moka tše le...,
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,
...,...,...,...
5677,,You mark your ballot in private.,5678.0
5678,,Ge o ka kgetha ka bowena go se šomiše Mofani k...,5679.0
5679,,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ...",5680.0
5680,,"TB ke bokudi ba PMB, mme Morero o tla lefella ...",5681.0


In [8]:
df['cleaned_text'] = df['text'].str.lower()

In [9]:
# Function for removing url links
def remove_urls(text):
    # Using regular expressions for extracting web urls
    pattern_url = r"(htt[ps._-]*\:?[/.-_?]*(?:[A-Za-z0-9+.-_])*)"
    
    url_pattern = re.compile(pattern_url)
    return url_pattern.sub('', text)

In [10]:
# Applying the remove_urls function to our dataset
df['cleaned_text'] = df['text'].apply(remove_urls)

In [11]:
# replace \n and \t with " "
def remove_newlines(word):
    return word.replace(r"[\n]|[\t]", " ", regex = True)

In [12]:
df["cleaned_text"] = remove_newlines(df["cleaned_text"])

In [13]:
def remove_punctuation(text):
    alphabet = string.ascii_lowercase
    return ''.join([x for x in text if x in alphabet + " "])

In [14]:
df['cleaned_text'] = df['cleaned_text'].apply(remove_punctuation)

In [15]:
for text in df['cleaned_text'].iloc[25:30]:
    print(text)

kantike abantu namhlanje bangabhekana nengozi yokuthi ukufezeka kwalezi zinhloso kungahlehliswa ngeminyaka eminingi uma kungeyiwo amashumi eminyaka ngenxa yenhlekelele yezomnotho ekhungethe umhlaba jikelele
ukuhlelwa kwezehlakalo ezihlukeneko ukusuka ekhonferensini yesithathu yonyaka yethungelelwano leentjhabatjhaba lokwehluka ngamasiko incd ekapa ukuya ekwembathiseni ngehlonipho nemisebenzi yokubulungwa kweensalela zakasarah baartman
ingabe sewutfolile futsi wabambisa indzawo lekutawubanjelwa kuyo imihlangano yelikomidi leliwadi khumbula kutsi lendzawo ifanele ibe yindzawo yesive hhayi nje indzawo yangasense njengelikhaya lemuntfu loku ngulokumiswe ngumtsetfo
langa eli xesha nesizathu salo zingasinika amandla sonke ukuze sibumbane ngenkxaso kumbono wethu wokwenza intshona koloni ikhaya lethu sonke ngokuthi sikhumbule abahlelelekileyo abo bangenabantu nabo badingayo
batho ba bangata ba hlwaya diphoso ho boraditaba bao ka nako e nngwe ba hlahisang tseo re sa batleng ho di kgothaletsa ta

In [16]:
df

Unnamed: 0,lang_id,text,index,cleaned_text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulu-natal department of tr...,,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,,o netefata gore o ba file dilo ka moka te le d...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...,...,...
5677,,You mark your ballot in private.,5678.0,ou mark your ballot in private
5678,,Ge o ka kgetha ka bowena go se šomiše Mofani k...,5679.0,e o ka kgetha ka bowena go se omie ofani ka ti...
5679,,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ...",5680.0,a kopo etsa kgetho ya hao ka hloko hobane ha ...
5680,,"TB ke bokudi ba PMB, mme Morero o tla lefella ...",5681.0,ke bokudi ba mme orero o tla lefella tlhahlo...


## Feature Extraction

In [17]:
df["tokens"] = df["cleaned_text"].str.split()

In [18]:
df_train_clean = df.iloc[:33000]
df_test_clean = df.iloc[33001:].drop(['lang_id'], axis=1)
df_test_clean

Unnamed: 0,text,index,cleaned_text,tokens
1,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,2.0,zakwaziswa ngokufaneleko nakungafuneka eminye ...,"[zakwaziswa, ngokufaneleko, nakungafuneka, emi..."
2,Tshivhumbeo tshi fana na ngano dza vhathu.,3.0,shivhumbeo tshi fana na ngano dza vhathu,"[shivhumbeo, tshi, fana, na, ngano, dza, vhathu]"
3,Kube inja nelikati betingevakala kutsi titsini...,4.0,ube inja nelikati betingevakala kutsi titsini ...,"[ube, inja, nelikati, betingevakala, kutsi, ti..."
4,Winste op buitelandse valuta.,5.0,inste op buitelandse valuta,"[inste, op, buitelandse, valuta]"
5,"Ke feela dilense tše hlakilego, tša pono e tee...",6.0,e feela dilense te hlakilego ta pono e tee gob...,"[e, feela, dilense, te, hlakilego, ta, pono, e..."
...,...,...,...,...
5677,You mark your ballot in private.,5678.0,ou mark your ballot in private,"[ou, mark, your, ballot, in, private]"
5678,Ge o ka kgetha ka bowena go se šomiše Mofani k...,5679.0,e o ka kgetha ka bowena go se omie ofani ka ti...,"[e, o, ka, kgetha, ka, bowena, go, se, omie, o..."
5679,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ...",5680.0,a kopo etsa kgetho ya hao ka hloko hobane ha ...,"[a, kopo, etsa, kgetho, ya, hao, ka, hloko, ho..."
5680,"TB ke bokudi ba PMB, mme Morero o tla lefella ...",5681.0,ke bokudi ba mme orero o tla lefella tlhahlo...,"[ke, bokudi, ba, mme, orero, o, tla, lefella, ..."


## Model Building

In [19]:
vectorizer = CountVectorizer(analyzer = 'word', min_df = 1, ngram_range = (1, 2))

### Split Data

In [23]:
X = vectorizer.fit_transform(df_train_clean["cleaned_text"])
y = df_train_clean['lang_id']

In [21]:
X_test_vect = vectorizer.transform(df_test_clean["cleaned_text"])
X_test_vect

<5681x713044 sparse matrix of type '<class 'numpy.int64'>'
	with 154648 stored elements in Compressed Sparse Row format>

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Build Model

In [25]:
model = MultinomialNB()

model.fit(X_train, y_train)

# Test the model on the testing data
y_pred = model.predict(X_test)

### Model Performance



In [23]:
# Evaluate the performance of the model
acc = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')

# Create a dictionary of the evaluation metrics
performance = {'Accuracy': acc, 'F1 Macro': f1_macro, 'F1 Weighted': f1_weighted}

performance

{'Accuracy': 0.9989393939393939,
 'F1 Macro': 0.9989377604012364,
 'F1 Weighted': 0.9989393945723242}

### Cross-Validation



In [24]:
# Define a list of C values to test
C_values = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]

# Train a LinearSVC model with each value of C and compute the weighted F1 score
best_score = 0
best_C = 0

for C in C_values:
    clf = LinearSVC(C=C)
    #lr = LogisticRegression(C=C)

    scores = cross_val_score(clf, X, y, cv=100, scoring='f1_weighted')
    avg_score = sum(scores)/len(scores)
    print(f"C={C}, average weighted F1 score = {avg_score}")
    if avg_score > best_score:
        best_score = avg_score
        best_C = C

print(f"\nBest score: {best_score}, Best C: {best_C}")



KeyboardInterrupt: 

### Kaggle Submission

In [28]:
df_test

5681

In [30]:
# Making predictions on the test data set
submission = model.predict(X_test_vect)

# Appending the prediction results to the test set
output = pd.DataFrame(data={'index': df_test_clean['index'], 'lang_id': submission})

# Creating a csv file
output.to_csv('kaggle_csv/new_submission.csv', index = False)

