## Importing modules

In [None]:
from gensim.models import Word2Vec
import pandas as pd
import kkltk

## Cleaning the dataset

### Kin - train

In [None]:
import os

import pandas as pd
from kkltk.kin_kir_stopwords import stopwords   # check https://github.com/Andrews2017/kkltk for more detailed information about how to use kkltk package

# Purpose: clean the KINNEWS training split (titles and contents), drop the
# rows labelled 8 or 10, and save the result to cleaned/train.csv.

stopset_kin = stopwords.words('kinyarwanda')
# Lower-cased lookup set: membership tests are O(1) and the comparison below
# is case-insensitive (tokens are lower-cased before being compared).
stopword_lookup = frozenset(word.lower() for word in stopset_kin)

# Loading the data
data = pd.read_csv('KINNEWS_train.csv')

# Cleaning the data (preprocessing)
# NOTE: regex=True is passed explicitly; since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would silently
# leave the text untouched.
for column in ('title', 'content'):
    # Missing values become empty strings instead of the literal token "nan".
    text = data[column].fillna('').astype(str)
    # URLs must go first: once ':' and '/' are stripped they can no longer be
    # recognised and would survive as garbage tokens.
    text = text.str.replace(r'https?://\S+', '', regex=True)
    # Line breaks become spaces so the words around them are not glued together.
    text = text.str.replace(r'[\r\n]+', ' ', regex=True)
    # Keep only ASCII letters, whitespace, the typographic apostrophe and hyphens.
    text = text.str.replace(r'[^A-Za-z\s\’\-]+', '', regex=True)
    # Lower-case BEFORE the stopword test so capitalised stopwords
    # (e.g. at the start of a sentence) are removed as well.
    data[column] = text.apply(
        lambda doc: ' '.join(tok for tok in doc.lower().split() if tok not in stopword_lookup)
    )

# Filter out rows where labels are 8 or 10
data = data[~data['label'].isin([8, 10])]

# Print the cleaned data
print(data)

# Save the cleaned dataset (create the output directory on first run)
os.makedirs("cleaned", exist_ok=True)
data.to_csv("cleaned/train.csv", index=False)

### Kin - test

In [None]:
import os

import pandas as pd
from kkltk.kin_kir_stopwords import stopwords   # check https://github.com/Andrews2017/kkltk for more detailed information about how to use kkltk package

# Purpose: clean the KINNEWS test split (titles and contents), drop the rows
# labelled 8 or 10, and save the result to cleaned/test.csv.

stopset_kin = stopwords.words('kinyarwanda')
# Lower-cased lookup set: membership tests are O(1) and the comparison below
# is case-insensitive (tokens are lower-cased before being compared).
stopword_lookup = frozenset(word.lower() for word in stopset_kin)

# Loading the data
data = pd.read_csv('KINNEWS_test.csv')

# Cleaning the data (preprocessing)
# NOTE: regex=True is passed explicitly; since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would silently
# leave the text untouched.
for column in ('title', 'content'):
    # Missing values become empty strings instead of the literal token "nan".
    text = data[column].fillna('').astype(str)
    # URLs must go first: once ':' and '/' are stripped they can no longer be
    # recognised and would survive as garbage tokens.
    text = text.str.replace(r'https?://\S+', '', regex=True)
    # Line breaks become spaces so the words around them are not glued together.
    text = text.str.replace(r'[\r\n]+', ' ', regex=True)
    # Keep only ASCII letters, whitespace, the typographic apostrophe and hyphens.
    text = text.str.replace(r'[^A-Za-z\s\’\-]+', '', regex=True)
    # Lower-case BEFORE the stopword test so capitalised stopwords
    # (e.g. at the start of a sentence) are removed as well.
    data[column] = text.apply(
        lambda doc: ' '.join(tok for tok in doc.lower().split() if tok not in stopword_lookup)
    )

# Filter out rows where labels are 8 or 10
data = data[~data['label'].isin([8, 10])]

# Print the cleaned data
print(data)

# Save the cleaned dataset (create the output directory on first run)
os.makedirs("cleaned", exist_ok=True)
data.to_csv("cleaned/test.csv", index=False)

## Embeddings

In [None]:
from gensim.models import Word2Vec
import pandas as pd

# Purpose: train 50-dimensional skip-gram Word2Vec embeddings on the cleaned
# Kinyarwanda train+test documents and export them as plain text, one
# "<token> <v1> ... <v50>" line per vocabulary entry.

# load the data
data_train = pd.read_csv('cleaned/train.csv')
data_test = pd.read_csv('cleaned/test.csv')
data = pd.concat([data_train, data_test])
# Empty CSV fields are read back as NaN. Without fillna a NaN title makes the
# whole concatenation NaN (breaking the tokenisation below), and a NaN content
# would inject the literal token "nan" into the corpus.
data['whole_doc'] = data['title'].fillna('').astype(str) + ' ' + data['content'].fillna('').astype(str)

# clean the data (preprocessing)
# NOTE: regex=True is passed explicitly; since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default.
# URLs are removed first, before ':' and '/' are stripped by the character filter.
data.whole_doc = data.whole_doc.str.replace(r'https?://\S+', '', regex=True)
# Line breaks become spaces so the words around them are not glued together.
data.whole_doc = data.whole_doc.str.replace(r'[\r\n]+', ' ', regex=True)
data.whole_doc = data.whole_doc.str.replace(r'[^A-Za-z\s\’\-]+', '', regex=True)

# Create the list of list format of the custom corpus for gensim modeling.
# str.split() without arguments splits on any whitespace run and never yields
# empty tokens; documents left without tokens are dropped.
sent = [doc.lower().split() for doc in data['whole_doc']]
sent = [tokens for tokens in sent if tokens]

# Training the model
w2v_model = Word2Vec(sent, window=5, min_count=5, sg=1, hs=1, vector_size=50)

# Generate a list of words with their vectors to make the custom embeddings generation possible
w2v_vectors = [
    ' '.join([token, *(str(value) for value in w2v_model.wv[token])])
    for token in w2v_model.wv.key_to_index
]

# Save the above embeddings list in txt file.
# Explicit UTF-8: tokens may contain the non-ASCII apostrophe ’, and the
# platform default encoding is not guaranteed to represent it.
with open("W2V-Kin-50.txt", 'w', encoding='utf-8') as output:
    output.writelines(row + '\n' for row in w2v_vectors)

In [None]:
from gensim.models import Word2Vec
import pandas as pd

# Purpose: train 50-dimensional skip-gram Word2Vec embeddings on the cleaned
# Kirundi train+test documents and export them as plain text, one
# "<token> <v1> ... <v50>" line per vocabulary entry.

# load the data
data_train = pd.read_csv('cleaned/kir_train.csv')
data_test = pd.read_csv('cleaned/kir_test.csv')
data = pd.concat([data_train, data_test])
# Empty CSV fields are read back as NaN. Without fillna a NaN title makes the
# whole concatenation NaN (breaking the tokenisation below), and a NaN content
# would inject the literal token "nan" into the corpus.
data['whole_doc'] = data['title'].fillna('').astype(str) + ' ' + data['content'].fillna('').astype(str)

# clean the data (preprocessing)
# NOTE: regex=True is passed explicitly; since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default.
# URLs are removed first, before ':' and '/' are stripped by the character filter.
data.whole_doc = data.whole_doc.str.replace(r'https?://\S+', '', regex=True)
# Line breaks become spaces so the words around them are not glued together.
data.whole_doc = data.whole_doc.str.replace(r'[\r\n]+', ' ', regex=True)
data.whole_doc = data.whole_doc.str.replace(r'[^A-Za-z\s\’\-]+', '', regex=True)

# Create the list of list format of the custom corpus for gensim modeling.
# str.split() without arguments splits on any whitespace run and never yields
# empty tokens; documents left without tokens are dropped.
sent = [doc.lower().split() for doc in data['whole_doc']]
sent = [tokens for tokens in sent if tokens]

# Training the model
w2v_model = Word2Vec(sent, window=5, min_count=5, sg=1, hs=1, vector_size=50)

# Generate a list of words with their vectors to make the custom embeddings generation possible
w2v_vectors = [
    ' '.join([token, *(str(value) for value in w2v_model.wv[token])])
    for token in w2v_model.wv.key_to_index
]

# Save the above embeddings list in txt file.
# Explicit UTF-8: tokens may contain the non-ASCII apostrophe ’, and the
# platform default encoding is not guaranteed to represent it.
with open("W2V-Kir-50.txt", 'w', encoding='utf-8') as output:
    output.writelines(row + '\n' for row in w2v_vectors)