# Sentiment Analysis of Hausa, Igbo, and Yoruba Languages

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [30]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import re


import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, GRU,SimpleRNN
from keras.layers import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import BatchNormalization
#  from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# Loading the datasets, stopwords, and lexicons
languages = ['hausa', 'igbo', 'yoruba']
train_data = {}
dev_data = {}
test_data = {}
stopwords = {}
lexicons = {}

for lang in languages:
    train_data[lang] = pd.read_csv(f'https://raw.githubusercontent.com/sa-diq/sentiment-analysis-Hau-Ibo-Yor-/main/data/{lang}_train.tsv', delimiter='\t')
    dev_data[lang] = pd.read_csv(f'https://raw.githubusercontent.com/sa-diq/sentiment-analysis-Hau-Ibo-Yor-/main/data/{lang}_dev.tsv', delimiter='\t')
    test_data[lang] = pd.read_csv(f'https://raw.githubusercontent.com/sa-diq/sentiment-analysis-Hau-Ibo-Yor-/main/data/{lang}_test.tsv', delimiter='\t')
    stopwords[lang] = pd.read_csv(f'https://raw.githubusercontent.com/sa-diq/sentiment-analysis-Hau-Ibo-Yor-/main/data/{lang}_stopwords.csv')
    lexicons[lang] = pd.read_csv(f'https://raw.githubusercontent.com/sa-diq/sentiment-analysis-Hau-Ibo-Yor-/main/data/{lang}_lexicon.csv')

In [4]:
# Print the first 5 rows of the hausa train data, yoruba train data, and igbo train data and their shapes
print('Hausa Train Data')
print(train_data['hausa'].head())
print(train_data['hausa'].shape)
print('\n')
print('Igbo Train Data')
print(train_data['igbo'].head())
print(train_data['igbo'].shape)
print('\n')
print('Yoruba Train Data')
print(train_data['yoruba'].head())
print(train_data['yoruba'].shape)
print('\n')

Hausa Train Data
                                               tweet     label
0  @user Da kudin da Arewa babu wani abin azo aga...  negative
1  @user Kaga wani Adu ar Banda💔😭 wai a haka Shi ...  negative
2  @user Sai haquri fa yan madrid daman kunce cha...  negative
3  @user Hmmm yanzu kai kasan girman allah daxaka...  negative
4  @user @user Wai gwamno nin Nigeria suna afa kw...  negative
(14172, 2)


Igbo Train Data
                                               tweet     label
0       Nna Ike Gwuru ooo. 😂 https://t.co/NDS7juFBGd  negative
1                 @user Chineke nna kezi mgbe ole???  negative
2  Lol. Isi adirokwanu gi nma.. 😐😒😒😒 https://t.co...  negative
3  @user haha. Fulani herdsmen. akpa amu gi retwe...  negative
4  Nna ghetto di gi na aru biko!!! https://t.co/4...  negative
(10192, 2)


Yoruba Train Data
                                               tweet     label
0  Ìwọ ikú òpònú abaradúdú wọ, o ò ṣe é 're o. O ...  negative
1  Yorùbá nbú'yàn ṣá """"""""..àyà wanle 

# DATA PREPROCESSING

In [5]:
# create a function to clean the text data
def clean_text_data(data):
    if 'text' in data.columns:
        data['text'] = data['text'].apply(clean_text)
    return data
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) #Removing @mentions
    text = re.sub(r'#', '', text) # Removing '#' hash tag
    text = re.sub(r'RT[\s]+', '', text) # Removing RT
    text = re.sub(r'https?:\/\/\S+', '', text) # Removing the hyper link
    text = re.sub(r'[^\w\s]', '', text) # Removing the punctuation
    text = re.sub(r'\d+', '', text) # Removing the digits
    text = text.lower() # Converting the text to lower case
    return text

#  apply the clean_text function to the text data
for lang in languages:
    train_data[lang]['tweet'] = train_data[lang]['tweet'].apply(clean_text)
    dev_data[lang]['tweet'] = dev_data[lang]['tweet'].apply(clean_text)
    test_data[lang]['tweet'] = test_data[lang]['tweet'].apply(clean_text)

In [6]:
# Print the first 5 rows of the hausa train data
print('Hausa Train Data')
print(train_data['hausa'].head())

Hausa Train Data
                                               tweet     label
0   da kudin da arewa babu wani abin azo agani da...  negative
1   kaga wani adu ar banda wai a haka shi ne shug...  negative
2   sai haquri fa yan madrid daman kunce champion...  negative
3   hmmm yanzu kai kasan girman allah daxakace mu...  negative
4        wai gwamno nin nigeria suna afa kwayoyi ko   negative


In [7]:
# Remove the stopwords from the train, test, dev data for the three languages
def remove_stopwords(data, stopwords):
    data['tweet'] = data['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
    return data

for lang in languages:
    train_data[lang] = remove_stopwords(train_data[lang], stopwords[lang])
    dev_data[lang] = remove_stopwords(dev_data[lang], stopwords[lang])
    test_data[lang] = remove_stopwords(test_data[lang], stopwords[lang])

# Print the first 5 rows of the hausa train data
print('Yoruba Train Data')
print(train_data['hausa'].head())



Yoruba Train Data
                                               tweet     label
0  da kudin da arewa babu wani abin azo agani da ...  negative
1  kaga wani adu ar banda wai a haka shi ne shuga...  negative
2  sai haquri fa yan madrid daman kunce champion ...  negative
3  hmmm yanzu kai kasan girman allah daxakace muk...  negative
4         wai gwamno nin nigeria suna afa kwayoyi ko  negative


In [8]:
# Create a function to calculate the sentiment score
def calculate_sentiment_score(text, lexicon_dict):
    score = 0
    words = text.split()
    for word in words:
        score += lexicon_dict.get(word, 0)
    return score

def process_data(df, lexicon):
    lexicon_dict = lexicon.set_index('human')['label'].map({'positive': 1, 'negative': -1}).to_dict()
    df["sentiment_score"] = df["tweet"].apply(calculate_sentiment_score, args=(lexicon_dict,))
    return df

In [9]:
# Apply the process_data function to the train, test, dev data for the three languages   
for lang in languages:
    train_data[lang] = process_data(train_data[lang], lexicons[lang])
    dev_data[lang] = process_data(dev_data[lang], lexicons[lang])
    test_data[lang] = process_data(test_data[lang], lexicons[lang])

In [10]:
# print the first 5 rows of the hausa train data
print('Hausa Train Data')
train_data['hausa'].head()

Hausa Train Data


Unnamed: 0,tweet,label,sentiment_score
0,da kudin da arewa babu wani abin azo agani da ...,negative,-1
1,kaga wani adu ar banda wai a haka shi ne shuga...,negative,1
2,sai haquri fa yan madrid daman kunce champion ...,negative,1
3,hmmm yanzu kai kasan girman allah daxakace muk...,negative,-5
4,wai gwamno nin nigeria suna afa kwayoyi ko,negative,-1


In [11]:
# Perfrom label encoding on the label column
le = preprocessing.LabelEncoder()
for lang in languages:
    train_data[lang]['label'] = le.fit_transform(train_data[lang]['label'])
    dev_data[lang]['label'] = le.transform(dev_data[lang]['label'])
    test_data[lang]['label'] = le.transform(test_data[lang]['label'])



In [12]:
# print the first 5 rows of the hausa train data
print('Hausa Train Data')
train_data['hausa'].head()

Hausa Train Data


Unnamed: 0,tweet,label,sentiment_score
0,da kudin da arewa babu wani abin azo agani da ...,0,-1
1,kaga wani adu ar banda wai a haka shi ne shuga...,0,1
2,sai haquri fa yan madrid daman kunce champion ...,0,1
3,hmmm yanzu kai kasan girman allah daxakace muk...,0,-5
4,wai gwamno nin nigeria suna afa kwayoyi ko,0,-1


In [13]:
# %pip install nltk
# import nltk
# nltk.download('punkt')

# def tokenize_text(text):
#     return nltk.word_tokenize(text)

# # Apply the tokenize_text function to the train, test, dev data for the three languages
# for lang in languages:
#     train_data[lang]['tweet'] = train_data[lang]['tweet'].apply(tokenize_text)
#     dev_data[lang]['tweet'] = dev_data[lang]['tweet'].apply(tokenize_text)
#     test_data[lang]['tweet'] = test_data[lang]['tweet'].apply(tokenize_text)
    
# # print the first 5 rows of the hausa train data
# print('Hausa Train Data')
# train_data['hausa'].head()



In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the preprocessed text data to obtain BoW features
X_train_bow = {}
X_dev_bow = {}
X_test_bow = {}

for lang in languages:
    X_train_bow[lang] = vectorizer.fit_transform(train_data[lang]['tweet'])
    X_dev_bow[lang] = vectorizer.transform(dev_data[lang]['tweet'])
    X_test_bow[lang] = vectorizer.transform(test_data[lang]['tweet'])

# Display the shape of the obtained feature matrices
for lang in languages:
    print(f"Shape of X_train_bow for {lang}:", X_train_bow[lang].shape)
    print(f"Shape of X_dev_bow for {lang}:", X_dev_bow[lang].shape)
    print(f"Shape of X_test_bow for {lang}:", X_test_bow[lang].shape)

Shape of X_train_bow for hausa: (14172, 21786)
Shape of X_dev_bow for hausa: (2677, 21786)
Shape of X_test_bow for hausa: (5303, 21786)
Shape of X_train_bow for igbo: (10192, 15502)
Shape of X_dev_bow for igbo: (1841, 15502)
Shape of X_test_bow for igbo: (3682, 15502)
Shape of X_train_bow for yoruba: (8522, 22316)
Shape of X_dev_bow for yoruba: (2090, 22316)
Shape of X_test_bow for yoruba: (4515, 22316)


In [15]:
# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text data to obtain tf-idf features
X_train_tfidf = {}
X_dev_tfidf = {}
X_test_tfidf = {}

for lang in languages:
    X_train_tfidf[lang] = tfidf_vectorizer.fit_transform(train_data[lang]['tweet'])
    X_dev_tfidf[lang] = tfidf_vectorizer.transform(dev_data[lang]['tweet'])
    X_test_tfidf[lang] = tfidf_vectorizer.transform(test_data[lang]['tweet'])

# Display the shape of the obtained feature matrices
for lang in languages:
    print(f"Shape of X_train_tfidf for {lang}:", X_train_tfidf[lang].shape)
    print(f"Shape of X_dev_tfidf for {lang}:", X_dev_tfidf[lang].shape)
    print(f"Shape of X_test_tfidf for {lang}:", X_test_tfidf[lang].shape)

Shape of X_train_tfidf for hausa: (14172, 21786)
Shape of X_dev_tfidf for hausa: (2677, 21786)
Shape of X_test_tfidf for hausa: (5303, 21786)
Shape of X_train_tfidf for igbo: (10192, 15502)
Shape of X_dev_tfidf for igbo: (1841, 15502)
Shape of X_test_tfidf for igbo: (3682, 15502)
Shape of X_train_tfidf for yoruba: (8522, 22316)
Shape of X_dev_tfidf for yoruba: (2090, 22316)
Shape of X_test_tfidf for yoruba: (4515, 22316)


In [16]:
# # Create a function to include the sentiment score as a feature in the feature matrix
# def add_sentiment_score(X, data):
#     sentiment_score = data['sentiment_score'].values
#     if X.shape[0] != sentiment_score.shape[0]:
#         raise ValueError(f"Shape mismatch: X has shape {X.shape} but sentiment_score has shape {sentiment_score.shape}")
#     return np.concatenate((X.toarray(), sentiment_score.reshape(-1, 1)), axis=1)

# # Apply the add_sentiment_score function to the tf-idf feature matrices for the three languages
# X_train_tfidf_sentiment = {}
# X_dev_tfidf_sentiment = {}
# X_test_tfidf_sentiment = {}

# for lang in languages:
#     X_train_tfidf_sentiment[lang] = add_sentiment_score(X_train_tfidf[lang], train_data[lang])
#     X_dev_tfidf_sentiment[lang] = add_sentiment_score(X_dev_tfidf[lang], dev_data[lang])
#     X_test_tfidf_sentiment[lang] = add_sentiment_score(X_test_tfidf[lang], test_data[lang])
    
# # Display the shape of the obtained feature matrices
# for lang in languages:
#     print(f"Shape of X_train_tfidf_sentiment for {lang}:", X_train_tfidf_sentiment[lang].shape)
#     print(f"Shape of X_dev_tfidf_sentiment for {lang}:", X_dev_tfidf_sentiment[lang].shape)
#     print(f"Shape of X_test_tfidf_sentiment for {lang}:", X_test_tfidf_sentiment[lang].shape)



In [17]:
# Create a function to train a logistic regression model with the given feature matrix and labels
def train_logistic_regression(X, y):
    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)
    return model

# Create a function to train a naive bayes model with the given feature matrix and labels
def train_naive_bayes(X, y):
    model = MultinomialNB()
    model.fit(X, y)
    return model

# Create a function to train a support vector machine model with the given feature matrix and labels
def train_svm(X, y):
    model = SVC()
    model.fit(X, y)
    return model

# Create a function to make predictions using the given model and feature matrix
def predict(model, X):
    return model.predict(X)

# Create a function to evaluate the given model using the given feature matrix and labels
def evaluate_model(model, X, y):
    y_pred = predict(model, X)
    return accuracy_score(y, y_pred)   

In [18]:
# Train logistic regression models using the tf-idf feature matrices for the three languages
logistic_regression_models = {}
for lang in languages:
    logistic_regression_models[lang] = train_logistic_regression(X_train_tfidf[lang], train_data[lang]['label'])
    
# Evaluate the logistic regression models using the dev data
logistic_regression_scores = {}
for lang in languages:
    logistic_regression_scores[lang] = evaluate_model(logistic_regression_models[lang], X_dev_tfidf[lang], dev_data[lang]['label'])
    print(f"Accuracy of logistic regression model for {lang}:", logistic_regression_scores[lang])

# Train naive bayes models using the tf-idf feature matrices for the three languages
naive_bayes_models = {}
for lang in languages:
    naive_bayes_models[lang] = train_naive_bayes(X_train_tfidf[lang], train_data[lang]['label'])

# Evaluate the naive bayes models using the dev data
naive_bayes_scores = {}
for lang in languages:
    naive_bayes_scores[lang] = evaluate_model(naive_bayes_models[lang], X_dev_tfidf[lang], dev_data[lang]['label'])
    print(f"Accuracy of naive bayes model for {lang}:", naive_bayes_scores[lang])



Accuracy of logistic regression model for hausa: 0.7411281285020546
Accuracy of logistic regression model for igbo: 0.77729494839761
Accuracy of logistic regression model for yoruba: 0.7272727272727273
Accuracy of naive bayes model for hausa: 0.711243929772133
Accuracy of naive bayes model for igbo: 0.7278652906029331
Accuracy of naive bayes model for yoruba: 0.6588516746411484


In [19]:
# Test the logistic regression models using the test data and print the classification report
logistic_regression_predictions = {}
for lang in languages:
    logistic_regression_predictions[lang] = predict(logistic_regression_models[lang], X_test_tfidf[lang])
    print(f"Classification report for logistic regression model for {lang}:\n", classification_report(test_data[lang]['label'], logistic_regression_predictions[lang]))

# Test the naive bayes models using the test data and print the classification report
naive_bayes_predictions = {}
for lang in languages:
    naive_bayes_predictions[lang] = predict(naive_bayes_models[lang], X_test_tfidf[lang])
    print(f"Classification report for naive bayes model for {lang}:\n", classification_report(test_data[lang]['label'], naive_bayes_predictions[lang]))
    


Classification report for logistic regression model for hausa:
               precision    recall  f1-score   support

           0       0.73      0.66      0.69      1759
           1       0.66      0.76      0.71      1789
           2       0.85      0.80      0.82      1755

    accuracy                           0.74      5303
   macro avg       0.75      0.74      0.74      5303
weighted avg       0.75      0.74      0.74      5303

Classification report for logistic regression model for igbo:
               precision    recall  f1-score   support

           0       0.80      0.61      0.70       943
           1       0.72      0.88      0.79      1621
           2       0.87      0.76      0.81      1118

    accuracy                           0.78      3682
   macro avg       0.80      0.75      0.77      3682
weighted avg       0.79      0.78      0.77      3682

Classification report for logistic regression model for yoruba:
               precision    recall  f1-score   

In [20]:
#  Create a function to tune the hyperparameters of the logistic regression model
def tune_logistic_regression(X, y):
    model = LogisticRegression(max_iter=1000)
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid = GridSearchCV(model, param_grid, cv=5)
    grid.fit(X, y)
    return grid.best_estimator_

# Create a function to tune the hyperparameters of the naive bayes model
def tune_naive_bayes(X, y):
    model = MultinomialNB()
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid = GridSearchCV(model, param_grid, cv=5)
    grid.fit(X, y)
    return grid.best_estimator_

# Tune the hyperparameters of the logistic regression models using the tf-idf feature matrices for the three languages
tuned_logistic_regression_models = {}
for lang in languages:
    tuned_logistic_regression_models[lang] = tune_logistic_regression(X_train_tfidf[lang], train_data[lang]['label'])
    
# Evaluate the tuned logistic regression models using the dev data
tuned_logistic_regression_scores = {}
for lang in languages:
    tuned_logistic_regression_scores[lang] = evaluate_model(tuned_logistic_regression_models[lang], X_dev_tfidf[lang], dev_data[lang]['label'])
    print(f"Accuracy of tuned logistic regression model for {lang}:", tuned_logistic_regression_scores[lang])

# Tune the hyperparameters of the naive bayes models using the tf-idf feature matrices for the three languages
tuned_naive_bayes_models = {}
for lang in languages:
    tuned_naive_bayes_models[lang] = tune_naive_bayes(X_train_tfidf[lang], train_data[lang]['label'])

Accuracy of tuned logistic regression model for hausa: 0.7373926036608144
Accuracy of tuned logistic regression model for igbo: 0.7800108636610538
Accuracy of tuned logistic regression model for yoruba: 0.7421052631578947


In [21]:
# Test the tuned logistic regression models using the test data and print the classification report
tuned_logistic_regression_predictions = {}
for lang in languages:
    tuned_logistic_regression_predictions[lang] = predict(tuned_logistic_regression_models[lang], X_test_tfidf[lang])
    print(f"Classification report for tuned logistic regression model for {lang}:\n", classification_report(test_data[lang]['label'], tuned_logistic_regression_predictions[lang]))
    
# Test the tuned naive bayes models using the test data and print the classification report
tuned_naive_bayes_predictions = {}
for lang in languages:
    tuned_naive_bayes_predictions[lang] = predict(tuned_naive_bayes_models[lang], X_test_tfidf[lang])
    print(f"Classification report for tuned naive bayes model for {lang}:\n", classification_report(test_data[lang]['label'], tuned_naive_bayes_predictions[lang]))

Classification report for tuned logistic regression model for hausa:
               precision    recall  f1-score   support

           0       0.73      0.68      0.70      1759
           1       0.68      0.74      0.71      1789
           2       0.84      0.82      0.83      1755

    accuracy                           0.75      5303
   macro avg       0.75      0.75      0.75      5303
weighted avg       0.75      0.75      0.75      5303

Classification report for tuned logistic regression model for igbo:
               precision    recall  f1-score   support

           0       0.74      0.68      0.71       943
           1       0.75      0.82      0.78      1621
           2       0.83      0.78      0.81      1118

    accuracy                           0.77      3682
   macro avg       0.77      0.76      0.77      3682
weighted avg       0.77      0.77      0.77      3682

Classification report for tuned logistic regression model for yoruba:
               precision    r

In [22]:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Create a function to train a Random Forest model with the given feature matrix and labels
def train_random_forest(X, y):
    model = RandomForestClassifier()
    model.fit(X, y)
    return model

# Train Random Forest models using the tf-idf feature matrices for the three languages
random_forest_models = {}
for lang in languages:
    random_forest_models[lang] = train_random_forest(X_train_tfidf[lang], train_data[lang]['label'])

# Test the Random Forest models using the test data and print the classification report
random_forest_predictions = {}
for lang in languages:
    random_forest_predictions[lang] = predict(random_forest_models[lang], X_test_tfidf[lang])
    print(f"Classification report for Random Forest model for {lang}:\n", classification_report(test_data[lang]['label'], random_forest_predictions[lang]))

# Create a function to train a Gradient Boosting model with the given feature matrix and labels
def train_gradient_boosting(X, y):
    model = GradientBoostingClassifier()
    model.fit(X, y)
    return model

# Train Gradient Boosting models using the tf-idf feature matrices for the three languages
gradient_boosting_models = {}
for lang in languages:
    gradient_boosting_models[lang] = train_gradient_boosting(X_train_tfidf[lang], train_data[lang]['label'])

# Test the Gradient Boosting models using the test data and print the classification report
gradient_boosting_predictions = {}
for lang in languages:
    gradient_boosting_predictions[lang] = predict(gradient_boosting_models[lang], X_test_tfidf[lang])
    print(f"Classification report for Gradient Boosting model for {lang}:\n", classification_report(test_data[lang]['label'], gradient_boosting_predictions[lang]))
    
# Create a function to train a XGBoost model with the given feature matrix and labels
def train_xgboost(X, y):
    model = XGBClassifier()
    model.fit(X, y)
    return model

# Train XGBoost models using the tf-idf feature matrices for the three languages
xgboost_models = {}
for lang in languages:
    xgboost_models[lang] = train_xgboost(X_train_tfidf[lang], train_data[lang]['label'])

# Test the XGBoost models using the test data and print the classification report
xgboost_predictions = {}
for lang in languages:
    xgboost_predictions[lang] = predict(xgboost_models[lang], X_test_tfidf[lang])
    print(f"Classification report for XGBoost model for {lang}:\n", classification_report(test_data[lang]['label'], xgboost_predictions[lang]))



Classification report for Random Forest model for hausa:
               precision    recall  f1-score   support

           0       0.73      0.58      0.65      1759
           1       0.63      0.79      0.70      1789
           2       0.83      0.79      0.81      1755

    accuracy                           0.72      5303
   macro avg       0.73      0.72      0.72      5303
weighted avg       0.73      0.72      0.72      5303

Classification report for Random Forest model for igbo:
               precision    recall  f1-score   support

           0       0.77      0.59      0.67       943
           1       0.70      0.87      0.77      1621
           2       0.88      0.74      0.80      1118

    accuracy                           0.76      3682
   macro avg       0.78      0.73      0.75      3682
weighted avg       0.77      0.76      0.76      3682

Classification report for Random Forest model for yoruba:
               precision    recall  f1-score   support

         

In [23]:
# Create a function to tokenize the text data using the Tokenizer class
def tokenize_text(data, max_words):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(data)
    return tokenizer

# Tokenize the text data using the Tokenizer class
max_words = 1000
tokenizers = {}
for lang in languages:
    tokenizers[lang] = tokenize_text(train_data[lang]['tweet'], max_words)

# Create a function to convert the text data to sequences using the Tokenizer class
def convert_to_sequences(tokenizer, data):
    return tokenizer.texts_to_sequences(data)

# Convert the text data to sequences using the Tokenizer class
X_train_sequences = {}
X_dev_sequences = {}
X_test_sequences = {}

for lang in languages:
    X_train_sequences[lang] = convert_to_sequences(tokenizers[lang], train_data[lang]['tweet'])
    X_dev_sequences[lang] = convert_to_sequences(tokenizers[lang], dev_data[lang]['tweet'])
    X_test_sequences[lang] = convert_to_sequences(tokenizers[lang], test_data[lang]['tweet'])

# Add padding to the sequences
maxlen = 100
X_train_sequences_padded = {}
X_dev_sequences_padded = {}
X_test_sequences_padded = {}

for lang in languages:
    X_train_sequences_padded[lang] = pad_sequences(X_train_sequences[lang], maxlen=maxlen)
    X_dev_sequences_padded[lang] = pad_sequences(X_dev_sequences[lang], maxlen=maxlen)
    X_test_sequences_padded[lang] = pad_sequences(X_test_sequences[lang], maxlen=maxlen)


# Create a function to one-hot encode the labels
def one_hot_encode_labels(y, num_classes):
    return tf.keras.utils.to_categorical(y, num_classes)

# One-hot encode the labels
num_classes = 3
y_train_one_hot = {}
y_dev_one_hot = {}
y_test_one_hot = {}

for lang in languages:
    y_train_one_hot[lang] = one_hot_encode_labels(train_data[lang]['label'], num_classes)
    y_dev_one_hot[lang] = one_hot_encode_labels(dev_data[lang]['label'], num_classes)
    y_test_one_hot[lang] = one_hot_encode_labels(test_data[lang]['label'], num_classes)



In [24]:
# Create a function to train a simple RNN model with the given feature matrix and labels
def train_simple_rnn(X, y, num_classes, max_words, maxlen):
    model = Sequential()
    model.add(Embedding(max_words, 32, input_length=maxlen))
    model.add(SimpleRNN(32))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)
    return model

# Train simple RNN models using the sequences for the three languages
simple_rnn_models = {}
for lang in languages:
    simple_rnn_models[lang] = train_simple_rnn(X_train_sequences_padded[lang], y_train_one_hot[lang], num_classes, max_words, maxlen)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Evaluate the simple RNN models using the dev data and test data and print Accuracy and weighted F1 score
simple_rnn_scores = {}
simple_rnn_predictions = {}
for lang in languages:
    simple_rnn_scores[lang] = simple_rnn_models[lang].evaluate(X_dev_sequences_padded[lang], y_dev_one_hot[lang])
    simple_rnn_predictions[lang] = simple_rnn_models[lang].predict(X_test_sequences_padded[lang])
    print(f"Accuracy of simple RNN model for {lang}:", simple_rnn_scores[lang][1])
    print(f"Weighted F1 score of simple RNN model for {lang}:", metrics.f1_score(test_data[lang]['label'], np.argmax(simple_rnn_predictions[lang], axis=1), average='weighted'))

 1/84 [..............................] - ETA: 1s - loss: 1.4178 - accuracy: 0.5938

Accuracy of simple RNN model for hausa: 0.6271946430206299
Weighted F1 score of simple RNN model for hausa: 0.6471762893448432
Accuracy of simple RNN model for igbo: 0.7007061243057251
Weighted F1 score of simple RNN model for igbo: 0.6906287986238145
Accuracy of simple RNN model for yoruba: 0.6320574283599854
Weighted F1 score of simple RNN model for yoruba: 0.6110113738060609


In [33]:
# improve the simple RNN model by adding more layers
def train_improved_simple_rnn(X, y, num_classes, max_words, maxlen):
    model = Sequential()
    model.add(Embedding(max_words, 128, input_length=maxlen))
    model.add(SimpleRNN(64, kernel_regularizer=l2(0.01), return_sequences=True))
    model.add(Dropout(0.2))
    
    model.add(SimpleRNN(64))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X, y, epochs=10, batch_size=128, validation_split=0.2)
    return model

# Train improved simple RNN models using the sequences for the three languages
improved_simple_rnn_models = {}
for lang in languages:
    improved_simple_rnn_models[lang] = train_improved_simple_rnn(X_train_sequences_padded[lang], y_train_one_hot[lang], num_classes, max_words, maxlen)
    
# Evaluate the improved simple RNN models using the dev data and test data and print Accuracy and weighted F1 score
improved_simple_rnn_scores = {}
improved_simple_rnn_predictions = {}
for lang in languages:
    improved_simple_rnn_scores[lang] = improved_simple_rnn_models[lang].evaluate(X_dev_sequences_padded[lang], y_dev_one_hot[lang])
    improved_simple_rnn_predictions[lang] = improved_simple_rnn_models[lang].predict(X_test_sequences_padded[lang])
    print(f"Accuracy of improved simple RNN model for {lang}:", improved_simple_rnn_scores[lang][1])
    print(f"Weighted F1 score of improved simple RNN model for {lang}:", metrics.f1_score(test_data[lang]['label'], np.argmax(improved_simple_rnn_predictions[lang], axis=1), average='weighted'))



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy of improved simple RNN model for hausa: 0.6454986929893494
Weighted F1 score of improved simple RNN model for hausa: 0.665480017667365
Accuracy of improved simple RNN model for igbo: 0.6985334157943726
Weighted F1 score of improved simple RNN model for igbo: 0.6976627440037287
Accuracy of improved simple RNN model for yoruba: 0.6200956702232361
Weighted F1 score of improved simple RNN model for yoruba: 0.5985030584713956


In [None]:
# Create a function 