In [1]:
import pandas as pd

In [3]:
df=pd.read_csv(r"C:\Users\hp\NLP_PROJECT\english_swahili_sentence_pairs.csv")

In [5]:
df

Unnamed: 0,English sentence,Swahili Translation
0,I am,mimi ni
1,U,wewe
2,him,yeye
3,her,yeye
4,you,wewe
...,...,...
210466,A significant number of corporate companies wi...,Idadi kubwa ya kampuni za ushirika zitashindan...
210467,They held their annual seminar to educate athl...,Walifanya semina yao ya kila mwaka kuelimisha ...
210468,Uganda will be sending a capable group to the ...,Uganda itakuwa ikituma kikundi chenye uwezo kw...
210469,The coach has confidence that his team will wi...,Kocha anajiamini kuwa timu yake itashinda mchezo.


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210471 entries, 0 to 210470
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   English sentence     210470 non-null  object
 1   Swahili Translation  210470 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


In [9]:
#  missing values
df_cleaned = df.dropna()

#  Remove duplicate sentence pairs
df_cleaned = df_cleaned.drop_duplicates()


In [11]:

# Strip extra spaces from both columns
df_cleaned["English sentence"] = df_cleaned["English sentence"].str.strip()
df_cleaned["Swahili Translation"] = df_cleaned["Swahili Translation"].str.strip()


In [13]:
df_cleaned

Unnamed: 0,English sentence,Swahili Translation
0,I am,mimi ni
1,U,wewe
2,him,yeye
3,her,yeye
4,you,wewe
...,...,...
210109,The national army safeguards the country's bor...,Jeshi la Kitaifa linalinda mpaka wa nchi hiyo ...
210121,I participated in last year's motor rally cham...,Nilishiriki katika Mashindano ya Rally ya Moto...
210128,This court mainly deals with crimes committed ...,Korti hii inashughulikia uhalifu uliofanywa na...
210164,The company terminated their contract yesterday.,Kampuni hiyo ilisitisha mkataba wao jana.


In [15]:

#  Remove sentences that are too short (less than 2 characters)
df_cleaned = df_cleaned[df_cleaned["English sentence"].str.len() > 1]
df_cleaned = df_cleaned[df_cleaned["Swahili Translation"].str.len() > 1]


In [17]:

#  Convert to lowercase
df_cleaned["English sentence"] = df_cleaned["English sentence"].str.lower()
df_cleaned["Swahili Translation"] = df_cleaned["Swahili Translation"].str.lower()


In [19]:

df_cleaned.info(), df_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
Index: 203326 entries, 0 to 210169
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   English sentence     203326 non-null  object
 1   Swahili Translation  203326 non-null  object
dtypes: object(2)
memory usage: 4.7+ MB


(None,
   English sentence Swahili Translation
 0             i am             mimi ni
 2              him                yeye
 3              her                yeye
 4              you                wewe
 5               we                sisi)

In [21]:
import re

# Function to clean text by removing numbers and unwanted characters
def clean_text(text):
    text = re.sub(r"[^a-zA-Z.,?!'’ ]", "", text)  # Keep letters, spaces, and punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Apply the function to both columns
df_cleaned["English sentence"] = df_cleaned["English sentence"].apply(clean_text)
df_cleaned["Swahili Translation"] = df_cleaned["Swahili Translation"].apply(clean_text)

# Show sample cleaned data
df_cleaned.head()


Unnamed: 0,English sentence,Swahili Translation
0,i am,mimi ni
2,him,yeye
3,her,yeye
4,you,wewe
5,we,sisi


In [23]:
df_cleaned.tail()


Unnamed: 0,English sentence,Swahili Translation
210109,the national army safeguards the country's bor...,jeshi la kitaifa linalinda mpaka wa nchi hiyo ...
210121,i participated in last year's motor rally cham...,nilishiriki katika mashindano ya rally ya moto...
210128,this court mainly deals with crimes committed ...,korti hii inashughulikia uhalifu uliofanywa na...
210164,the company terminated their contract yesterday.,kampuni hiyo ilisitisha mkataba wao jana.
210169,the ministry of health has posted five health ...,wizara ya afya imeweka wafanyikazi watano wa a...


In [25]:
# Function to format text properly
def format_sentence(text):
    if len(text) == 0:
        return text  # Skip empty strings
    text = text.capitalize()  # Ensure first letter is uppercase
    if text[-1] not in ".?!":  # Ensure proper sentence ending
        text += "."  
    return text

# Apply formatting to both columns
df_cleaned["English sentence"] = df_cleaned["English sentence"].apply(format_sentence)
df_cleaned["Swahili Translation"] = df_cleaned["Swahili Translation"].apply(format_sentence)

# Show formatted sentences
df_cleaned.tail()


Unnamed: 0,English sentence,Swahili Translation
210109,The national army safeguards the country's bor...,Jeshi la kitaifa linalinda mpaka wa nchi hiyo ...
210121,I participated in last year's motor rally cham...,Nilishiriki katika mashindano ya rally ya moto...
210128,This court mainly deals with crimes committed ...,Korti hii inashughulikia uhalifu uliofanywa na...
210164,The company terminated their contract yesterday.,Kampuni hiyo ilisitisha mkataba wao jana.
210169,The ministry of health has posted five health ...,Wizara ya afya imeweka wafanyikazi watano wa a...


**Checking wordlength imbalance**

In [28]:
# Compute word count for each sentence
df_cleaned["english_length"] = df_cleaned["English sentence"].apply(lambda x: len(x.split()))
df_cleaned["swahili_length"] = df_cleaned["Swahili Translation"].apply(lambda x: len(x.split()))

# Compute length ratio (English words / Swahili words)
df_cleaned["length_ratio"] = df_cleaned["english_length"] / df_cleaned["swahili_length"]

# Remove extreme imbalances (e.g., ratio > 3 or < 0.3)
df_cleaned = df_cleaned[(df_cleaned["length_ratio"] < 3) & (df_cleaned["length_ratio"] > 0.3)]



In [30]:
df_cleaned

Unnamed: 0,English sentence,Swahili Translation,english_length,swahili_length,length_ratio
0,I am.,Mimi ni.,2,2,1.000000
2,Him.,Yeye.,1,1,1.000000
3,Her.,Yeye.,1,1,1.000000
4,You.,Wewe.,1,1,1.000000
5,We.,Sisi.,1,1,1.000000
...,...,...,...,...,...
210109,The national army safeguards the country's bor...,Jeshi la kitaifa linalinda mpaka wa nchi hiyo ...,11,14,0.785714
210121,I participated in last year's motor rally cham...,Nilishiriki katika mashindano ya rally ya moto...,8,11,0.727273
210128,This court mainly deals with crimes committed ...,Korti hii inashughulikia uhalifu uliofanywa na...,10,9,1.111111
210164,The company terminated their contract yesterday.,Kampuni hiyo ilisitisha mkataba wao jana.,6,6,1.000000


In [32]:

# Drop helper columns
df_cleaned = df_cleaned.drop(columns=["english_length", "swahili_length", "length_ratio"])

# Show cleaned data
df_cleaned

Unnamed: 0,English sentence,Swahili Translation
0,I am.,Mimi ni.
2,Him.,Yeye.
3,Her.,Yeye.
4,You.,Wewe.
5,We.,Sisi.
...,...,...
210109,The national army safeguards the country's bor...,Jeshi la kitaifa linalinda mpaka wa nchi hiyo ...
210121,I participated in last year's motor rally cham...,Nilishiriki katika mashindano ya rally ya moto...
210128,This court mainly deals with crimes committed ...,Korti hii inashughulikia uhalifu uliofanywa na...
210164,The company terminated their contract yesterday.,Kampuni hiyo ilisitisha mkataba wao jana.


In [46]:
# Check for empty sentences after cleaning
df_cleaned = df_cleaned[(df_cleaned["English sentence"].str.len() > 1) & (df_cleaned["Swahili Translation"].str.len() > 1)]


In [48]:
df_cleaned.to_csv('english_swahili_sentence_pairs_cleaned.csv',index=False)

In [50]:
df=pd.read_csv(r"C:\Users\hp\NLP_PROJECT\english_swahili_sentence_pairs_cleaned.csv")

In [52]:
df

Unnamed: 0,English sentence,Swahili Translation
0,I am.,Mimi ni.
1,Him.,Yeye.
2,Her.,Yeye.
3,You.,Wewe.
4,We.,Sisi.
...,...,...
198953,The national army safeguards the country's bor...,Jeshi la kitaifa linalinda mpaka wa nchi hiyo ...
198954,I participated in last year's motor rally cham...,Nilishiriki katika mashindano ya rally ya moto...
198955,This court mainly deals with crimes committed ...,Korti hii inashughulikia uhalifu uliofanywa na...
198956,The company terminated their contract yesterday.,Kampuni hiyo ilisitisha mkataba wao jana.
