In [1]:
def assert_col_of_df(df: pd.DataFrame, column: Union[List[str], str]) -> None:
    """Validate that one or more column names exist in `df`.

    Args:
        df: Dataframe whose columns are checked.
        column: A single column name, or a list of column names, to test.

    Returns:
        None.

    Raises:
        ValueError: If any name in `column` is not a column of `df`. The
            message names the first missing column encountered.
    """
    # Normalise a single name to a list so both input forms share one path.
    names = [column] if isinstance(column, str) else column

    for name in names:
        # Explicit membership test instead of `assert`: assertions are removed
        # when Python runs with -O, which would silently disable this check.
        if name not in df.columns:
            raise ValueError(f"Invalid input value. Column {name} is not a column of df.")

NameError: name 'pd' is not defined

# Replace_arabic_to_english Function

In [None]:
def replace_arabic_to_english(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Replaces specific Arabic accident-type labels with English labels in one column.

    Only cell values that exactly match a key of the replacement dictionary are
    changed; every other value is left untouched. The column is overwritten on
    the DataFrame that was passed in, and that same DataFrame is returned.

    Args:
        df: The DataFrame containing the column to replace values in.
        column: The name of the column to perform replacements on.

    Returns:
        The input DataFrame with the specified column's Arabic labels replaced
        by their English equivalents.

    Raises:
        ValueError: If the specified column is not found in the DataFrame.
    """
    # Check if the column exists in the DataFrame
    if column not in df.columns:
        raise ValueError(f"Column '{column}' is not in the DataFrame")

    # Dictionary of replacements (Arabic label -> English label)
    replacements = {
        'تدهور': 'Deterioration',
        'صدم جانبي': 'Side Impact',
        'صدم خلفي': 'Rear Collision',
        'صدم عمودي': 'Vertical impact',
        'دهس إنسان': 'Run over a person',
        'صدم جسم ثابت خارج الطريق': 'Hitting a stationary object off the road',
        'صدم متتالي': 'Consecutive shocks',
        'أخرى(حدد)': 'Other (specify)',
        'صدم متقابل': 'Opposite collision',
        'صدم جسم ثابت في الطريق': 'Hit a fixed object on the road',
        'صدم جسم غير ثابت في الطريق': 'Hitting an unstable object on the road',
        # "غير ثابت" means "not fixed": this key previously shared the label of
        # the fixed-object key above, which merged two distinct categories.
        'صدم جسم غير ثابت خارج الطريق': 'Hitting an unstable object off the road',
        'صدم إثناء الدوران': 'Impact while turning',
        'غير معروف': 'Unknown',
        'سقوط': 'Shoulder Edge Dropp Off',
        'صدم حيوان': 'Hit an Animal',
        'إحتراق': 'Car Burning',
        'صدم غير متحرك': 'Bump on the road'
    }

    # Assign the result back instead of calling replace(..., inplace=True) on
    # df[column]: the chained in-place form operates on an intermediate object
    # and does not update df when pandas Copy-on-Write is active.
    df[column] = df[column].replace(replacements)

    return df

# Google_translating_columns function

In [None]:
import pandas as pd
from typing import Union, List
from googletrans import Translator

def google_translating_columns(df: pd.DataFrame, columns: Union[List[str], str]) -> pd.DataFrame:
    """
    Translates specific columns from Arabic to English in the DataFrame and then refines the
    translations using a dictionary of manual translations.

    Each distinct string value of a column is sent to the translator once and
    the column is rewritten with the results. Afterwards, values that exactly
    match a key of the manual dictionary (keys are the English strings produced
    by machine translation) are replaced by the curated wording. The columns are
    overwritten on the DataFrame that was passed in.

    Args:
        df: The DataFrame containing the columns to translate.
        columns: A column name or list of column names to translate.

    Returns:
        The input DataFrame with the specified columns translated to English and refined.

    Raises:
        ValueError: If any of the specified columns are not found in the DataFrame.
    """
    # Ensure columns is a list
    if isinstance(columns, str):
        columns = [columns]

    # Check if all columns to translate exist in the DataFrame
    for column in columns:
        if column not in df.columns:
            raise ValueError(f"Column '{column}' is not in the DataFrame")

    # Initialize the translator
    translator = Translator()

    # Manual corrections: machine-translated English text -> curated English text
    manual_translations = {
        # Rep Kind
        'Causal accident': 'Traffic accident with injuries',
        # City
        'Eye area': 'Al Ain Region',
        'Al Dhafra area': 'Al Dhafra Region',
        # Reason
        'Not to make the road / not to give the brigades': 'Not Giving Way',
        'Overcoming the red light signal': 'Running a red light',
        'Not adhering to the mandatory itinerary': 'Failure to adhere to the mandatory lane',
        'Preoccupation with the road while driving the vehicle in any way': 'Being distracted from the road while driving a vehicle in any way',
        'Preoccupation with the road while driving the vehicle using the phone': 'Being distracted from the road while driving using the phone',
        'Vision': 'obstructed visibility',
        'Not giving priority to the infantry crossing': 'not giving priority to pedestrian crossings',
        'Sir in the opposite direction': 'Driving in the opposite direction',
        'Lack of leadership in leadership': 'lack of driving knowledge',
        'Disadvantages on the road (select)': 'Defects in the road (specify)',
        'Disadvantages in the vehicle (select)': 'Defects in the vehicle (specify)',
        'Overcoming in a place where transgression is forbidden': 'Overtaking in a place where overtaking is prohibited',
        'Glow': 'Sun Glare',
        'Noning other traffic signals': 'Failure to comply with other traffic signals',
        'Mechanical malfunction while walking': 'Mechanical failure occurs while driving',
        'Shock': 'Vertical Impact',
        'Found': 'Wet',
        'temple': 'Paved',
        'Sand': 'Covered with Sand',
        'Non -temple': 'UnPaved',
        'Sahu': 'Clear',
        'Wake': 'Clear'
    }

    for column in columns:
        translated_elements = {}

        # Translate each distinct value once rather than once per row.
        for element in df[column].unique():
            # Missing values, numbers and blank strings are not translatable
            # text; leave them as they are instead of sending them to the service.
            if not isinstance(element, str) or not element.strip():
                continue
            try:
                translated_elements[element] = translator.translate(element, src='ar', dest='en').text
            except Exception as e:
                # Best effort: report the failure and keep the original value
                # (no mapping is recorded, so the value is left unchanged).
                print(f"Translation failed for element '{element}': {e}")

        translated_column = df[column]
        if translated_elements:
            # Replace the elements in the DataFrame with the translated elements
            translated_column = translated_column.replace(translated_elements)

        # Refine in a single pass. No manual replacement value is itself a key
        # of the dictionary, so this matches applying the entries one by one.
        df[column] = translated_column.replace(manual_translations)

    return df

# label_encoder function

In [None]:
def label_encoder(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Label-encodes a single column of the DataFrame.

    The column is overwritten on `df` with the output of
    `LabelEncoder.fit_transform`, and the same DataFrame is returned.

    Args:
        df: The DataFrame holding the column to encode.
        column: Name of the column to label encode.

    Returns:
        The DataFrame with the given column replaced by its encoded values.

    Raises:
        ValueError: If the specified column is not found in the DataFrame.
    """
    if column in df.columns:
        # Fit and apply the encoder in one step, writing the result back.
        df[column] = LabelEncoder().fit_transform(df[column])
        return df

    # Reached only when the requested column does not exist.
    raise ValueError(f"Column '{column}' is not in the DataFrame")
## Resulting encoding of the target: "Traffic accident with injuries" = 0, "Traffic accident without injuries" = 1

# one_hot_encoding_columns function

In [None]:
def one_hot_encoding_columns(df: pd.DataFrame, columns: Union[List[str], str]) -> pd.DataFrame:
    """
    One-hot encodes the requested columns of the DataFrame.

    Args:
        df: The DataFrame holding the columns to encode.
        columns: A single column name, or a list of column names, to one-hot encode.

    Returns:
        A new DataFrame in which each requested column is replaced by integer
        indicator columns; the input DataFrame is not modified.

    Raises:
        ValueError: If any of the specified columns are not found in the DataFrame.
    """
    # Accept either one name or a list of names.
    wanted = [columns] if isinstance(columns, str) else columns

    # Report the first requested column that the DataFrame does not have.
    absent = [name for name in wanted if name not in df.columns]
    if absent:
        raise ValueError(f"Column '{absent[0]}' is not in the DataFrame")

    # dtype=int yields 0/1 indicator columns instead of booleans.
    return pd.get_dummies(df, columns=wanted, dtype=int)

# Train and Test split

In [None]:
def get_train_test_split(
        df: pd.DataFrame,
        stratify_col: str
        ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Splits the input data frame into a training and test set with stratification.

    The split holds out 20% of the rows for testing, is stratified on
    `stratify_col`, and uses a fixed random seed (42) so repeated calls on the
    same data give the same split. The sizes of both splits are logged at INFO
    level.

    Args:
        df: Raw input data.
        stratify_col: Column to use for stratification.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: Raw train and test data splits, in
        that order.

    Raises:
        ValueError: If `stratify_col` is not a column of `df` (raised by
            `assert_col_of_df`).
    """
    logger = logging.getLogger(__name__)

    assert_col_of_df(df, stratify_col)

    # Perform stratified split with test size of 20%
    df_train, df_test = train_test_split(
        df,
        test_size=0.2,
        stratify=df[stratify_col],
        random_state=42
    )

    # The logging API formats lazily as `msg % args`, so the message needs a
    # placeholder for each argument; without one the record fails to format.
    logger.info("Train set size: %d", len(df_train))
    logger.info("Test set size: %d", len(df_test))

    return df_train, df_test


# wrap_transform_data, which calls the functions above

In [None]:
def wrap_transform_data(
    df: pd.DataFrame,
    rep_type: str,
    arab_english: List[str],
    features: List[str],
    target: str,
    rest_columns: Union[List[str], str, None] = None
    ) -> pd.DataFrame:
    """Wrapper for transforming the data for the model

    Processing is applied in the following steps:
        1. Google translation of the `arab_english` columns.
        2. Replace Arabic labels with English in the `rep_type` column.
        3. Label-encode the `target` column.
        4. One-hot encode the `rest_columns` columns.
        5. Select only the relevant columns (`target` followed by `features`).

    No train/test split is performed here; use `get_train_test_split` on the
    result for that.

    Args:
        df: Raw input data.
        rep_type: Column whose Arabic labels are replaced via the fixed
            replacement dictionary.
        arab_english: Columns to machine-translate from Arabic to English.
        features: Names of the feature columns to keep in the output. They are
            selected after one-hot encoding, so they must name columns that
            exist at that point.
        target: Target column; it is label-encoded and placed first in the output.
        rest_columns: Column name(s) to one-hot encode. When omitted, a
            module-level variable named `rest_columns` is used if one exists
            (the behaviour this function previously relied on implicitly).

    Returns:
        pd.DataFrame: Transformed dataframe containing `target` and `features`.

    Raises:
        ValueError: If `rest_columns` is neither passed nor defined at module
            level, or if a helper reports a missing column.
    """
    if rest_columns is None:
        # Backward compatibility: earlier versions read `rest_columns` from the
        # enclosing module/notebook namespace rather than from a parameter.
        rest_columns = globals().get("rest_columns")
        if rest_columns is None:
            raise ValueError(
                "rest_columns must be provided: pass the column name(s) to one-hot encode."
            )

    # 1. Google Translation
    df = google_translating_columns(df, columns=arab_english)

    # 2. Replace Arabic to English
    df = replace_arabic_to_english(df, column=rep_type)

    # 3. Label Encoder
    df = label_encoder(df, column=target)

    # 4. One Hot Encoding Columns
    df = one_hot_encoding_columns(df, columns=rest_columns)

    # 5. Select Only Relevant Columns
    df = df[[target] + features]

    return df