# Explore the two data sources - Traffic violations

Two versions of the same traffic-violations dataset were identified, both of which are published on OpenML. 

The first is unprocessed and can be found [here](https://api.openml.org/d/42132). The second one is a preprocessed and subsampled version that can be downloaded [here](https://www.openml.org/search?type=data&status=active&sort=runs&order=desc&id=42345). 

In [None]:
from pathlib import Path
from typing import List, Tuple

import datetime
import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis.gensim
import seaborn as sns
import spacy
import string

from gensim import corpora
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from scipy.io.arff import loadarff
from scipy.io.arff._arffread import MetaData

from src.data_preprocessing.data_preprocessor import (
    load_data,
    change_to_numeric,
    feature_engineering,
    transform_label,
    convert_float_to_int,
    drop_cols,
    filter_na,
)

In [None]:
# NOTE(review): FULL_DATA_PATH is not referenced by any cell shown here; presumably it
# points to the unprocessed version of the dataset described in the introduction — confirm.
FULL_DATA_PATH = Path("../data/dataset.csv")
# .arff file that is read with loadarff and passed to preprocessor in the cells below.
DATA_PATH = Path("../data/file65ef3a759daf.arff")

## Preprocessed dataset

In [None]:
# loadarff returns the records together with the file's metadata;
# the records are wrapped in a DataFrame for inspection
data, meta = loadarff(DATA_PATH)
data = pd.DataFrame(data)

In [None]:
# report the shape of the freshly loaded frame
print("Dimensions of the dataset:", data.shape)

In [None]:
# remove b string from data
str_df = data.select_dtypes([object])
str_df = str_df.stack().str.decode("utf-8").unstack()
data = pd.concat([str_df, data.select_dtypes(exclude=[object])], axis=1)

In [None]:
# def load_data(path_: Path) -> Tuple[pd.DataFrame, MetaData]:
#     """Loads the .arff file (incl. metadata) and converts to utf-8.

#     Parameters
#     -------
#     path_ : Path
#             Path of the data.

#     Returns
#     -------
#     data : pd.DataFrame
#             Data as a dataframe.
#     meta : scipy.io.arff._arffread.Metadata
#             Metadata of the dataset.
#     """
#     # load df and metadata from .arff
#     data, meta = loadarff(path_)
#     data = pd.DataFrame(data)

#     # remove b string from data
#     str_df = data.select_dtypes([object])
#     str_df = str_df.reset_index().melt(id_vars="index").set_index("index")
#     str_df["value"] = str_df["value"].str.decode("utf-8")

#     # rename the 'value' column to avoid conflicts and perform pivot
#     str_df = str_df.rename(columns={"value": "decoded_value"})
#     str_df = pd.pivot_table(
#         str_df, columns="variable", values="decoded_value", index="index", aggfunc=lambda x: x
#     )

#     # reset both the column and index names to None
#     str_df = str_df.rename_axis(index=None, columns=None)

#     # merge str and non-str columns
#     data = pd.concat([str_df, data.select_dtypes(exclude=[object])], axis=1)

#     return data, meta

In [None]:
# def change_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
#     """Change yes/no values in columns to 0/1.

#     Parameters
#     -------
#     df : pd.DataFrame
#             Data to transform.

#     Returns
#     -------
#     df : pd.DataFrame
#             Transformed data.
#     """
#     for col in df.columns:
#         # only change columns that have no missing values to 0 / 1
#         if set(df[col].unique().tolist()) - set(["No", "Yes"]) == set():
#             df[col] = df[col].map(dict(Yes=1, No=0))

#     return df

In [None]:
# def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
#     """Creates new features based and drops the old ones.

#         - If the Vehicle's State is Maryland (MD)
#         - If the Driver's State is Maryland (MD)
#         - If the Driver License's State is Maryland (MD)

#     Parameters
#     -------
#     df : pd.DataFrame
#             Data to transform.

#     Returns
#     -------
#     df : pd.DataFrame
#             Transformed data.
#     """
#     # Vehicle's State
#     df["State_MD"] = (df["State"] == "MD").astype(int)
#     df.loc[df["State"] == "?", "State_MD"] = -1

#     # Driver's State
#     df["Driver_State_MD"] = (df["Driver.State"] == "MD").astype(int)
#     df.loc[df["Driver.State"] == "?", "Driver_State_MD"] = -1

#     # Driver License's State
#     df["DL_State_MD"] = (df["DL.State"] == "MD").astype(int)
#     df.loc[df["DL.State"] == "?", "DL_State_MD"] = -1

#     df = df.drop(columns=["State", "Driver.State", "DL.State"])
#     return df

In [None]:
# def transform_label(df: pd.DataFrame) -> pd.DataFrame:
#     """Drops rows that are equal to SERO and changes label to "Citation" with 0/1 - values.

#     Parameters
#     -------
#     df : pd.DataFrame
#             Data to transform.

#     Returns
#     -------
#     df : pd.DataFrame
#             Transformed data.
#     """
#     df = df[df["Violation.Type"] != "SERO"].copy()
#     df["Citation"] = df.loc[:, "Violation.Type"].apply(lambda x: 1 if x == "Citation" else 0)
#     df = df.drop(columns=["Violation.Type"])
#     return df

In [None]:
# def convert_float_to_int(df: pd.DataFrame, column_name: str = "Year") -> pd.DataFrame:
#     """If possible to convert float to int converts to int.

#     Parameters
#     -------
#     df : pd.DataFrame
#             Data to transform.
#     column_name : str
#             Column that should be converted to int.

#     Returns
#     -------
#     df : pd.DataFrame
#             Transformed data.
#     """
#     assert (
#         df[column_name].dropna().apply(lambda x: x.is_integer()).all()
#     ), "Can't be converted to int"
#     df[column_name] = df[column_name].fillna(-1).astype(int)

#     # get today's year and filter by -1 (na), above 1990 or below/equal today's year
#     year = int(datetime.date.today().strftime("%Y"))
#     df = df[(df[column_name] == -1) | ((df[column_name] > 1990) & (df[column_name] <= year))]

#     return df

In [None]:
# def drop_cols(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
#     """Drop columns from dataframe.

#     Parameters
#     -------
#     df : pd.DataFrame
#             Data to transform.
#     cols : List
#             List of columns to drop.

#     Returns
#     -------
#     df : pd.DataFrame
#             Transformed data.
#     """
#     df = df.drop(columns=cols)

#     return df

In [None]:
# def filter_na(df: pd.DataFrame) -> pd.DataFrame:
#     """Filter na-values in full df.

#     Since analysis showed that values are missing at random
#     across groups, make less than 1% of instances and instance normally has
#     multiple missing feature values, this operation is valid.

#     Parameters
#     -------
#     df : pd.DataFrame
#             Data to transform.

#     Returns
#     -------
#     df : pd.DataFrame
#             Transformed data.
#     """
#     # filter out -1 in integer columns
#     cols = df.select_dtypes([int]).columns
#     for col in cols:
#         df = df[df[col] != -1].copy()

#     # filter out ? in string columns
#     cols = df.select_dtypes([object]).columns
#     for col in cols:
#         df = df[~df[col].isin(["U", "?"])].copy()

#     return df

In [None]:
def preprocessor(data_path: Path, cols: List[str]) -> pd.DataFrame:
    """Run the complete preprocessing pipeline on the file at ``data_path``.

    The steps are applied in a fixed order: load, yes/no -> 0/1 conversion,
    float -> int conversion, feature engineering on the state columns,
    dropping of ``cols``, label transformation and finally na-filtering.

    Parameters
    -------
    data_path : Path
            Path of the data.
    cols : List
            List of columns to drop.

    Returns
    -------
    data : pd.DataFrame
            Processed data.
    """
    # the metadata returned next to the frame is not needed here
    frame, _ = load_data(data_path)

    # transformations that only take the frame, in the order they must run
    for step in (change_to_numeric, convert_float_to_int, feature_engineering):
        frame = step(frame)

    # remove the unwanted columns before the label is rebuilt
    frame = drop_cols(frame, cols)

    # label handling first, na-filtering last
    return filter_na(transform_label(frame))

In [None]:
# apply preprocessing to data
data = preprocessor(
    DATA_PATH, ["Model", "Charge", "Driver.City", "Arrest.Type", "Commercial.Vehicle"]
)

In [None]:
# load stopwords and en_core_web_sm used in preprocess_text
nltk.download("stopwords")
nlp = spacy.load("en_core_web_sm")

In [None]:
def preprocess_text(df: pd.DataFrame, column_name: str = "Description") -> pd.DataFrame:
    """Reformat a text column so that LDA can be applied in the next step.

    Each entry is lower-cased, tokenized with ``nltk.word_tokenize``, stripped
    of English stop words and punctuation tokens, lemmatized with the
    module-level spaCy pipeline ``nlp`` and re-joined with single spaces.

    Parameters
    -------
    df : pd.DataFrame
            Data to transform. The frame is modified in place: the result
            is written to its "description_clean" column.
    column_name : (str, optional)
            Name of column to be transformed.
            Defaults to "Description".

    Returns
    -------
    df : pd.DataFrame
            The same frame with the "description_clean" column added
            (or overwritten).
    """
    stop_words = set(stopwords.words("english"))
    punctuation_chars = set(string.punctuation)

    def _is_punctuation(token: str) -> bool:
        # A token counts as punctuation when every character is a punctuation
        # character. Testing `token in string.punctuation` would be a substring
        # check and would miss multi-character tokens such as "--" or "...".
        return all(char in punctuation_chars for char in token)

    def _clean(text: str) -> str:
        # lower-case and tokenize
        tokens = nltk.word_tokenize(text.lower())

        # remove stop words and punctuation tokens
        kept = [
            token for token in tokens if token not in stop_words and not _is_punctuation(token)
        ]

        # lemmatize the remaining words and join them back into one string
        return " ".join(token.lemma_ for token in nlp(" ".join(kept)))

    df["description_clean"] = df[column_name].apply(_clean)

    return df

In [None]:
# add the cleaned text as column "description_clean" (source column defaults to "Description")
data = preprocess_text(data)

In [None]:
# Build the gensim dictionary and bag-of-words corpus from the cleaned descriptions
tokenized_text = data["description_clean"].apply(lambda x: x.split())
dictionary = corpora.Dictionary(tokenized_text)
corpus = [dictionary.doc2bow(text) for text in tokenized_text]

# candidate numbers of topics: 2 up to and including 20
topic_range = range(2, 21)

models = []
coherence_scores = []

# Fit one LDA model per candidate topic count and record its c_v coherence.
# NOTE(review): no random_state is passed to LdaModel, so the scores may differ between runs.
for n_topics in topic_range:
    lda_model = gensim.models.LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=n_topics, passes=10
    )
    models.append(lda_model)
    coherence_model = CoherenceModel(
        model=lda_model,
        # reuse the tokens computed above instead of re-splitting the column on every iteration
        texts=tokenized_text,
        dictionary=dictionary,
        coherence="c_v",
    )
    coherence_scores.append(coherence_model.get_coherence())

In [None]:
# Plot the coherence scores
plt.plot(topic_range, coherence_scores)
plt.xticks(np.arange(2, 21, step=1))
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.title("Coherence Score vs. Number of Topics")
plt.show()

In [None]:
def create_n_topics(
    df: pd.DataFrame,
    column_name: str = "description_clean",
    n_topics: int = 10,
) -> "Tuple[pd.DataFrame, gensim.models.LdaModel, list, corpora.Dictionary]":
    """Applies LDA to a text column of DF and adds LDA topic distributions as new features.

    The text in ``column_name`` is first passed through ``preprocess_text``,
    which writes its result to the "description_clean" column. The LDA model
    is trained on that cleaned column, and one ``Topic_<k>`` column
    (k = 1..n_topics) is added holding the probability of topic k for each row.

    Parameters
    ----------
    df : pd.DataFrame
        Data to transform. The frame is modified in place.

    column_name : str, optional
        Name of the column to be transformed. Defaults to "description_clean".

    n_topics : int, optional
        Number of topics for LDA. Defaults to 10.

    Returns
    -------
    df : pd.DataFrame
        Transformed data with added LDA topic features.

    lda_model : gensim.models.LdaModel
        The trained LDA model.

    corpus : list
        Bag-of-words representation of every document.

    dictionary : gensim.corpora.Dictionary
        Token/id mapping the corpus was built with.
    """

    # Preprocess text; the cleaned result is stored in "description_clean"
    df = preprocess_text(df, column_name)

    # Tokenize the cleaned text (list of tokens per document), not the raw input column
    tokenized_text = df["description_clean"].apply(lambda x: x.split())

    # Create a Gensim dictionary and corpus
    dictionary = corpora.Dictionary(tokenized_text)
    corpus = [dictionary.doc2bow(text) for text in tokenized_text]

    # Train an LDA model
    lda_model = gensim.models.LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=n_topics, passes=10
    )

    # Run inference exactly once. lda_model[corpus] is lazy, so iterating it
    # repeatedly would redo the inference each time. Every document yields a
    # sparse list of (topic_id, probability) pairs in which low-probability
    # topics may be missing, hence the lookup by topic id instead of position.
    doc_topics = [dict(distribution) for distribution in lda_model[corpus]]

    print(f"Number of documents with topic distributions: {len(doc_topics)}")

    # Add LDA topic distributions as new features; topics absent for a document get 0
    for topic_id in range(n_topics):
        df[f"Topic_{topic_id + 1}"] = [
            distribution.get(topic_id, 0.0) for distribution in doc_topics
        ]

    return df, lda_model, corpus, dictionary

In [None]:
# fit an LDA model with 3 topics on the cleaned descriptions; model, corpus and
# dictionary are kept because the pyLDAvis cell further down needs them
data, lda_model, corpus, dictionary = create_n_topics(
    data, column_name="description_clean", n_topics=3
)

In [None]:
# inspect the first rows, which now include the Topic_* columns added by create_n_topics
print(data.head())

In [None]:
# prints 0, 1 and 2
# NOTE(review): looks like a leftover sanity check for the loop indices used in the next cell — confirm
for i in range(3):
    print(i)

In [None]:
plt.figure(figsize=(8, 6))
# overlay one semi-transparent histogram per topic column
for i in range(3):
    topic_label = f"Topic_{i+1}"
    plt.hist(data[topic_label], bins=20, alpha=0.5, label=topic_label)

plt.title("Distribution of Topics")
plt.ylabel("Frequency")
plt.xlabel("Values")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# visualing the topic seperation using pyLDAvis
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [None]:
def threshold_transform(data_frame: pd.DataFrame, t: float, columns: list[str]) -> pd.DataFrame:
    """
    Binarize the given columns of a DataFrame against the threshold 't'.

    Values greater than or equal to 't' become 1, all other values become 0.
    The input frame is left untouched; the work is done on a copy.

    Parameters
    -------
        data_frame : pd.DataFrame
            The DataFrame to be transformed.
        t : float
            The threshold value.
        columns : list
            List of column names to be transformed.

    Returns
    -------
        pandas.DataFrame: A new DataFrame with specified columns transformed.
    """
    result = data_frame.copy()

    for name in columns:
        # boolean mask "value >= t", stored as 0/1 integers
        reached_threshold = result[name].ge(t)
        result[name] = reached_threshold.astype(int)

    return result

In [None]:
# turn the three topic columns into 0/1 flags (1 where the value is >= 0.334)
# NOTE(review): 0.334 is just above 1/3, presumably chosen because there are three topics — confirm
data = threshold_transform(data, t=0.334, columns=["Topic_1", "Topic_2", "Topic_3"])

In [None]:
# show the first rows after thresholding the topic columns
data.head()

In [None]:
# keep only the non-object (numeric) columns
df_numeric = data.select_dtypes(exclude=[object])
# shows correlation of numeric features
corr = df_numeric.corr()
# NOTE(review): this bare expression is not the last statement of the cell, so a
# notebook would presumably not display it — confirm whether it is still needed
corr

# create heat map of correlation plot
sns.heatmap(corr)
plt.title("Correlation plot of numerical features")