In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import pandas as pd
from typing import Optional
import json
import ast
from typing import Iterable

### Heading: preprocess_behaviors
<br> Purpose: load and normalize the MIND 'behaviors.tsv' file into a structured DataFrame.

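This function reads the behaviors.tsv file and returns a DataFrame with parsed fields.
By default (explode_impressions=True) the result has one row per (news_id, label) pair
of every impression log; with explode_impressions=False it keeps one row per impression
log. The exploded result has these columns: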
- impression_time: datetime when the user session occurred
- user_id: user identifier
- history: list of clicked news ids (empty list if '-')
- impression_id: the original impression id (if present in your dataset)
- news_id: id of the news item shown in this impression
- label: 1 if the news was clicked, 0 otherwise
- raw_impression: the original impression string (news_id-label pairs)

The function has options to:
- explode_impressions: return one row per (news_id, label) in the impression list
- keep_history_as_str: if True, history remains a string; else parsed into list

Usage example:

```
df = preprocess_behaviors("/workspace/data/MIND_small/MINDsmall_train/behaviors.tsv", explode_impressions=True)

```

In [None]:
def _parse_history(value):
    """Turn a raw history cell into a list of news ids ([] for NaN, '' or '-')."""
    if pd.isna(value):
        return []
    text = str(value).strip()
    if text in ('', '-'):
        return []
    return text.split()


def _split_impression_token(token):
    """Split one 'newsid-label' token into (news_id, label).

    The label is None when the token has no '-' separator or when the part
    after the last '-' is not an integer.
    """
    news_id, dash, raw_label = token.rpartition('-')
    if not dash:
        return token, None
    try:
        return news_id, int(raw_label)
    except ValueError:
        return news_id, None


def preprocess_behaviors(path: str,
                         nrows: Optional[int] = None,
                         sep: str = "\t",
                         explode_impressions: bool = True,
                         time_format: Optional[str] = None,
                         keep_history_as_str: bool = False) -> pd.DataFrame:
    """Load and preprocess a MIND `behaviors.tsv` file.

    Parameters
    - path: path to behaviors.tsv
    - nrows: optional number of rows to read (useful for testing)
    - sep: field separator (default: a tab character)
    - explode_impressions: if True, return one row per (news_id, label) token of
      every impression log; if False, return one row per impression log
    - time_format: optional strptime format for impression time; if None/empty,
      pandas infers the format
    - keep_history_as_str: if True, history stays a string (missing -> '-');
      otherwise it is parsed into a list ('-', '' or missing -> [])

    Returns
    - DataFrame
      If explode_impressions is True, columns are:
        ['impression_time','user_id','history','impression_id','news_id','label','raw_impression']
      `label` uses the nullable 'Int64' dtype; `news_id`/`label` are missing for a
      log without impressions and `label` is missing for tokens without a valid
      integer label. `impression_id` is None when the file has no such column.
      Else, the loaded columns are returned with parsed time/history, the raw
      string in 'impressions' and a NaN-free copy in 'raw_impression'.

    Raises
    - ValueError: if the file contains no non-blank line.

    Notes
    - Two layouts are supported and detected from the field count of the first
      non-blank line:
        4 fields: impression_time, user_id, history, impressions
        5+ fields: impression_id, user_id, impression_time, history, impressions
    - `history` is a space-separated list of clicked news ids (or '-');
      `impressions` is a space-separated list of "newsid-0/1" tokens.
    - Unparseable timestamps become NaT (errors='coerce').
    """
    # Sniff the layout from the first non-blank line.
    sample_line = None
    with open(path, 'r', encoding='utf-8', errors='replace') as fh:
        for line in fh:
            if line.strip():
                # Drop only the line terminator: a full strip() would also eat a
                # trailing separator and hide an empty last field, which makes a
                # 5-column record look like a 4-column one.
                sample_line = line.rstrip('\r\n')
                break

    if sample_line is None:
        raise ValueError(f"No data found in behaviors file: {path}")

    if len(sample_line.split(sep)) >= 5:
        # layout with impression_id at start
        col_names = ['impression_id', 'user_id', 'impression_time', 'history', 'impressions']
    else:
        col_names = ['impression_time', 'user_id', 'history', 'impressions']

    # Everything is read as text; typed columns are derived explicitly below.
    df = pd.read_csv(path, sep=sep, header=None, nrows=nrows, names=col_names, dtype=str)

    # Both layouts contain impression_time, history and impressions, so no
    # "column missing" fallbacks are needed.
    df['impression_time'] = pd.to_datetime(
        df['impression_time'], format=time_format or None, errors='coerce'
    )

    if keep_history_as_str:
        df['history'] = df['history'].fillna('-')
    else:
        df['history'] = df['history'].apply(_parse_history)

    df['raw_impression'] = df['impressions'].fillna('')

    if not explode_impressions:
        return df

    # Collect, for every output row, the position of its source row plus the
    # parsed token. Building the result once from these flat lists avoids
    # creating one DataFrame per input row and also works for zero input rows.
    row_positions = []
    news_ids = []
    labels = []
    raw_values = []
    for position, raw in enumerate(df['raw_impression']):
        raw = raw.strip()
        tokens = raw.split()
        if not tokens:
            # Keep logs without impressions as a single placeholder row.
            row_positions.append(position)
            news_ids.append(None)
            labels.append(None)
            raw_values.append(raw)
            continue
        for token in tokens:
            news_id, label = _split_impression_token(token)
            row_positions.append(position)
            news_ids.append(news_id)
            labels.append(label)
            raw_values.append(raw)

    # Repeat the per-log columns once per token; iloc keeps their dtypes.
    repeated = df.iloc[row_positions].reset_index(drop=True)
    if 'impression_id' in repeated.columns:
        impression_ids = repeated['impression_id']
    else:
        impression_ids = pd.Series([None] * len(row_positions), dtype=object)

    return pd.DataFrame({
        'impression_time': repeated['impression_time'],
        'user_id': repeated['user_id'],
        'history': repeated['history'],
        'impression_id': impression_ids,
        'news_id': news_ids,
        # Nullable integer dtype so missing labels do not force floats.
        'label': pd.array(labels, dtype='Int64'),
        'raw_impression': raw_values,
    })


In [10]:
# Demo: read the first 5 impression logs of the training split and preview the
# exploded result (one row per shown news item).
from pathlib import Path
train_path = '/workspace/data/MIND_small/MINDsmall_train/behaviors.tsv'
if not Path(train_path).exists():
    print('Train behaviors file not found at', train_path)
else:
    # explode_impressions=True -> one row per (news_id, label) pair
    df_sample = preprocess_behaviors(train_path, nrows=5, explode_impressions=True)
    print('Sample exploded rows:', df_sample.shape)
    display(df_sample.head())


Sample exploded rows: (122, 7)


Unnamed: 0,impression_time,user_id,history,impression_id,news_id,label,raw_impression
0,2019-11-11 09:05:58,U13740,"[N55189, N42782, N34694, N45794, N18445, N6330...",1,N55689,1,N55689-1 N35729-0
1,2019-11-11 09:05:58,U13740,"[N55189, N42782, N34694, N45794, N18445, N6330...",1,N35729,0,N55689-1 N35729-0
2,2019-11-12 18:11:30,U91836,"[N31739, N6072, N63045, N23979, N35656, N43353...",2,N20678,0,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
3,2019-11-12 18:11:30,U91836,"[N31739, N6072, N63045, N23979, N35656, N43353...",2,N39317,0,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
4,2019-11-12 18:11:30,U91836,"[N31739, N6072, N63045, N23979, N35656, N43353...",2,N58114,0,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...


In [None]:
def preprocess_news(path: str,
                    nrows: Optional[int] = None,
                    sep: str = "\t",
                    parse_json_fields: bool = True,
                    json_fields: Iterable[str] = ('entities','concepts'),
                    extract_entity_labels: bool = True,
                    return_dict: bool = False) -> pd.DataFrame:
    """Read a MIND `news.tsv` file and add parsed helper columns.

    Parameters
    - path: location of news.tsv
    - nrows: read at most this many rows (None reads everything)
    - sep: field separator (default: a tab character)
    - parse_json_fields: when True, add a `<name>_parsed` column for every name in
      `json_fields`
    - json_fields: names of the columns that hold JSON arrays
    - extract_entity_labels: when True (and parsing is enabled), add `entity_labels`
      with the 'Label' (or 'label') value of every dict in `entities_parsed`
    - return_dict: when True, return a dict keyed by news_id instead of a DataFrame

    Returns
    - DataFrame with the eight raw text columns, the optional parsed columns and a
      `title_abstract` column (title + ' ' + abstract, missing values as '').
      With return_dict=True, a dict mapping news_id -> record dict; a later row
      with the same news_id replaces an earlier one.

    Notes
    - Cells are decoded with json.loads, then ast.literal_eval; if both fail the
      stripped raw string is kept. Missing or blank cells become [].
    - The file is read with quoting disabled (quoting=3), so quote characters in
      titles and abstracts are kept literally.
    """
    columns = ['news_id','category','subcategory','title','abstract','url','entities','concepts']
    news = pd.read_csv(path, sep=sep, header=None, names=columns, nrows=nrows, dtype=str, quoting=3)

    def _decode_cell(cell):
        # Missing or blank cell -> empty list
        if pd.isna(cell):
            return []
        text = str(cell).strip()
        if text == '':
            return []
        # Try strict JSON first, then Python-literal syntax
        for parser in (json.loads, ast.literal_eval):
            try:
                return parser(text)
            except Exception:
                continue
        # Neither parser accepted it; keep the raw text
        return text

    def _labels_of(parsed):
        # Only lists of dicts can carry labels
        if not isinstance(parsed, list):
            return []
        candidates = (
            entry.get('Label') or entry.get('label')
            for entry in parsed
            if isinstance(entry, dict)
        )
        return [value for value in candidates if value is not None]

    if parse_json_fields:
        for field in json_fields:
            target = field + '_parsed'
            if field not in news.columns:
                # Unknown column name: fill with independent empty lists
                news[target] = [[] for _ in range(len(news))]
            else:
                news[target] = news[field].apply(_decode_cell)

        if extract_entity_labels and 'entities_parsed' in news.columns:
            news['entity_labels'] = news['entities_parsed'].apply(_labels_of)

    # Single text column for simple text features
    title_text = news['title'].fillna('')
    abstract_text = news['abstract'].fillna('')
    news['title_abstract'] = title_text + ' ' + abstract_text

    if not return_dict:
        return news

    records = news.to_dict('records')
    return {record['news_id']: record for record in records}

# Example usage (will be visible in notebook cell output when executed):
# from pathlib import Path
# path = '/workspace/data/MIND_small/MINDsmall_train/news.tsv'
# if Path(path).exists():
#     df_news = preprocess_news(path, nrows=20)
#     display(df_news[['news_id','category','title']].head())
# else:
#     print('news.tsv not found at', path)


In [9]:
# Example usage: load the first 20 news rows and preview the parsed entity labels.
# Path is imported here so this cell does not depend on another cell having run first.
from pathlib import Path

path = '/workspace/data/MIND_small/MINDsmall_train/news.tsv'
if Path(path).exists():
    df_news = preprocess_news(path, nrows=20)
    # Display news id, category, title, abstract, and parsed entity labels
    display(df_news[['news_id','category','title','abstract','entity_labels']].head())
else:
    print('news.tsv not found at', path)


Unnamed: 0,news_id,category,title,abstract,entity_labels
0,N55528,lifestyle,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...","[Prince Philip, Duke of Edinburgh, Charles, Pr..."
1,N19639,health,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,[Adipose tissue]
2,N61837,news,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,[]
3,N53526,health,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",[]
4,N38324,health,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",[Skin tag]
