## Data Processor

In [1]:
import pandas as pd
import os

In [2]:
def read_queries_from_file(file_path: str):
    """Read the non-blank lines of a text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of the file's lines in file order, each stripped of
        surrounding whitespace, with blank lines omitted. An empty list is
        returned (after printing an error message) when the file does not
        exist.
    """
    try:
        # Decode explicitly instead of relying on the platform's default
        # locale encoding. "utf-8-sig" reads plain UTF-8 and also drops a
        # leading byte-order mark, which str.strip() would not remove and
        # which would otherwise stay attached to the first line.
        with open(file_path, 'r', encoding="utf-8-sig") as file:
            stripped_lines = (line.strip() for line in file)
            return [line for line in stripped_lines if line]
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' does not exist.")
        return []

In [None]:
# Build the input path with os.path.join so it resolves on every OS; a
# literal backslash separator is only understood on Windows.
file_path = os.path.join("data", "user_raw_history_data.txt")
# Print the full location being read: the relative path joined onto the
# current working directory.
current_dir = os.getcwd()
print(os.path.join(current_dir, file_path))
# Load the non-blank lines and report how many were read
queries_list = read_queries_from_file(file_path)
print("quries Size:", len(queries_list))

In [22]:
from pydantic import BaseModel, Field
from typing import Optional
from datetime import datetime

# Pydantic schema for one parsed search record. Every field is an optional
# string that defaults to None.
class RawSearchModel(BaseModel):
    # Text the user searched for
    query: Optional[str] = Field(
        default=None,
        description="Search query, e.g., 'best noise cancelling headphones 2024'",
    )
    # Calendar date of the search, stored as a string
    search_date: Optional[str] = Field(
        default=None,
        description="Date of the search in YYYY-MM-DD format",
    )
    # Device the search was issued from
    device: Optional[str] = Field(
        default=None,
        description="Device used for the search, e.g., 'web'",
    )
    # Identifier of the user who searched
    userId: Optional[str] = Field(
        default=None,
        description="User ID, e.g., 'user123'",
    )


In [23]:
# Function to parse the raw query into the Pydantic model
def parse_query(query: str) -> Optional[RawSearchModel]:
    """Parse one raw history line into a RawSearchModel.

    The line is expected to look like
    ``<YYYY-MM-DD HH:MM:SS>,<user id>,<device>,<search text>``, where the
    search text may itself contain commas.

    Args:
        query: A single raw line from the history file.

    Returns:
        The populated RawSearchModel, or None (after printing the error)
        when the line cannot be parsed, e.g. when it has fewer than three
        comma-separated fields or the timestamp does not match the
        expected format.
    """
    try:
        # Split on the first three commas only, so commas inside the search
        # text are preserved without a split-then-rejoin round trip.
        timestamp, user_id, device, *remainder = query.split(",", 3)
        # A line with exactly three fields yields an empty search text.
        search_query = remainder[0] if remainder else ""
        # Strip the timestamp like every other field; stray whitespace
        # around it would otherwise make strptime reject a valid row.
        date = (
            datetime.strptime(timestamp.strip(), "%Y-%m-%d %H:%M:%S")
            .date()
            .isoformat()
        )
        return RawSearchModel(
            query=search_query.strip(),
            search_date=date,
            device=device.strip(),
            userId=user_id.strip(),
        )
    except Exception as e:
        # Deliberately broad: one malformed line is reported and skipped
        # rather than aborting the whole batch.
        print(f"Error parsing query: {query}. Error: {e}")
        return None


In [None]:
# Parse all queries
# Run the parser over every non-blank raw line; entries that failed to
# parse come back as None.
parsed_queries = list(map(parse_query, filter(str.strip, queries_list)))

# Keep only the successfully parsed records and turn them into rows for a
# DataFrame that can be written to Excel.
data = [record.dict() for record in filter(None, parsed_queries)]
df = pd.DataFrame(data)

# The workbook is written to a "search_output" folder under the current
# working directory.
output_dir = os.path.join(os.getcwd(), "search_output")
output_path = os.path.join(output_dir, "search_queries.xlsx")

# Create the folder if it does not exist yet, then write the workbook
# without the index column.
os.makedirs(output_dir, exist_ok=True)
df.to_excel(output_path, index=False)

print(f"Data successfully saved to {output_path}")