In [1]:
import polars as pl
import re

def process_data(file_path):
    # Define the schema
    schema = {
        'Name': pl.Utf8,
        'Brand/Manufacturer': pl.Utf8,
        'Part Number': pl.Utf8,
        'MSRP Price': pl.Float32,
        'Quantity': pl.Int8,
        'Fitment': pl.Utf8,
        'Notes': pl.Utf8,
        'Additional Notes': pl.Utf8,
        'Photos': pl.Utf8
    }

    # Read the CSV file with the defined schema
    df = pl.read_csv(file_path, schema_overrides=schema)

    df = df.select([col for col in df.columns if col and not col.startswith('_duplicated_')])

    # Remove rows where all columns are null
    df = df.filter(~pl.fold(pl.lit(True), lambda acc, s: acc & s.is_null(), df.columns))

    df = df.with_columns([
        pl.col('MSRP Price').cast(pl.Float32).round(2).alias('Price'),
        pl.col('Quantity').cast(pl.Int8).alias('Quantity')
    ])

    # Define functions for extracting makes, models, and years
    def split_fitment(fitment, brand):
        if fitment is None or brand is None:
            return None, None, None

        # Define year pattern
        year_pattern = re.compile(r'\b(20\d{2})(?:-(20\d{2}))?\b')

        # Split the input into lines and then further split each line into parts
        fitment_lines = fitment.split('\n')
        models = []
        year_ranges = []

        for line in fitment_lines:
            # Extract years from each line
            line_years = year_pattern.findall(line)
            if line_years:
                line_year_ranges = ['-'.join(filter(None, match)) for match in line_years]
                year_ranges.extend(line_year_ranges)

            # Remove year parts from the line
            line = year_pattern.sub('', line).strip()

            # Remove the brand from the line if it exists
            if brand.lower() in line.lower():
                line = line.replace(brand, '').strip()

            models.append(line.strip())

        # Clean and join models
        models = [model.replace('-', '').strip() for model in models if model]
        models = ', '.join(filter(None, models))

        # Join the years with commas if there are multiple ranges
        year_ranges = ', '.join(filter(None, year_ranges)) if year_ranges else None

        return brand, models, year_ranges
    

    def expand_year_range(year_range):
        if '-' in year_range:
            start_year, end_year = map(int, year_range.split('-'))
            return [str(year) for year in range(start_year, end_year + 1)]
        else:
            return [year_range]
    
    def adjust_model_year(model, year, additional_notes):
        if model is None and year is None:
            model = 'Various'
            year = 'Various'
            if additional_notes is None:
                additional_notes = 'Check fitment information online to ensure correct fit'
            else:
                additional_notes += '\nCheck fitment information online to ensure correct fit'
        elif model is None:
            model = 'Various'
            if additional_notes is None:
                additional_notes = 'Check fitment information online to ensure correct fit'
            else:
                additional_notes += '\nCheck fitment information online to ensure correct fit'
        elif year is None:
            year = 'Various'
            if additional_notes is None:
                additional_notes = 'Check fitment information online to ensure correct fit'
            else:
                additional_notes += '\nCheck fitment information online to ensure correct fit'
        return model, year, additional_notes


    def combine_notes(notes, name, additional_notes):
        if notes is None:
            notes = f'{name}'
        if additional_notes is not None:
            return f"{notes} {name} {additional_notes}".strip()
        return f'{notes} {name}'.strip()

    def to_sentence_case(text):
        if text is None:
            return None
        return text.capitalize()

    def to_title_case(text):
        if text is None:
            return None
        return text.title()

    # Apply functions using Polars expressions
    df = df.with_columns([
        pl.col("Name").map_elements(to_sentence_case, return_dtype=pl.Utf8).alias("Name"),
        pl.col("Brand/Manufacturer").map_elements(to_title_case, return_dtype=pl.Utf8).alias("Brand"),
        pl.struct(["Fitment", "Brand/Manufacturer"]).map_elements(lambda x: split_fitment(x["Fitment"], x["Brand/Manufacturer"])[0], return_dtype=pl.Utf8).alias("Make"),
        pl.struct(["Fitment", "Brand/Manufacturer"]).map_elements(lambda x: split_fitment(x["Fitment"], x["Brand/Manufacturer"])[1], return_dtype=pl.Utf8).alias("Model"),
        pl.struct(["Fitment", "Brand/Manufacturer"]).map_elements(lambda x: split_fitment(x["Fitment"], x["Brand/Manufacturer"])[2], return_dtype=pl.Utf8).alias("Year"),
        pl.struct(["Notes", "Name", "Additional Notes"]).map_elements(lambda x: combine_notes(x["Notes"], x["Name"], x["Additional Notes"]), return_dtype=pl.Utf8).alias("Description")
    ])

    df = df.with_columns([
        pl.struct(["Model", "Year", "Additional Notes"]).map_elements(lambda x: adjust_model_year(x["Model"], x["Year"], x["Additional Notes"])[0], return_dtype=pl.Utf8).alias("Model"),
        pl.struct(["Model", "Year", "Additional Notes"]).map_elements(lambda x: adjust_model_year(x["Model"], x["Year"], x["Additional Notes"])[1], return_dtype=pl.Utf8).alias("Year"),
        pl.struct(["Model", "Year", "Additional Notes"]).map_elements(lambda x: adjust_model_year(x["Model"], x["Year"], x["Additional Notes"])[2], return_dtype=pl.Utf8).alias("Additional Notes")
    ])

    df = df.with_columns(
        pl.col('Model').map_elements(to_title_case, return_dtype=pl.Utf8).alias('Model'),
    )

    df = df.filter(
    (pl.col('Part Number') != "") & 
    (pl.col('Part Number').is_not_null()) & 
    (pl.col('Part Number') != "Unknown")
)
    rows = []
    for row in df.iter_rows(named=True):
        models = row['Model'].split(', ')
        years = row['Year'].split(', ')
        for model in models:
            for year_range in years:
                for year in expand_year_range(year_range):
                    new_row = row.copy()
                    new_row['Model'] = model
                    new_row['Year'] = year
                    rows.append(new_row)

    normalized_df = pl.DataFrame(rows)

    final_df = normalized_df.unique()

    standardized_columns = {col: col.lower().replace(' ', '_').replace('/', '_') for col in final_df.columns}
    standardized_df = final_df.rename(standardized_columns)

    # Select the standardized columns
    standardized_df = standardized_df.select([
        "part_number",
        "name",
        "brand",
        "price",
        "quantity",
        "make",
        "model",
        "year",
        "description",
        "photos"
    ])

    return standardized_df


#still need to remove complete duplicates
# Usage
file_path = '/Users/skylerwilson/Desktop/PartsMatch/OEM_trailer_parts_fitment.csv'
standardized_df = process_data(file_path)
standardized_df.write_csv('/Users/skylerwilson/Desktop/PartsMatch/OEM_trailer_parts_fitment_edited.csv')
standardized_df

part_number,name,brand,price,quantity,make,model,year,description,photos
str,str,str,f64,i64,str,str,str,str,str
"""48710822A""","""Ducati oem multistrada windscr…","""Ducati""",333.98999,1,"""Ducati""","""1200 S Touring Brasil""","""2019""","""New Ducati OEM Multistrada Win…",
"""46628532317""","""Bmw r1200 gs liscence plate ho…","""Bmw""",79.989998,1,"""BMW""","""Premium Abs""","""2021""","""BMW R1200 GS Liscence Plate Ho…",
"""82718841AA""","""Ducati lh reflector support pl…","""Ducati""",15.65,1,"""Ducati""","""Panigale V4 Sp2 30∞ Anniversar…","""2008""","""New, OEM stickers Ducati LH Re…",
"""77214031A""","""Ducati hypermotorad screw, spe…","""Ducati""",8.42,1,"""Ducati""","""Xdiavel Standard Standard	""","""2021""","""New in Origional Box with OE s…",
"""2879769-266""","""Fuel tank, cruiser black, indi…","""Polaris/Indian""",799.98999,1,"""Polaris/Indian""","""Indian Pursuit Dark Horse Icon…","""2014""","""New in Origional Box Fuel Tank…",
…,…,…,…,…,…,…,…,…,…
"""57322212A""","""2016 ducati monster oem 821 ex…","""Ducati""",399.98999,1,"""Ducati""","""Monster 821 All Types""","""2016""","""takeoff 2016 Ducati Monster OE…","""https://lh3.googleusercontent.…"
"""59522411B""","""Ducati scrambler seat""","""Ducati""",300.079987,1,"""Ducati""","""Scrambler Icon 800""","""2020""","""New Takeoff Ducati Scrambler S…","""https://photos.fife.userconten…"
"""2879769-266""","""Fuel tank, cruiser black, indi…","""Polaris/Indian""",799.98999,1,"""Polaris/Indian""","""Indian Pursuit Elite""","""2014""","""New in Origional Box Fuel Tank…",
"""18127712864""","""Bmw rear muffler""","""Bmw""",891.080017,1,"""BMW""",""" R1200 Gs Adventure 10""","""2008""","""Used takeoff BMW Rear Muffler""",


In [2]:
import polars as pl
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey, Float, Integer
from sqlalchemy.orm import sessionmaker, relationship, declarative_base

# Define SQLAlchemy base
Base = declarative_base()

# Define the Parts table
class Part(Base):
    __tablename__ = 'parts'
    part_number = Column(String, primary_key=True, unique=True)
    price = Column(Float)
    quantity = Column(Integer)
    brand = Column(String)
    description = Column(String, nullable=True)
    photos = relationship("Photo", back_populates="part")

# Define the Models table
class Model(Base):
    __tablename__ = 'models'
    id = Column(Integer, primary_key=True, autoincrement=True)
    model_name = Column(String, unique=True)

# Define the Years table
class Year(Base):
    __tablename__ = 'years'
    id = Column(Integer, primary_key=True, autoincrement=True)
    year = Column(Integer, unique=True)

class Photo(Base):
    __tablename__ = 'photos'
    id = Column(Integer, primary_key=True, autoincrement=True)
    url = Column(String, nullable=False)
    part_number = Column(Integer, ForeignKey('parts.part_number'), nullable=False)
    part = relationship("Part", back_populates="photos")

# Define the ModelYear junction table
class ModelYear(Base):
    __tablename__ = 'model_year'
    id = Column(Integer, primary_key=True, autoincrement=True)
    part_number = Column(String, ForeignKey('parts.part_number'))
    model_id = Column(Integer, ForeignKey('models.id'), nullable=True)
    year_id = Column(Integer, ForeignKey('years.id'), nullable=True)



In [4]:
# Create an SQLite database (or any other database you prefer)
db_dir = '/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/databases'
engine = create_engine(f'sqlite:///{db_dir}/parts_database.db')

# Drop all tables
Base.metadata.drop_all(engine)

# Create all tables
Base.metadata.create_all(engine)

# Create a session
Session = sessionmaker(bind=engine)
session = Session()

# Define the schema
schema = {
    'part_number': pl.Utf8,
    'name': pl.Utf8,
    'brand': pl.Utf8,
    'price': pl.Float32,
    'quantity': pl.Int8,
    'make': pl.Utf8,
    'model': pl.Utf8,
    'year': pl.Utf8,
    'description': pl.Utf8,
    'photos': pl.Utf8,
}

# Assuming df is your original DataFrame with duplicates
df = pl.read_csv('/Users/skylerwilson/Desktop/PartsMatch/OEM_trailer_parts_fitment_edited.csv', schema_overrides=schema)

# Extract unique parts
parts_df = df.unique(subset=['part_number'])
parts_df = parts_df.select(['part_number', 'brand', 'quantity', 'price', 'description', 'photos'])

# Convert parts_df to a list of dictionaries
parts_data = parts_df.to_dicts()

# Insert parts into the database
for part in parts_data:
    new_part = Part(
        part_number=part['part_number'],
        brand=part.get('brand'),
        description=part.get('description'),
        price=part.get('price'),
        quantity=part.get('quantity')
    )
    session.add(new_part)
    session.flush()  # This assigns an ID to new_part

    # Handle photos
    if part.get('photos'):
        photo_urls = part['photos'].split(',')  # Assuming photos are comma-separated
        for url in photo_urls:
            new_photo = Photo(url=url.strip(), part_number=new_part.part_number)
            session.add(new_photo)

session.commit()

# Extract unique models
models_df = df.select("model").unique()
models_data = models_df.to_pandas().to_dict(orient='records')

# Insert models into the database
for model in models_data:
    new_model = Model(model_name=model['model'])
    session.add(new_model)
session.commit()

# Extract unique years
years_df = df.select("year").unique()
years_data = years_df.to_pandas().to_dict(orient='records')

# Insert years into the database
for year in years_data:
    new_year = Year(year=year['year'])
    session.add(new_year)
session.commit()

# Create ModelYear entries
for row in df.to_dicts():
    part_number = row['part_number']
    model_name = row.get('model')
    year_value = row.get('year')

    # Retrieve model_id if model_name is available
    if model_name:
        model_id = session.query(Model.id).filter_by(model_name=model_name).first()
        model_id = model_id[0] if model_id else None
    else:
        model_id = None

    # Retrieve year_id if year_value is available
    if year_value:
        year_id = session.query(Year.id).filter_by(year=year_value).first()
        year_id = year_id[0] if year_id else None
    else:
        year_id = None

    new_model_year = ModelYear(part_number=part_number, model_id=model_id, year_id=year_id)
    session.add(new_model_year)

session.commit()

# Close the session
session.close()


In [6]:
from sqlalchemy import create_engine, inspect

db_dir = '/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/databases'
engine = create_engine(f'sqlite:///{db_dir}/parts_database.db')

# Create an inspector
inspector = inspect(engine)

# Get the list of table names
tables = inspector.get_table_names()

# Print the structure of each table
for table in tables:
    print(f"\nTable: {table}")
    columns = inspector.get_columns(table)
    for column in columns:
        print(f"  Column: {column['name']} - {column['type']}")

# Additionally, print foreign key information
for table in tables:
    print(f"\nForeign Keys for Table: {table}")
    foreign_keys = inspector.get_foreign_keys(table)
    for fk in foreign_keys:
        print(f"  Foreign Key: {fk['constrained_columns']} -> {fk['referred_table']}.{fk['referred_columns']}")



Table: model_year
  Column: id - INTEGER
  Column: part_number - VARCHAR
  Column: model_id - INTEGER
  Column: year_id - INTEGER

Table: models
  Column: id - INTEGER
  Column: model_name - VARCHAR

Table: parts
  Column: part_number - VARCHAR
  Column: price - FLOAT
  Column: quantity - INTEGER
  Column: brand - VARCHAR
  Column: description - VARCHAR

Table: photos
  Column: id - INTEGER
  Column: url - VARCHAR
  Column: part_number - INTEGER

Table: years
  Column: id - INTEGER
  Column: year - INTEGER

Foreign Keys for Table: model_year
  Foreign Key: ['part_number'] -> parts.['part_number']
  Foreign Key: ['model_id'] -> models.['id']
  Foreign Key: ['year_id'] -> years.['id']

Foreign Keys for Table: models

Foreign Keys for Table: parts

Foreign Keys for Table: photos
  Foreign Key: ['part_number'] -> parts.['part_number']

Foreign Keys for Table: years
