In [1]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [2]:
# Read the .csv file
df_ford = pd.read_csv("C:\Language_Projects\Language_Projects\Python\Flagship_1\part-number-datasets-cleaning\data\\raw_datasets\\fordR.csv")

  df_ford = pd.read_csv("C:\Language_Projects\Language_Projects\Python\Flagship_1\part-number-datasets-cleaning\data\\raw_datasets\\fordR.csv")


In [3]:
# Show the cols available
df_ford.columns

Index(['Style', 'Hardware Code', 'Part Number', 'Tare Tag', 'Model Year',
       ' Make & Model', 'Engine', 'Trans', 'Other Sticker P/N',
       'Manufacturer Type', 'Ford Type Name', 'VIN Example', 'Note', 'Order',
       'Unnamed: 14'],
      dtype='object')

In [4]:
# Function to fill out null rows
def fill_null_rows(df):
    # Loop to iterate over all cols
    for col in df.columns:
        df[col] = df[col].fillna("Not Available")
    return df

In [5]:
# Function to remove the whitespaces
def remove_whitespaces(df):
    # Loop to iterate over all cols
    for col in df.columns:
        # Remove the writespaces
        df[col] = df[col].str.strip()

    return df

In [6]:
# Function to split multiple part numbers originally in the same line to one row each
def split_part_numbers(df, col):
    df_copy = df.copy()
    # Create a list with the items in each row 
    df_copy[col] = df_copy[col].str.split(",")
    # Explode the items in different rows each and keep the info from the other rows
    df_splitted = df_copy.explode(col)
    # Return the df with the exploded part numbers
    return df_splitted

In [7]:
# Call the function to fill out null values
df_ford_fill_null_values = fill_null_rows(df_ford)

In [8]:
# Print the cols
df_ford_fill_null_values.columns

Index(['Style', 'Hardware Code', 'Part Number', 'Tare Tag', 'Model Year',
       ' Make & Model', 'Engine', 'Trans', 'Other Sticker P/N',
       'Manufacturer Type', 'Ford Type Name', 'VIN Example', 'Note', 'Order',
       'Unnamed: 14'],
      dtype='object')

### Call the function to the explode the following cols:
- Style
- Hardware Code
- Part Number

In [9]:
# Call the function to explode the items in the col Part Number
df_ford_pn_col_exploded = split_part_numbers(df_ford_fill_null_values, "Part Number")

In [10]:
# Call the function to explode the items in the col Hardware Code
df_ford_hdw_code_col_exploded = split_part_numbers(df_ford_pn_col_exploded, 'Hardware Code')

In [11]:
# Call the function to explode the items in the col Style
df_ford_style_col_exploded = split_part_numbers(df_ford_hdw_code_col_exploded, "Style")

In [12]:
# Reset the index
df_ford_reset_index = df_ford_style_col_exploded.reset_index(drop=True)

### Cols to keep:
- Style
- Hardware Code
- Part Number

In [13]:
# List with cols to keep
cols_to_keep = ['Style','Hardware Code', 'Part Number']

In [14]:
# Create a df with needed cols
df_ford_cols_to_keep = df_ford_reset_index[cols_to_keep]

In [15]:
df_ford_cols_to_keep.columns

Index(['Style', 'Hardware Code', 'Part Number'], dtype='object')

In [16]:
# Put the hdw col as the first one
df_ford_cols_hdw_col_first = df_ford_cols_to_keep[[
    'Hardware Code',
    'Style', 
    'Part Number'
]]

In [17]:
# Map to rename Hardware col
map_dict = {
    "Hardware Code": "Hardware"
}

In [18]:
# Rename hardware col
df_ford_rn_hdw_col = df_ford_cols_hdw_col_first.rename(columns=map_dict)

In [19]:
# Check the df info
df_ford_rn_hdw_col.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33239 entries, 0 to 33238
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Hardware     33239 non-null  object
 1   Style        33239 non-null  object
 2   Part Number  33239 non-null  object
dtypes: object(3)
memory usage: 779.2+ KB


## Standardizing Part Number Dataset Columns
Each part number dataset contains columns with different names because they originate from different manufacturers. One of the goals of cleaning these datasets is to create a standard format that allows for the identification of the hardware part number when inputting other part numbers, such as software part numbers or system part numbers.

Since the identification of the hardware part number will be performed by inputting multiple part numbers at once (via a .csv file), it is necessary — except for the hardware column — to rename the other columns to a standard format: `pnNum` (e.g., `pn1`, `pn2`, etc.). This ensures consistency and enables automated matching and processing across datasets from various manufacturers.

In [20]:
def rename_col_names(df):
    '''
    Except the column hardware, the function standardize the names for pnNumber (pn1, pn2).
    Needed due to each part number dataset has different column names, so that 

    Parameters:
    df (DataFrame): main df.
    col_list (list): List of original column names.

    Returns
    df: Return the main df with the renamed cols

    '''

    # List with the col names
    col_name_list = list(df.columns) 
    
    # Dict to append the old names (key): new names (values) 
    new_cols_dict = {}

    # Count the items under the col list
    col_num = len(col_name_list)
    # Loop to iterate over the list
    for col in col_name_list:
        # Condition to define when lower the col name and when rename it
        if col == "Hardware":
            new_cols_dict[col] = col.lower()
        else:
            # Convert the string to number to sum with the stirng pn and
            # append as value to the dict
            new_cols_dict[col] = "pn" + str(col_num)
            # Return the number from string to int
            col_num = int(col_num)
        
        # Subtract one to create the next col name
        num_col = col_num - 1
        col_num = num_col

    # Return the main df with the cols remanes
    return df.rename(columns=new_cols_dict)

In [21]:
# Call the function to rename the cols
df_ford_rn_cols = rename_col_names(df_ford_rn_hdw_col)

In [30]:
# Export to .csv file
df_ford_rn_cols.to_csv("C:\Language_Projects\Language_Projects\Python\Flagship_1\part-number-datasets-cleaning\data\data_cleaned\pn-ford.csv", index=False)

  df_ford_rn_cols.to_csv("C:\Language_Projects\Language_Projects\Python\Flagship_1\part-number-datasets-cleaning\data\data_cleaned\pn-ford.csv", index=False)


In [22]:
def import_datasets_to_db(df, df_name=str):
    
    '''
    Import the df cleaned to the db on postgreSQL

    Parameters: 
        df (DataFrame): main df to import to the postgreSQL db.
        df_name: string to label the df into the db.

    Returns: 
        Import the df to the db.
    '''
    # Setting up the connection with the PostgreSQL
    dbname="prescreen_diag_data_api"
    user="postgres"
    password="shakey-10"
    host="localhost"
    port="5432"

    # String connection for SQLAlchemy (using psycopg2 as driver)
    engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{dbname}")

    return df.to_sql(df_name, engine, if_exists='replace', index=False)

In [23]:
# Call the function to import the df cleaned to the postgreSQLdb
import_datasets_to_db(df_ford_rn_cols, 'part_numbers_ford')

239