In [1]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [2]:
# Read the .csv file
# (raw string: the Windows backslashes are taken literally instead of being parsed as escape sequences)
df_toyota = pd.read_csv(r"C:\Language_Projects\Language_Projects\Python\Flagship_1\part-number-datasets-cleaning\data\raw_datasets\toyotaR.csv")

  df_toyota = pd.read_csv("C:\Language_Projects\Language_Projects\Python\Flagship_1\part-number-datasets-cleaning\data\\raw_datasets\\toyotaR.csv")


In [3]:
# Old name -> new name for the hardware column
map_dict = dict(HDW="hardware")

In [4]:
# Apply the map_dict renaming to the column labels.
# NOTE: the result holds the Toyota data; the "mazda" in the variable name is
# kept only because later cells reference it.
df_mazda_rn_hdw_col = df_toyota.rename(map_dict, axis="columns")

In [5]:
# Check the df info: column names, non-null counts and dtypes after the rename
# (the frame holds the Toyota data despite the "mazda" in its name)
df_mazda_rn_hdw_col.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5059 entries, 0 to 5058
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Style            3591 non-null   object
 1   hardware         3569 non-null   object
 2   Toyota P/N       3590 non-null   object
 3   Other P/N        2081 non-null   object
 4   Tag              1719 non-null   object
 5   Stkr ENG, Trans  424 non-null    object
 6   Year             3082 non-null   object
 7   Model            3428 non-null   object
 8   Model Code(s)    3400 non-null   object
 9   Description      1511 non-null   object
 10  SecType          3245 non-null   object
 11  Calibrations     464 non-null    object
 12  Example Vin      1135 non-null   object
 13  Prog             414 non-null    object
dtypes: object(14)
memory usage: 553.5+ KB


### Columns to keep
- Style
- hardware
- Toyota P/N
- Other P/N

In [6]:
# Columns carried forward to the cleaned dataset
cols_to_keep = [
    "hardware",
    "Style",
    "Toyota P/N",
    "Other P/N",
]

In [7]:
# Create a df with only the needed cols; selecting with the list also puts
# the columns in the list's order
df_toyota_keep_cols = df_mazda_rn_hdw_col[cols_to_keep]

In [9]:
# Discard the rows in which every kept column is null (NaN),
# so each remaining row has at least one non-null entry.
df_toyota_wo_all_colsNull = df_toyota_keep_cols.dropna(how="all")

In [10]:
# Keep only rows where the cols have > 1 non-null values.
# thresh=2 states that rule directly; the former `!= 1` count test would also
# have let rows with zero non-null values through.
df_toyota_rm_all_nulls = df_toyota_wo_all_colsNull.dropna(thresh=2)

In [11]:
# Labels of the rows that repeat the header, i.e. whose 'Style' cell holds the text 'Style'
is_header_row = df_toyota_rm_all_nulls['Style'].eq('Style')
index_list_repeated_cols = df_toyota_rm_all_nulls.index[is_header_row]

In [12]:
# Remove the rows that only repeat the column names
df_toyota_dropped_cols = df_toyota_rm_all_nulls.drop(index_list_repeated_cols, axis="index")

In [13]:
# Function to fill out null rows
def fill_null_rows(df):
    '''
    Replace every null (NaN/None) cell with the placeholder "Not Available".

    Parameters:
        df (DataFrame): frame whose missing values should be filled.

    Returns:
        DataFrame: a new frame with the nulls replaced; `df` itself is
        left untouched.
    '''
    # DataFrame.fillna returns a new object, so the caller's frame is not
    # modified in place (the former per-column assignment did modify it)
    return df.fillna("Not Available")

In [14]:
# Function to remove the whitespaces
def remove_whitespaces(df):
    '''
    Strip leading and trailing whitespace from every text column.

    Parameters:
        df (DataFrame): frame to clean.

    Returns:
        DataFrame: a new frame with the text columns stripped. Non-text
        columns are carried over unchanged and `df` itself is not modified.
    '''
    # Work on a copy so the caller's frame is never modified in place
    df_stripped = df.copy()

    # Loop to iterate over all cols
    for col in df_stripped.columns:
        col_dtype = df_stripped[col].dtype
        # The .str accessor raises AttributeError on non-text dtypes,
        # so only object/string columns are stripped
        if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
            df_stripped[col] = df_stripped[col].str.strip()

    return df_stripped

In [15]:
# Function to split multiple part numbers originally in the same line to one row each
def split_part_numbers(df, col):
    '''
    Split the comma-separated values of `col` so each value gets its own row.

    Parameters:
        df (DataFrame): frame holding the part-number columns (text values).
        col (str): name of the column whose cells may list several values
            separated by commas.

    Returns:
        DataFrame: one row per value found in `col`, with the values of the
        other columns repeated on every produced row. Each produced value
        is stripped of surrounding whitespace.
    '''
    # Call the function to remove the whitespaces; it receives a copy so the
    # frame passed by the caller stays as it was
    df_copy = remove_whitespaces(df.copy())

    # Create a list with the items in each row
    df_copy[col] = df_copy[col].str.split(",")
    # Explode the items in different rows each and keep the info from the other cols
    df_splitted = df_copy.explode(col)
    # The strip above only reaches the ends of the whole cell: an item that
    # followed ", " still carries the blank after the comma, so strip each
    # exploded item as well
    df_splitted[col] = df_splitted[col].str.strip()

    # Return the df with the exploded part numbers
    return df_splitted

In [16]:
# Call the function to fill out null values: every remaining NaN becomes the
# text "Not Available", so the later string operations see only text
df_toyota_fill_nulls = fill_null_rows(df_toyota_dropped_cols)

### Call the function to explode all items of each column

In [17]:
# Step 1 of 4: one row per comma-separated value in 'hardware'
df_toyota_expl_hdw_col = split_part_numbers(df_toyota_fill_nulls, 'hardware')

In [18]:
# Step 2 of 4: one row per comma-separated value in 'Style'
df_toyota_expl_style_col = split_part_numbers(df_toyota_expl_hdw_col, 'Style')

In [19]:
# Step 3 of 4: one row per comma-separated value in 'Toyota P/N'
df_toyota_expl_toyotaPn_col = split_part_numbers(df_toyota_expl_style_col, 'Toyota P/N')

In [20]:
# Step 4 of 4: one row per comma-separated value in 'Other P/N'
df_toyota_expl_otherPn_col = split_part_numbers(df_toyota_expl_toyotaPn_col, 'Other P/N')

In [21]:
# Reset index: replace the row labels left over from the filtering and
# exploding steps with a fresh 0..n-1 index (drop=True discards the old
# labels instead of keeping them as a column)
df_toyota_reset_index = df_toyota_expl_otherPn_col.reset_index(drop=True)

## Standardizing Part Number Dataset Columns
Each part number dataset contains columns with different names because they originate from different manufacturers. One of the goals of cleaning these datasets is to create a standard format that allows for the identification of the hardware part number when inputting other part numbers, such as software part numbers or system part numbers.

Since the identification of the hardware part number will be performed by inputting multiple part numbers at once (via a .csv file), it is necessary — except for the hardware column — to rename the other columns to a standard format: `pnNum` (e.g., `pn1`, `pn2`, etc.). This ensures consistency and enables automated matching and processing across datasets from various manufacturers.

In [22]:
def rename_col_names(df):
    '''
    Standardize the column names: keep "hardware", rename the rest to pnN.

    Each part number dataset has different column names, so every column
    except "hardware" is renamed to pn<N> (pn1, pn2, ...). The numbers are
    handed out in descending order from left to right, so the last renamed
    column is always pn1.

    Parameters:
        df (DataFrame): main df.

    Returns:
        DataFrame: a new df with the renamed cols.
    '''
    # Only the columns that are actually renamed consume a number, so the
    # pn sequence has no gaps wherever the "hardware" column sits
    pn_source_cols = [col for col in df.columns if col != "hardware"]
    highest_num = len(pn_source_cols)

    # Dict with the old names (keys): new names (values)
    new_cols_dict = {
        col: "pn" + str(highest_num - position)
        for position, col in enumerate(pn_source_cols)
    }

    # Return the main df with the cols renamed
    return df.rename(columns=new_cols_dict)

In [23]:
# Call the function to rename the cols to the standard hardware / pnN layout
df_toyota_rn_cols = rename_col_names(df_toyota_reset_index)

In [24]:
# Check df info: confirm the standardized column names and that no nulls remain
df_toyota_rn_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3573 entries, 0 to 3572
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   hardware  3573 non-null   object
 1   pn3       3573 non-null   object
 2   pn2       3573 non-null   object
 3   pn1       3573 non-null   object
dtypes: object(4)
memory usage: 111.8+ KB


In [32]:
# Export to .csv file
# (raw string: the Windows backslashes are taken literally instead of being parsed as escape sequences)
df_toyota_rn_cols.to_csv(r"C:\Language_Projects\Language_Projects\Python\Flagship_1\part-number-datasets-cleaning\data\data_cleaned\pn-toyota.csv", index=False)

  df_toyota_rn_cols.to_csv("C:\Language_Projects\Language_Projects\Python\Flagship_1\part-number-datasets-cleaning\data\data_cleaned\pn-toyota.csv", index=False)


In [25]:
def import_datasets_to_db(df, df_name=None):

    '''
    Import the df cleaned to the db on postgreSQL

    The connection settings are read from the standard PostgreSQL
    environment variables (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT).
    All but the password fall back to the local development defaults; the
    password is never stored in the notebook and is asked for interactively
    when PGPASSWORD is not set.

    Parameters:
        df (DataFrame): main df to import to the postgreSQL db.
        df_name (str): name of the table that receives the df. An existing
            table with that name is replaced.

    Returns:
        Whatever DataFrame.to_sql returns for the write.

    Raises:
        ValueError: if df_name is missing or is not a non-empty string.
    '''
    import os
    from getpass import getpass
    from urllib.parse import quote_plus

    # The former default was the `str` type itself, which is not a usable table name
    if not isinstance(df_name, str) or not df_name:
        raise ValueError("df_name must be a non-empty string with the target table name")

    # Setting up the connection with the PostgreSQL
    dbname = os.environ.get("PGDATABASE", "prescreen_diag_data_api")
    user = os.environ.get("PGUSER", "postgres")
    host = os.environ.get("PGHOST", "localhost")
    port = os.environ.get("PGPORT", "5432")
    password = os.environ.get("PGPASSWORD")
    if password is None:
        password = getpass(f"PostgreSQL password for {user}: ")

    # String connection for SQLAlchemy (using psycopg2 as driver).
    # User and password are URL-escaped so characters such as '@' or ':'
    # cannot break the connection string.
    engine = create_engine(
        f"postgresql+psycopg2://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{dbname}"
    )

    try:
        return df.to_sql(df_name, engine, if_exists='replace', index=False)
    finally:
        # Release the pooled connections held by this one-off engine
        engine.dispose()

In [26]:
# Call the function to import the cleaned df into the PostgreSQL db as table
# 'part_numbers_toyota' (the function writes with if_exists='replace', so an
# existing table with that name is replaced)
import_datasets_to_db(df_toyota_rn_cols, 'part_numbers_toyota')

573