In [1]:
import pdfplumber
import pandas as pd
import re
import os
from dateparser.search import search_dates

In [2]:
# Store the .pdf file dir into a var
dir = '../data'

In [3]:
# Collect every .pdf found directly under the data directory.
# os.listdir() returns entries in arbitrary order, so sort the paths to make
# the file selected below reproducible from one run to the next.
pdf_paths = sorted(
    os.path.join(dir, filename)
    for filename in os.listdir(dir)
    if filename.endswith('.pdf')
)

# Stop here with a clear message instead of a NameError in a later cell
if not pdf_paths:
    raise FileNotFoundError(f"No .pdf file found in {dir!r}")

# Print every candidate path
for pdf_path in pdf_paths:
    print(pdf_path)

# When several PDFs exist, keep the last one in sorted order
pdf_file_fullpath = pdf_paths[-1]

../data\part-number-list-2025-08-08.pdf


In [4]:
# Read the pdf document
with pdfplumber.open(pdf_file_fullpath) as pdf:
    # Get the page number
    page = pdf.pages[0]
    # Extract the table (s) 
    table = page.extract_table()

In [5]:
# Get the type
type(page)

pdfplumber.page.Page

In [6]:
# Show the data type of the method pages to find out an method iterable to print all pages
# The data type is a list with all pages, which is possible to interate over this method to extract the table
type(pdf.pages)

list

In [7]:
# Check the data type of the data extracted
type(table)

list

In [8]:
# The data extracted is saved into a list of lists
# Print the first item of the list
# Position 0 at the table are the column names
print(table[0])

['CALIBRATION', 'TYPE', 'OLD PART NUMBER(S)', 'NEW PART NUMBER(s)', 'TSB(S)', 'RECALL(S)']


In [9]:
# Accumulate the rows of every page in one list. Without it, each loop
# iteration would overwrite the previous page's rows and only the rows of the
# last page would appear in the final result.
all_tables = []

# Re-open the document here: the `with` block that created `pdf` earlier has
# already closed the file, so its pages should not be read through that handle.
with pdfplumber.open(pdf_file_fullpath) as pdf:
    # Loop over the pdf.pages list
    for page in pdf.pages:
        # extract_table() returns None when no table is detected on a page;
        # skip those pages instead of letting extend(None) raise TypeError
        table_extracted = page.extract_table()
        if table_extracted:
            all_tables.extend(table_extracted)

In [10]:
# Convert the list to dataframe
df = pd.DataFrame(all_tables[1:], columns=all_tables[0])

In [11]:
# Check the data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17093 entries, 0 to 17092
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   CALIBRATION         17093 non-null  object
 1   TYPE                17093 non-null  object
 2   OLD PART NUMBER(S)  17093 non-null  object
 3   NEW PART NUMBER(s)  17093 non-null  object
 4   TSB(S)              17093 non-null  object
 5   RECALL(S)           17093 non-null  object
dtypes: object(6)
memory usage: 801.4+ KB


In [12]:
# Flag the rows that are repeated header lines (the table header is extracted
# again with the rows of each page).
# The whole cell must equal 'CALIBRATION' (ignoring surrounding whitespace):
# a substring test would also flag data rows whose text merely mentions the word.
# na=False yields a plain boolean Series even when a cell is missing.
ser_calibration = df['CALIBRATION'].str.fullmatch(r'\s*CALIBRATION\s*', na=False)

In [13]:
# Get the indexes with the rows containing the str 'CALIBRATION'
index_true = ser_calibration[ser_calibration == True].index

In [14]:
# Drop all indexes with the header 'CALIBRATION', 'TYPE', 'OLD PART NUMBER(S)', 'NEW PART NUMBER(s)', 'TSB(S)', 'RECALL(S)
df_calibration_deleted = df.drop(index=index_true)

In [15]:
# Check the columns available
df_calibration_deleted.columns

Index(['CALIBRATION', 'TYPE', 'OLD PART NUMBER(S)', 'NEW PART NUMBER(s)',
       'TSB(S)', 'RECALL(S)'],
      dtype='object')

In [466]:
# Drop the 2 last columns
df_dropped_columns = df_calibration_deleted.drop(columns=['TSB(S)', 'RECALL(S)'])

In [None]:
# Regex pattern for YEAR:
# (?:^|ONLY#\s*|AND\s*) - non-capturing prefix: start of the string, or the
#                         literal 'ONLY#' / 'AND' followed by optional whitespace
# ( ... )               - the single capturing group, holding one of two alternatives:
#   1) (?:19|20)\d{2} or 9[1-9], optionally repeated after one or more whitespaces
#      -> a run of whitespace-separated years, each a 4-digit 19xx/20xx year
#         or a 2-digit year from 91 to 99
#   2) (?:19|20)\d{2}\s*-\s*(?:19|20)\d{2}
#      -> two 4-digit years separated by a hyphen with optional whitespace
# (?:...) is used for the inner groups so that findall/extract do not return tuples.
# NOTE(review): alternative 2 starts with the same 4-digit year that alternative 1
# accepts, and alternatives are tried in order, so alternative 2 appears never to be
# selected - confirm whether ranges such as '2002 - 2004' are meant to be captured whole.
year_pattern = r'(?:^|ONLY#\s*|AND\s*)((?:(?:19|20)\d{2}|9[1-9])(?:\s+(?:(?:19|20)\d{2}|9[1-9]))*|(?:19|20)\d{2}\s*-\s*(?:19|20)\d{2})'

In [None]:
# Regex pattern for BODY CODE:
# ^                   - at the beginning of the string
# (?:\d{2,4}\s+)+     - one or more tokens of 2 to 4 digits, each followed by
#                       whitespace (not captured)
# (?:[\d.]+L?\s+)?    - optionally, a token made of digits and dots with an optional
#                       trailing 'L', followed by whitespace (not captured)
# ([A-Z][A-Z0-9])     - the captured value: an uppercase letter followed by an
#                       uppercase letter or a digit (exactly 2 characters)
body_code_pattern = r'^(?:\d{2,4}\s+)+(?:[\d.]+L?\s+)?([A-Z][A-Z0-9])'

In [None]:
# Regex pattern for ENGINE SIZE (e.g. '2.5L'):
# \d   - exactly one digit
# \.   - a literal period (.)
# \d+  - one or more digits
# L    - a literal 'L' (required by this pattern)
# NOTE(review): because the trailing 'L' is mandatory, a value written without it
# (e.g. '3.6') is not matched - confirm whether 'L?' was intended.
engine_size_pattern = r'\d\.\d+L'

In [375]:
def extract_year(df, pattern, replace_dict=None):
    '''Fill the still-missing entries of ``df['Year']`` from ``df['CALIBRATION']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'CALIBRATION' column of strings. It is modified in
        place; the 'Year' column is created when it does not exist yet.
    pattern : str
        Regular expression containing at least one capture group. The first
        group of the first match in each row is the text stored as the year.
    replace_dict : dict, optional
        Mapping from a whole 'Year' value to its normalised form
        (e.g. ``{"95": "1995"}``). Not applied when None.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so calls can be chained.
    '''
    # Let the function work on a frame that has no 'Year' column yet
    if 'Year' not in df.columns:
        df['Year'] = None

    # First capture group per row; NaN where the pattern does not match
    ser_year_extracted = df['CALIBRATION'].str.extract(pattern)[0]

    # Only rows that still have no year are written, so a value captured by
    # an earlier call is never overwritten by a later pattern
    missing_year = df['Year'].isna()
    df.loc[missing_year, 'Year'] = ser_year_extracted

    # The replacement is optional: apply it only when a mapping was given
    if replace_dict is not None:
        df['Year'] = df['Year'].replace(replace_dict)

    return df

### Year: 95

In [467]:
# Adding column year to the df
df_dropped_columns['Year'] = None

In [468]:
pattern_95 = r'(^95(?=\s\d.))'

In [469]:
# Dict to the correct year format
dict_95 = {
    "95": "1995"
}

In [470]:
df_extracted_95 = extract_year(df_dropped_columns, pattern_95, dict_95)

### Year: 96

In [471]:
pattern_96 = r'(^96(?=\s[A-Z0-9]))'

In [472]:
# Dict to the correct year format
dict_96 = {
    "96": "1996"
}

In [473]:
df_extracted_96 = extract_year(df_extracted_95, pattern_96, dict_96)

### Year: 97

In [474]:
pattern_97 = r'(^97(?=\s[A-Z0-9(]))'

In [475]:
# Dict to the correct year format
dict_97 = {
    "97": "1997"
}

In [476]:
df_extracted_97 = extract_year(df_extracted_96, pattern_97, dict_97)

### Year: 1997

In [477]:
pattern_1997 = r'(^1997(?=\s[A-Z]))'

In [478]:
df_extracted_1997 = extract_year(df_extracted_97, pattern_1997)

### Year: 98

In [479]:
pattern_98 = r'(^98(?=\s[A-Z0-9]))'

In [480]:
# Dict to the correct year format
dict_98 = {
    "98": "1998"
}

In [481]:
df_extracted_98 = extract_year(df_extracted_1997, pattern_98, dict_98)

### Year: 98/99

In [482]:
pattern_98_99 = r'(\d{2}/\d{2})'

In [483]:
# Dict to the correct year format
dict_98_99 = {
    "98/99": "1998, 1999"
}

In [484]:
df_extracted_98_99 = extract_year(df_extracted_98, pattern_98_99, dict_98_99)

### Year: 99

In [409]:
pattern_99 = r'(^99(?=\s[A-Z0-9]))'

In [485]:
# Dict to the correct year format
dict_99 = {
    "99": "1999"
}

In [486]:
df_extracted_99 = extract_year(df_extracted_98_99, pattern_99, dict_99)

### Year: 99-2000

In [487]:
pattern_99_2000 = r'(^99-2000(?=\s[A-Z]))'

In [488]:
# Dict to the correct year format
dict_99_2000 = {
    "99-2000": "1999, 2000"
}

In [489]:
df_extracted_99_2000 = extract_year(df_extracted_99, pattern_99_2000, dict_99_2000)

### Year: 2000, 01

In [490]:
pattern_2000_01 = r'(^2000, 01(?=\s[A-Z]))'

In [491]:
# Dict to the correct year format
dict_2000_01 = {
    "2000, 01": "2000, 2001"
}

In [492]:
df_extracted_2000_01 = extract_year(df_extracted_99_2000, pattern_2000_01, dict_2000_01)

### Year: 2000

In [493]:
pattern_2000 = r'(^2000(?=\s[A-Z0-9]))'

In [494]:
df_extracted_2000 = extract_year(df_extracted_2000_01, pattern_2000)

### Year: 2001

In [495]:
pattern_2001 = r'(^2001(?=\s[A-Z0-9]))'

In [502]:
df_extracted_2001 = extract_year(df_extracted_2000, pattern_2001)

### Year: 2001 (2000.5) ## NOT CAPTURED

In [557]:
# Match the literal text '2001 (2000.5)' at the start of the string.
# The parentheses and the dot are escaped so they are matched literally
# instead of acting as a regex group / wildcard, and the space between the
# two parts is included.
pattern_2001_2000_5 = r'(^2001 \(2000\.5\)(?=\s[A-Z/]+))'

In [514]:
dict_2001_2000_5 = {
    "2001 (2000.5)": "2000, 2001"
}

In [558]:
# Pass the replacement dict as well, so a captured '2001 (2000.5)' is
# normalised to '2000, 2001' like the other multi-year entries
df_extracted_2001_2000_5 = extract_year(df_extracted_2001, pattern_2001_2000_5, dict_2001_2000_5)

### Year: 2002

In [559]:
pattern_2002 = r'(^2002(?=\s[A-Z0-9]))'

In [560]:
df_extracted_2002 = extract_year(df_extracted_2001_2000_5, pattern_2002)

### Year: 2002 - 2004

In [572]:
pattern_2002_2004 = r'(^2002 - 2004(?=\s[A-Z]))'

In [576]:
dict_2002_2004 = {
    "2002 - 2004": "2002, 2003, 2004"
}

In [577]:
df_extracted_2002_2004 = extract_year(df_extracted_2002, pattern_2002_2004, dict_2002_2004)

### Year: 2003 2004

In [579]:
pattern_2003_2004 = r'(^2003 2004(?=\s[A-Z]))'

In [580]:
dict_2003_2004 = {
    "2003 2004": "2003, 2004"
}

In [None]:
df_extracted_2003_2004 = extract_year(df_extracted_2002_2004, pattern_2003_2004, dict_2003_2004)

### Year: 2003

In [585]:
pattern_2003 = r'(^2003(?=\s[A-Z0-9]))'

In [586]:
df_extracted_2003 = extract_year(df_extracted_2003_2004, pattern_2003)

### Year: 2003-2004

In [590]:
pattern_2003_2004 = r'(^2003-2004(?=\s[A-Z0-9]))'

In [591]:
dict_2003_2004 = {
    '2003-2004': '2003, 2004'    
}

In [594]:
df_extracted_2003_2004 = extract_year(df_extracted_2003, pattern_2003_2004, dict_2003_2004)

In [603]:
# The dot is escaped so that only the literal '2003.5' matches;
# an unescaped '.' would accept any character between '2003' and '5'
pattern_2003_5 = r'(^2003\.5(?=\s[A-Z0-9]))'

In [604]:
df_extracted_2003_5 = extract_year(df_extracted_2003_2004, pattern_2003_5)

### Year: 2004

In [611]:
pattern_2004 = r'(^2004(?=\s[A-Z]))'

In [612]:
df_extracted_2004 = extract_year(df_extracted_2003_5, pattern_2004)

### Year: 2004 - 2006

In [614]:
pattern_2004_2006 = r'(^2004 - 2006(?=\s[A-Z]))'

In [618]:
dict_2004_2006 = {
    "2004 - 2006": "2004, 2005, 2006"    
}

In [619]:
df_extracted_2004_2006 = extract_year(df_extracted_2004, pattern_2004_2006, dict_2004_2006)

### Year: 2004.5

In [627]:
# The dot is escaped so that only the literal '2004.5' matches;
# an unescaped '.' would accept any character between '2004' and '5'
pattern_2004_5 = r'(^2004\.5(?=\s[A-Z0-9]))'

In [628]:
df_extracted_2004_5 = extract_year(df_extracted_2004_2006, pattern_2004_5)

### Year: 2005

In [636]:
pattern_2005 = r'(^2005(?=\s[A-Z]))'

In [637]:
df_extracted_2005 = extract_year(df_extracted_2004_5, pattern_2005)

### Year: 2005 2006

In [649]:
pattern_2005_ws_2006 = r'(^2005 2006(?=\s[A-Z]))'

In [650]:
dict_2005_ws_2006 = {
    "2005 2006": "2005, 2006"    
}

In [651]:
df_extracted_2005_h_2006 = extract_year(df_extracted_2005, pattern_2005_ws_2006, dict_2005_ws_2006)

### Year: 2005 - 2007

In [652]:
pattern_2005_2007 = r'(^2005 - 2007(?=\s[A-Z]))'

In [653]:
dict_2005_2007 = {
    "2005 - 2007": "2005, 2006, 2007"    
}

In [656]:
df_extracted_2005_2007 = extract_year(df_extracted_2005_h_2006, pattern_2005_2007, dict_2005_2007)

### Year: 2005 - 2006

In [658]:
pattern_2005_h_2006 = r'(^2005 - 2006(?=\s[A-Z]))'

In [663]:
dict_2005_h_2006 = {
    "2005 - 2006": "2005, 2006"    
}

In [664]:
df_extracted_2005_h_2006 = extract_year(df_extracted_2005_2007, pattern_2005_h_2006, dict_2005_h_2006)

### Year: 2005.5

In [666]:
# The dot is escaped so that only the literal '2005.5' matches;
# an unescaped '.' would accept any character between '2005' and '5'
pattern_2005_5 = r'(^2005\.5(?=\s[A-Z]))'

In [667]:
df_extracted_2005_5 = extract_year(df_extracted_2005_h_2006, pattern_2005_5)

In [669]:
df_extracted_2005_5[df_extracted_2005_5['Year'].str.contains("2005") == True]

Unnamed: 0,CALIBRATION,TYPE,OLD PART NUMBER(S),NEW PART NUMBER(s),Year
115,2005 LX/WK EGS52 TCM\nLX - 300 / MAGNUM / CHARGER,TCM,04692390AA\n04692390AB\n04692390AC\n04692390AD...,04692390AI,2005
116,2005 2006 LX WK WH SRT-8 EGS52 TCM\nLX - 300 /...,TCM,04692392AA\n04692392AB\n04692392AC\n04692392AD...,04692392AK,"2005, 2006"
167,2005 - 2007 RG EATXIIIB\nR2 - CARAVAN / VOYAGE...,TCM,04727535AB\n04727535AC\n04727535AD\n04727535AE,04727535AF,"2005, 2006, 2007"
174,2005 RG 2.8L AUTO FLAT FLOOR\nRG - VOYAGER (GRAZ),ECM,04727770AA\n04727770AB\n04727770AC\n04727770AD...,04727770AF,2005
195,2005 RG 3.3L AUTO JAPAN EARLY\nRG - VOYAGER (G...,PCM,04748430AA\n04748430AB\n04748430AC\n04748430AD...,04748430AE,2005
...,...,...,...,...,...
5515,2005 WK 4.7L 2WD\nWK - GRAND CHEROKEE,PCM,56044515AE\n56044551AG\n56044551AH\n56044551AI...,68055272AA,2005
5516,2005 WK 4.7L 4WD\nWK - GRAND CHEROKEE,PCM,56044515AE\n56044552AH\n56044552AI\n56044552AJ...,68055273AA,2005
6122,2005 KJ 2.8L AUTO BUX WITH METALIC GLOW\nPLUG\...,ECM,56044561BF\n56044561CA\n56044561CB\n56044561CC...,68090465AA,2005
6123,2005 KJ 2.8L MAN BUX WITH METALIC GLOW\nPLUG\n...,ECM,56044558BF\n56044558CA\n56044558CB\n56044558CC...,68090470AA,2005


In [None]:
# Print the head to see the results
# NOTE(review): `df_y_extracted` is not assigned in any cell visible above this
# one, so on a fresh kernel this cell is expected to raise NameError - confirm
# which frame it should refer to and assign it explicitly.
df_y_extracted.head(5)

In [None]:
# Turn the whitespace between years into ', '.
# Whitespace that already follows a comma is left alone: replacing every
# space would turn an existing '2018, 2019' into '2018,, 2019'.
df_year_column = df_y_extracted['Year'].str.replace(r'(?<!,)\s+', ', ', regex=True)

In [None]:
# Concat column updated
df_y_extracted['Year'] = df_year_column

In [None]:
# Print the head to see the changes
df_y_extracted.head(5)

In [None]:
def get_list_of_years(df):
    '''Return the values of ``df['Year']`` with bare two-digit years expanded to 19xx.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Year' column. It is only read, never modified.

    Returns
    -------
    list
        One entry per row, in row order. A value that is exactly two digits
        (e.g. '95') becomes '1995'; every other value, including missing
        values (None/NaN) and multi-year strings such as '2003, 2004', is
        returned unchanged.
    '''
    # List to store the updated years
    processed_years = []

    for year in df['Year']:
        # The isinstance check keeps missing values from reaching len(),
        # and isdigit() avoids prefixing '19' to a 2-character non-year
        if isinstance(year, str) and len(year) == 2 and year.isdigit():
            processed_years.append('19' + year)
        else:
            processed_years.append(year)

    return processed_years

In [None]:
# Call the function to add '19' to the years with 2 digits
years_updated_list = get_list_of_years(df_y_extracted)

In [None]:
# Update the df
df_y_extracted['Year'] = years_updated_list

In [None]:
# Convert the year_list to a unique string, 
# split method converts the unique string to a list of strings separated by ', '
list_of_years = ', '.join(years_updated_list).split(', ')

In [None]:
# Iterator under the list to confirm what years is different than 4 digits
for year in list_of_years:
    if len(year) != 4:
        print(year)

In [None]:
# Check which rows have year 2018, 
ser_incorrect_year = df_y_extracted['Year'].str.contains('2018,')

In [None]:
# Get the indexes
ser_incorrect_year[ser_incorrect_year == True]

In [None]:
# Print the row with the years to see the problem
df_y_extracted['Year'][3826]

In [None]:
# Check how it is on the origin row
df_y_extracted['CALIBRATION'][3826]
df_y_extracted['CALIBRATION'][3829]

In [None]:
# Replace 2 commas by only 1 comma.
# Applied to the whole column instead of to hard-coded row labels, so the fix
# still works when the row numbering of the source PDF changes; rows without
# a doubled comma are left unchanged by the replacement.
df_y_extracted['Year'] = df_y_extracted['Year'].str.replace(',, ', ', ', regex=False)

### YEAR CHECKING

In [None]:
# Get a list with the years
list_of_years_after_correction = get_list_of_years(df_y_extracted)

In [None]:
# Create a list with years
list_of_updated_years = ', '.join(list_of_years_after_correction).split(', ')

In [None]:
# Iterator under the list to confirm what years is different than 4 digits
for year in list_of_updated_years:
    if len(year) != 4:
        print(year)

In [None]:
# Convert the list to df
df_years = pd.DataFrame(list_of_updated_years)

In [None]:
# Get the indexes that show empty at column 0 (year column)
df_years[df_years[0] == '']

In [None]:
# Give every empty entry the placeholder '0' so all years can be converted to int.
# The rows are selected by value (== '') rather than by hard-coded row labels,
# so the cell keeps working when the positions of the empty entries change.
df_years.loc[df_years[0] == '', 0] = '0'

In [None]:
# Every entry of column 0 (the year strings) parsed with int(), in row order
years_converted_to_int_list = [int(year_text) for year_text in df_years[0]]

In [None]:
# Sort the list
years_converted_to_int_list.sort()

In [None]:
# Use the method set to get a list with UNIQUE YEARS to check if there are incorrect years.
print(set(years_converted_to_int_list))

### BODY CODE EXTRACTION

In [None]:
# First body code capture, after a line break '\n'
# \n            - a line break
# ([A-Z0-9]{2}) - captured value: exactly 2 characters, each an uppercase
#                 letter or a digit (in any combination)
# \s+           - 1 or more whitespaces
# -             - a literal hyphen
body_code_pattern_after_line_break = r'\n([A-Z0-9]{2})\s+-'

In [None]:
def extract_body_code(df, new_column, column, pattern):
    '''Store every match of ``pattern`` found in ``df[column]`` as one string per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from and write to. It is modified in place.
    new_column : str
        Name of the column that receives the result (created or overwritten).
    column : str
        Name of the string column to search.
    pattern : str
        Regular expression, applied with ``re.MULTILINE``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``new_column`` holding the matches of
        each row joined by ', ' (an empty string when nothing matched).
    '''
    # findall returns, for each row, the list of all matches
    matches_per_row = df[column].str.findall(pattern, flags=re.MULTILINE)

    # Collapse each list into a single string: ['2001', '2002'] -> '2001, 2002'
    df[new_column] = matches_per_row.str.join(', ')

    return df


# Other cells of this notebook call this helper under the name `extract_data`;
# bind that name too so those calls resolve on a fresh kernel.
extract_data = extract_body_code

In [None]:
# Extract the body code
df_body_code_extracted_pattern_after_line_break = extract_data(df_y_extracted, 'Body', 'CALIBRATION', body_code_pattern_after_line_break)

In [None]:
# Print the head to see the result
df_body_code_extracted_pattern_after_line_break.head()

In [None]:
# Body code capture after line break '\n'
# \n - line break
# ([A-Z0-9]{2}) - group of capture to the body code, 2 digits of letters or numbers
# \s-\s - capture exactly whitespace - whitespace
# ([^\n]+) - capture the entire text after - until the next line break
body_code_pattern_descr = r'\n([A-Z0-9]{2}\s-\s[^\n]+)'

In [None]:
# Call the function to extract the body code and all data after the body code
df_body_code_extracted_pattern_after_line_break['BC Description'] = df_body_code_extracted_pattern_after_line_break['CALIBRATION'].str.findall(body_code_pattern_descr, re.MULTILINE)

In [None]:
# findall method capture all body code descriptions and return all of them info a list.
# This method extract all from the list to be updated in the same column as strings out of list
# join(', ') method converts the strings INSIDE A LIST IN EACH ROW to a unique string IN EACH ROW  
df_body_code_extracted_pattern_after_line_break['BC Description'] = df_body_code_extracted_pattern_after_line_break['BC Description'].str.join(', ')

In [None]:
# Keep only the rows whose body code is still empty, to try the next pattern on them.
# .copy() makes this an independent frame: a later cell adds a column to it, and
# writing to a filtered slice would trigger pandas' SettingWithCopyWarning.
df_bc_and_bc_desc = df_body_code_extracted_pattern_after_line_break[df_body_code_extracted_pattern_after_line_break['Body'] == ''].copy()

In [None]:
# Check the size of the df with the body codes captured
len(list(df_body_code_extracted_pattern_after_line_break.index))

In [None]:
# Counting the remaining rows to capture the body code
len(list(df_bc_and_bc_desc.index))

In [None]:
# Pattern to capture the body codes at the beginning of the string
# (? - find this group but does not capture
# <= - lookbehind, check if the pattern is before but does not capture
# \d{4}\s) - 4 digits and 1 whitespace
# (?:\s[A-Z0-9]{2})* - capture 0 or more body codes all followed by whitespaces
# \b - make sure finishes at the end of the last body code
body_code_pattern_beginning_string = r'(?<=\d{4}\s)([A-Z0-9]{2}(?:\s[A-Z0-9]{2})*)\b'

In [None]:
# Call the function to extract the remainig body codes
df_body_code_beginning_string = extract_data(df_bc_and_bc_desc, 'BC', 'CALIBRATION', body_code_pattern_beginning_string)

In [None]:
# Drop column body
df_body_code_beginning_string_dropped_column = df_body_code_beginning_string.drop(columns=['Body'])

In [None]:
# Rename the column from BC to Body
df_body_code_beginning_string_renamed_column = df_body_code_beginning_string_dropped_column.rename(columns={'BC': 'Body'})

In [None]:
# Replace ' ' between thebody codes for ', '
df_body_code_beginning_string_renamed_column['Body'] = df_body_code_beginning_string_renamed_column['Body'].str.replace(' ', ', ', regex=False) 

In [None]:
# Change the column order to match the main df
df_body_code_beginning_string_renamed_column = df_body_code_beginning_string_renamed_column[['CALIBRATION', 'TYPE', 'OLD PART NUMBER(S)', 'NEW PART NUMBER(s)', 'Year', 'Body', 'BC Description']]

In [None]:
# Merge the main df with the df with the extracted codes at the beginning of the string
# left_index, right_index - the index will be the key match between both dataframes, such as a match ID on both dfs
# outer: use union of keys from both frames
dfs_body_code_merged = df_body_code_extracted_pattern_after_line_break.merge(df_body_code_beginning_string_renamed_column, how='outer', left_index=True, right_index=True)

In [None]:
# Check the columns name
dfs_body_code_merged.columns

In [None]:
# Drop y columns
dfs_body_code_merged_dropped_columns = dfs_body_code_merged.drop(columns=['CALIBRATION_y', 'TYPE_y', 'OLD PART NUMBER(S)_y',
       'NEW PART NUMBER(s)_y', 'Year_y', 'BC Description_y'])

In [None]:
# Replace empty strings at column Body_x
# ^$ - string vazia, nada antes e nada depis
# str.replace() - used ONLY with strings
dfs_body_code_merged_dropped_columns['Body_x'] = dfs_body_code_merged_dropped_columns['Body_x'].replace(r'^$', pd.NA, regex=True) 

In [None]:
# Fill the empty body codes at column body_x (main df) from column body_y
dfs_body_code_merged_dropped_columns['Body_x'] = dfs_body_code_merged_dropped_columns['Body_x'].fillna(dfs_body_code_merged_dropped_columns['Body_y'])

In [None]:
# Remove column body_y 
df_merged_drop_y_column = dfs_body_code_merged_dropped_columns.drop(columns=['Body_y'])

In [None]:
# Remove _x from columns and return the columns to their original names
df_merged_renamed_x_columns = df_merged_drop_y_column.rename(columns={
    'CALIBRATION_x': 'CALIBRATION',
    'TYPE_x': 'TYPE',
    'OLD PART NUMBER(S)_x': 'OLD PART NUMBER(S)',
    'NEW PART NUMBER(s)_x': 'NEW PART NUMBER(s)',
    'Year_x': 'Year',
    'Body_x': 'Body',
    'BC Description_x': 'BC Description'
    })

In [None]:
# Confirming how many rows at body column are filled with empty string ''
len(df_merged_renamed_x_columns[df_merged_renamed_x_columns['Body'] == ''])

In [None]:
# Confirming how many rows are empty (missing value in column Body).
# isna() is used because a comparison with pd.NA never evaluates to True,
# so `== pd.NA` cannot select the missing rows.
int(df_merged_renamed_x_columns['Body'].isna().sum())

In [None]:
# Function to create a new column with the body code length
def body_code_len(df):
    '''Add a 'BC Length' column holding the length of each value of ``df['Body']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Body' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the 'BC Length' column added or
        overwritten. A value that has no length (None, NaN, pd.NA) counts as 0.
    '''
    # Missing values do not support len(); count them as 0, like an empty string
    body_code_length = [
        len(code) if hasattr(code, '__len__') else 0
        for code in df['Body']
    ]

    # Add a body code length column to the main df
    df['BC Length'] = body_code_length
    return df

In [None]:
# Add a body code lenght column to the main df
df_merged_bc_len = body_code_len(df_merged_renamed_x_columns)

### BODY CODE CHECKING

In [None]:
# Check the body codes based on their length
df_merged_bc_len['BC Length'].value_counts()

In [None]:
# Method to capture the body codes individually
bc_pattern = r'[A-Z0-9]{2}'

In [None]:
# Create a list with the unique body codes of each row
unique_bc_list = []

# Iterate over column Body to get the string with the body codes
for code in df_merged_bc_len['Body']:
    if code != '':
        # findall returns every body code found in the string.
        # dict.fromkeys drops the duplicates while keeping the codes in the
        # order they first appear; a set would give an order that can differ
        # from one kernel run to the next.
        unique_bcs = dict.fromkeys(re.findall(bc_pattern, code))
        unique_bc_list.append(', '.join(unique_bcs))
    else:
        # Nothing to deduplicate in an empty string
        unique_bc_list.append(code)

In [None]:
# Update the main df
df_merged_bc_len['Body'] = unique_bc_list

In [None]:
# Call the function to update the bc len
df_bc_len_updated = body_code_len(df_merged_bc_len)

In [None]:
# Call the function to get body code length
df_bc_len_updated['BC Length'].value_counts()

In [None]:
# Check the body codes with the description to confirm they are all correct
# Creat a list with all bc_descriptions
bc_descriptions_list = []

# Iterate under the bc descr column to get all bc desc
for desc in df_bc_len_updated['BC Description']:
    bc_descriptions_list.append(desc)

In [None]:
# Convert the bc description list to a unique string
# split method converts the unique string to a list of string separated by ', '
bc_descr_list = ', '.join(bc_descriptions_list).split(', ')

In [None]:
# Create a list with unique bc descr
unique_bc_descr_list = set(bc_descr_list)

In [None]:
# Print the list
print(unique_bc_descr_list)

### ENGINE SIZE EXTRACTION

In [None]:
# Extract the engine size
df_year_body_engine_size = extract_data(df_bc_len_updated, 'Engine Size', 'CALIBRATION', engine_size_pattern)

In [None]:
# Print the head to see the results
df_year_body_engine_size.head(5)

In [None]:
# List with engine sizes updated
engine_size_list = []

# Iterate over column Engine size to add 'L' to engine sizes without L
for engine in df_year_body_engine_size['Engine Size']:
    if len(engine) == 3:
        engine_updated = engine + 'L'
        engine_size_list.append(engine_updated)
    else:
        engine_size_list.append(engine)

In [None]:
# Update the df with the updated engine sizes
df_year_body_engine_size['Engine Size'] = engine_size_list

In [None]:
# See the head to see the results
df_year_body_engine_size.head(7)

In [None]:
# Remove char '\n'
df_year_body_engine_size_removed_char = df_year_body_engine_size.replace(r'\n', ' ', regex=True)

In [None]:
# Print the head to see the results
df_year_body_engine_size_removed_char.head(7)

In [None]:
# Add , between whitespaces in the old part number column
df_year_body_engine_size_removed_char['OLD PART NUMBER(S)'] = df_year_body_engine_size_removed_char['OLD PART NUMBER(S)'].str.replace(' ', ', ', regex=False)

In [None]:
# Print the head to see the results
df_year_body_engine_size_removed_char.head(10)

In [None]:
# Create a list with all engine sizes to confirm if they are correct.
engine_size_list = []

# Iterate over the engine size column to get the engine size
for es in df_year_body_engine_size_removed_char['Engine Size']:
    engine_size_list.append(es)

In [None]:
# Convert the engine size list to a unique string,
# split method converts the unique string to a list of strings separated by ', '
list_of_engine_size = ', '.join(engine_size_list).split(', ')

In [None]:
# Create a list with unique engine size set
unique_list_of_engine_size = set(list_of_engine_size)

In [None]:
# Print the engine sizes to check if they are correct
print(unique_list_of_engine_size)

In [None]:
# Export the result to .csv file
# df_year_body_engine_size_removed_char.to_csv('C:\Language_Projects\Language_Projects\Python\Flagship_1\pdf_data_extract\data\chrysler_pdf_extract_to_csv.csv')