In [1]:
import pandas as pd
import re

In [2]:
# Read .csv file
df_years_extracted = pd.read_csv("E:\Language_Projects\Language_Projects\Python\Flagship_1\pdf_data_extract\data\df_years_extracted.csv")

  df_years_extracted = pd.read_csv("E:\Language_Projects\Language_Projects\Python\Flagship_1\pdf_data_extract\data\df_years_extracted.csv")


In [4]:
def extract_body_code(df, new_column, column, pattern):
    '''Function to capture the data using the stabilished 
    pattern for year, body code and engine size'''
    df[new_column] = df[column].str.findall(pattern, re.MULTILINE)

    # findall method extracts the desired data from each row and place it in a new row with list form.
    # .str.join method converts the list to a unique string: 
    # [2001, 2002] -> 2001, 2002 (using the separator ', ') 
    df[new_column] = df[new_column].str.join(', ')

    # Return the df after data extraction
    return df

## Body Code Extraction Strategy

The body codes are located in two possible positions within the string: either at the beginning of the string, immediately after the vehicle year, or after the first line break. For greater efficiency in extracting the body code, this is the body code extraction strategic:

1. Extract the body code after the first line break.
2. If not found, extract the body code at the beginning of the string, immediately after the vehicle year.

### AFTER THE FIRST LINE BREAK

In [None]:
# First body code capture, after line break '\n'
# \n - line break
# [A-Z0-9] - upper letter followed by number
# {2} - 2 digits
# \s+ - 1 or more whitespaces
body_code_pattern_after_line_break = r'\n([A-Z0-9]{2})\s+-'

In [5]:
# Extract the body code
df_body_code_extracted_pattern_after_line_break = extract_body_code(df_years_extracted, 'Body', 'CALIBRATION', body_code_pattern_after_line_break)

In [7]:
# Filter the empty rows and save in a variable to check for next pattern
df_bc_and_bc_desc = df_body_code_extracted_pattern_after_line_break[df_body_code_extracted_pattern_after_line_break['Body'] == '']

In [8]:
# Check df info
df_bc_and_bc_desc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2559 entries, 13 to 15588
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   CALIBRATION         2559 non-null   object
 1   Year                2553 non-null   object
 2   TYPE                2559 non-null   object
 3   OLD PART NUMBER(S)  2097 non-null   object
 4   NEW PART NUMBER(s)  2559 non-null   object
 5   Body                2559 non-null   object
dtypes: object(6)
memory usage: 139.9+ KB


### AT THE BEGINNING OF THE STRING 

In [67]:
# Pattern to capture the body codes at the beginning of the string
# (? - find this group but does not capture
# <= - lookbehind, check if the pattern is before but does not capture
# \d{4}\s) - 4 digits and 1 whitespace
# (?:\s[A-Z0-9]{2})* - capture 0 or more body codes all followed by whitespaces
# \b - make sure finishes at the end of the last body code
body_code_pattern_beginning_string = r'(?<=\d{4}\s)([A-Z0-9]{2}(?:\s[A-Z0-9]{2})*)\b'

In [68]:
# Call the function to extract the remainig body codes
df_body_code_beginning_string = extract_body_code(df_bc_and_bc_desc, 'BC', 'CALIBRATION', body_code_pattern_beginning_string)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_column] = df[column].str.findall(pattern, re.MULTILINE)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_column] = df[new_column].str.join(', ')


In [69]:
# Drop column body
df_body_code_beginning_string_dropped_column = df_body_code_beginning_string.drop(columns=['Body'])

In [70]:
# Rename the column from BC to Body
df_body_code_beginning_string_renamed_column = df_body_code_beginning_string_dropped_column.rename(columns={'BC': 'Body'})

In [71]:
# Replace ' ' between the body codes for ', '
df_body_code_beginning_string_renamed_column['Body'] = df_body_code_beginning_string_renamed_column['Body'].str.replace(' ', ', ', regex=False) 

In [72]:
# Change the column order to match the main df
df_body_code_beginning_string_renamed_column = df_body_code_beginning_string_renamed_column[['CALIBRATION', 'TYPE', 'OLD PART NUMBER(S)', 'NEW PART NUMBER(s)', 'Year', 'Body']]

In [73]:
# Merge the main df with the df with the extracted codes at the beginning of the string
# left_index, right_index - the index will be the key match between both dataframes, such as a match ID on both dfs
# outer: use union of keys from both frames
dfs_body_code_merged = df_body_code_extracted_pattern_after_line_break.merge(df_body_code_beginning_string_renamed_column, how='outer', left_index=True, right_index=True)

In [74]:
# Check the columns name
dfs_body_code_merged.columns

Index(['CALIBRATION_x', 'Year_x', 'TYPE_x', 'OLD PART NUMBER(S)_x',
       'NEW PART NUMBER(s)_x', 'Body_x', 'CALIBRATION_y', 'TYPE_y',
       'OLD PART NUMBER(S)_y', 'NEW PART NUMBER(s)_y', 'Year_y', 'Body_y'],
      dtype='object')

In [75]:
# Drop y columns
dfs_body_code_merged_dropped_columns = dfs_body_code_merged.drop(columns=['CALIBRATION_y', 'TYPE_y', 'OLD PART NUMBER(S)_y',
       'NEW PART NUMBER(s)_y', 'Year_y'])

In [76]:
# Replace empty strings at column Body_x
# ^$ - empty string, nothing before and nothing after
# str.replace() - used ONLY with strings
dfs_body_code_merged_dropped_columns['Body_x'] = dfs_body_code_merged_dropped_columns['Body_x'].replace(r'^$', pd.NA, regex=True) 

In [77]:
# Fill the empty body codes at column body_x (main df) from column body_y
dfs_body_code_merged_dropped_columns['Body_x'] = dfs_body_code_merged_dropped_columns['Body_x'].fillna(dfs_body_code_merged_dropped_columns['Body_y'])

In [78]:
# Remove column body_y 
df_merged_drop_y_column = dfs_body_code_merged_dropped_columns.drop(columns=['Body_y'])

In [79]:
# Remove _x from columns and return the columns to their original names
df_merged_renamed_x_columns = df_merged_drop_y_column.rename(columns={
    'CALIBRATION_x': 'CALIBRATION',
    'TYPE_x': 'TYPE',
    'OLD PART NUMBER(S)_x': 'OLD PART NUMBER(S)',
    'NEW PART NUMBER(s)_x': 'NEW PART NUMBER(s)',
    'Year_x': 'Year',
    'Body_x': 'Body',
    'BC Description_x': 'BC Description'
    })

In [80]:
# Confirming how many rows at body column are filled with empty string ''
len(df_merged_renamed_x_columns[df_merged_renamed_x_columns['Body'] == ''])

105

In [81]:
# Confirming how many rows are empty
len(df_merged_renamed_x_columns[df_merged_renamed_x_columns['Body'] == pd.NA])

0

In [83]:
# Export to .csv file
df_merged_renamed_x_columns.to_csv("E:\Language_Projects\Language_Projects\Python\Flagship_1\pdf_data_extract\data\df_year_bc_extracted.csv", index=False)

  df_merged_renamed_x_columns.to_csv("E:\Language_Projects\Language_Projects\Python\Flagship_1\pdf_data_extract\data\df_year_bc_extracted.csv", index=False)
