<a href="https://colab.research.google.com/github/rashikajakhmola/NLP/blob/main/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Get Request From User

Example prompts:
1. I am planning to order the BMW M8 with a sunroof or panorama glass roof sky lounge, and the M Sport Package on 12th April 2018. Is this configuration possible?

2. Hello, is the X7 xDrive40i available without a panorama glass roof and with the EU Comfort Package. I need the vehicle on the 8th of November 2024.

3. I want to order a BMW iX with right-hand drive configuration. I will be ordering it at the start of October 2022.

In [None]:
def get_request():
    """Ask the user for a free-text request on standard input.

    Returns:
        str: The text typed by the user, exactly as returned by ``input``.
    """
    return input("Please state your request: ")

init_prompt = get_request()

Please state your request: I want to order a BMW iX with right-hand drive configuration. I will be ordering it at the start of October 2022.


### 2. Remove Determiners and Punctuation from the User Prompt

In [None]:
import spacy

def remove_determiners(sentence, nlp=None):
    """Drop determiner and punctuation tokens from ``sentence``.

    Args:
        sentence: Text to clean.
        nlp: Optional callable that turns text into an iterable of tokens
            exposing ``pos_`` and ``text`` (for example an already loaded
            spaCy pipeline). When omitted, ``en_core_web_sm`` is loaded on
            this call.

    Returns:
        str: The texts of the remaining tokens joined by single spaces.
    """
    if nlp is None:
        # Loading the model is the expensive part; callers that clean many
        # sentences should load it once and pass it in through ``nlp``.
        nlp = spacy.load("en_core_web_sm")

    # Keep every token that is neither a determiner nor punctuation.
    kept = [token.text for token in nlp(sentence) if token.pos_ not in ('DET', 'PUNCT')]
    return ' '.join(kept)

prompt = remove_determiners(init_prompt)
prompt

'I want to order BMW iX with right hand drive configuration I will be ordering it at start of October 2022'

### 3. Extract Date from the prompt

In [None]:
def extract_dates(sentence, nlp=None):
    """Return the text of the first entity labelled ``DATE`` in ``sentence``.

    Args:
        sentence: Text to scan.
        nlp: Optional callable that turns text into an object whose ``ents``
            attribute is an iterable of entities exposing ``label_`` and
            ``text`` (for example an already loaded spaCy pipeline). When
            omitted, ``en_core_web_sm`` is loaded on this call.

    Returns:
        str: The first ``DATE`` entity's text, or ``""`` when there is none.
        Any further dates in the sentence are ignored.
    """
    if nlp is None:
        # Loading the model is the expensive part; pass a loaded pipeline
        # through ``nlp`` to reuse it across calls.
        nlp = spacy.load("en_core_web_sm")

    for entity in nlp(sentence).ents:
        if entity.label_ == "DATE":
            return entity.text
    return ""

date_prompt = extract_dates(prompt)
date_prompt

'October 2022'

In [None]:
from datetime import datetime

def convert_date_format(date_string):
    """Convert a written date into ``YYYY-MM-DD``.

    Accepted inputs are ``'<day> <Month> <Year>'``, ``'<day> of <Month>
    <Year>'`` (the day may carry an ordinal suffix such as ``12th``) and
    ``'<Month> <Year>'``, which is read as the first day of that month.

    Args:
        date_string: The date text to convert.

    Returns:
        str: The date formatted as ``'%Y-%m-%d'``, or the literal string
        ``'Invalid date format'`` when the text matches none of the layouts.
    """
    import re  # local import: the cell defining this function only imports datetime

    text = date_string.strip()

    # A two-token input is treated as 'Month Year' and pinned to day 1.
    if len(text.split()) == 2:
        text = '1 ' + text

    # Strip ordinal suffixes only when they follow a digit. A blanket
    # str.replace('st', '') would also turn 'August' into 'Augu'.
    text = re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', text, flags=re.IGNORECASE)

    for layout in ('%d of %B %Y', '%d %B %Y'):
        try:
            return datetime.strptime(text, layout).strftime('%Y-%m-%d')
        except ValueError:
            continue

    return 'Invalid date format'

converted_date = convert_date_format(date_prompt)
converted_date

'2022-10-01'

In [None]:
def wrap_date_in_dict(date):
    """Wrap ``date`` in a one-element list stored under the ``'dates'`` key."""
    wrapped = dict(dates=[date])
    return wrapped

date_dict = wrap_date_in_dict(converted_date)
date_dict

{'dates': ['2022-10-01']}

### 4. Extract Model From the Prompt

In [None]:
import csv

def find_matching_phrases(input_sentence, codes_file='Codes.csv'):
    """Find model descriptions from the codes CSV that occur in a sentence.

    Only rows whose ``Type`` column equals ``'model'`` are considered. A row
    matches exactly when its full ``Description`` is a substring of
    ``input_sentence``; otherwise it is a candidate when the first word of
    its description is a substring of the sentence. Matching is
    case-sensitive.

    Args:
        input_sentence: Sentence to search.
        codes_file: Path of the CSV with ``Type`` and ``Description``
            columns. Defaults to ``'Codes.csv'``.

    Returns:
        dict: ``{'modelTypeCodes': [description]}`` for an exact match (if
        several rows match exactly, the last one in file order is kept),
        else ``{'modelTypeCodes': [...]}`` listing every first-word
        candidate, else ``{}``.
    """
    exact_match = None
    first_word_matches = []

    with open(codes_file, 'r', newline='') as file:
        for row in csv.DictReader(file):
            if row['Type'] != 'model':
                continue

            description = row['Description']
            # A blank description would "match" every sentence, because ''
            # is a substring of any string, so such rows are skipped.
            if not description or not description.strip():
                continue

            if description in input_sentence:
                exact_match = description
            elif description.split()[0] in input_sentence:
                first_word_matches.append(description)

    if exact_match is not None:
        return {'modelTypeCodes': [exact_match]}
    if first_word_matches:
        return {'modelTypeCodes': first_word_matches}
    return {}

# Example usage
model_dict = find_matching_phrases(prompt)
model_dict

{'modelTypeCodes': ['iX xDrive50', 'iX xDrive40']}

In [None]:
import csv

def replace_with_abbreviation(data_dict, codes_file='Codes.csv'):
    """Translate descriptions in ``data_dict`` into their abbreviations.

    Args:
        data_dict: Mapping whose values are lists of descriptions.
        codes_file: Path of the CSV with ``Description`` and
            ``Abbreviation`` columns. Defaults to ``'Codes.csv'``.

    Returns:
        dict: A new mapping with the same keys. Every list item that equals
        a CSV description is replaced by that row's abbreviation; other
        items are kept. ``data_dict`` and its lists are left untouched, so
        the cell that produced them still shows accurate output.
    """
    with open(codes_file, 'r', newline='') as csv_file:
        # If a description appears more than once, the last row wins.
        lookup = {row['Description']: row['Abbreviation'] for row in csv.DictReader(csv_file)}

    translated = {}
    for key, values in data_dict.items():
        if isinstance(values, list):
            translated[key] = [lookup.get(value, value) for value in values]
        else:
            # Non-list values are passed through unchanged.
            translated[key] = values
    return translated

# Example usage
fin_model_dict = replace_with_abbreviation(model_dict)
fin_model_dict

{'modelTypeCodes': ['21CF', '11CF']}

### 5. Split the Prompt on Adpositions and Conjunctions

In [None]:
def split_sentence(sentence):
    """Cut ``sentence`` into chunks that each begin at a boundary word.

    The sentence is split on whitespace. A new chunk starts whenever a word,
    compared in lower case, is one of the boundary words. The boundary word
    stays at the front of the chunk it opens.

    Returns:
        list[str]: The chunks, in order, with words joined by single spaces.
    """
    # The two-word entries can never equal a single whitespace-split word.
    boundary_words = ['with', 'without', 'and', 'or', 'and not', 'or not']

    chunks = []
    pending = []
    for token in sentence.split():
        opens_new_chunk = token.lower() in boundary_words
        if opens_new_chunk and pending:
            chunks.append(' '.join(pending))
            pending = []
        pending.append(token)

    if pending:
        chunks.append(' '.join(pending))
    return chunks

prompt_list = split_sentence(prompt)
prompt_list

['I want to order BMW iX',
 'with right hand drive configuration I will be ordering it at start of October 2022']

### 6. Create a dataframe with Adpositions and Features

In [None]:
import pandas as pd

def split_phrases(phrases):
    """Build an ``Adposition`` / ``Feature`` table from sentence chunks.

    A chunk that begins with one of the connecting words (``with``,
    ``without``, ``and``, ``or``, ``and not``, ``or not``) starts a new row:
    the connecting word becomes ``Adposition`` (lower-cased) and the rest of
    the chunk becomes ``Feature``. Chunks without a leading connecting word
    are appended to the feature text of the current row. Text seen before
    the first connecting word is discarded.

    Args:
        phrases: Iterable of chunk strings.

    Returns:
        pandas.DataFrame: One row per connecting word, with columns
        ``['Adposition', 'Feature']``.
    """
    import re  # local import: the cell defining this function only imports pandas

    adp = ['with', 'without', 'and', 'or', 'and not', 'or not']

    # Match whole words only, so that "order ..." is not read as "or", and
    # try the longest alternatives first, so that "and not" wins over "and".
    alternatives = sorted(adp, key=len, reverse=True)
    leading_adp = re.compile(
        r'^(' + '|'.join(r'\s+'.join(re.escape(word) for word in alt.split()) for alt in alternatives) + r')\b\s*(.*)$',
        re.IGNORECASE | re.DOTALL,
    )

    # Rows are collected in a list and the frame is built once at the end,
    # instead of concatenating a frame per row.
    records = []
    current_adposition = None
    current_phrase = ''

    for phrase in phrases:
        match = leading_adp.match(phrase)
        if match:
            if current_adposition:
                records.append({'Adposition': current_adposition, 'Feature': current_phrase.strip()})
            current_adposition = ' '.join(match.group(1).lower().split())
            current_phrase = match.group(2)
        else:
            current_phrase += ' ' + phrase

    if current_adposition:
        records.append({'Adposition': current_adposition, 'Feature': current_phrase.strip()})

    return pd.DataFrame(records, columns=['Adposition', 'Feature'])

adp_df = split_phrases(prompt_list)
adp_df

Unnamed: 0,Adposition,Feature
0,with,right hand drive configuration I will be order...


### 7. Update the Dataframe by Comparing Feature names to CSV File

In [None]:
import pandas as pd
from itertools import permutations

def update_dataframe(csv_file, dataframe):
    """Replace free-text features with the matching CSV description.

    A feature matches a description when the description's lower-cased
    words, joined by single spaces in any order, occur as a substring of the
    lower-cased feature. The first matching description in file order wins.
    Features that match nothing are kept as they are. Rows whose feature is
    missing or blank are dropped.

    Args:
        csv_file: Path of a CSV with a ``Description`` column.
        dataframe: Frame with a ``Feature`` column. It is not modified.

    Returns:
        pandas.DataFrame: A new frame that keeps the original index labels
        of the surviving rows.

    Note:
        The number of word orders tried grows factorially with the number of
        words in a description.
    """
    csv_data = pd.read_csv(csv_file)

    # Split every description into words once, not once per row. Missing
    # and blank descriptions are skipped: a blank one would match anything.
    candidates = []
    for description in csv_data['Description'].dropna():
        words = str(description).lower().split()
        if words:
            candidates.append((description, words))

    def _canonical(feature):
        haystack = feature.lower()
        for description, words in candidates:
            # Word orders are generated lazily, so the search stops at the
            # first hit instead of building every permutation up front.
            if str(description).lower() == haystack or any(
                ' '.join(order) in haystack for order in permutations(words)
            ):
                return description
        return feature

    features = dataframe['Feature']
    is_blank = features.isna() | features.astype(str).str.strip().eq('')

    # Work on a copy so the caller's frame is neither shrunk nor rewritten.
    result = dataframe.loc[~is_blank].copy()
    result['Feature'] = [_canonical(feature) for feature in result['Feature']]
    return result

csv_file = 'Codes.csv'

updated_df = update_dataframe(csv_file, adp_df)
updated_df

Unnamed: 0,Adposition,Feature
0,with,Right Hand Drive


### 8. Add Feature_Type column to the dataframe

In [None]:
import pandas as pd

def add_feature_type(dataframe, csv_file):
    """Return a copy of ``dataframe`` with a ``Feature_Type`` column added.

    Args:
        dataframe: Frame with a ``Feature`` column. It is not modified, so
            re-running the calling cell gives the same result.
        csv_file: Path of a CSV with ``Description`` and ``Type`` columns.

    Returns:
        pandas.DataFrame: The copy, where ``Feature_Type`` holds the CSV
        ``Type`` of the row whose ``Description`` equals the feature, or
        ``''`` when no description matches.
    """
    csv_data = pd.read_csv(csv_file)

    # If a description appears more than once, the last row's type wins.
    feature_type_mapping = dict(zip(csv_data['Description'], csv_data['Type']))

    result = dataframe.copy()
    result['Feature_Type'] = [
        feature_type_mapping.get(feature, '') for feature in result['Feature']
    ]
    return result

csv_file_path = 'Codes.csv'
output_df = add_feature_type(updated_df, csv_file_path)
output_df

Unnamed: 0,Adposition,Feature,Feature_Type
0,with,Right Hand Drive,steering_wheel_config


### 9. Convert Features to Abbreviations

In [None]:
import pandas as pd

def replace_features_with_abbreviations(csv_file, dataframe):
    """Return a copy of ``dataframe`` with features replaced by abbreviations.

    Args:
        csv_file: Path of a CSV with ``Description`` and ``Abbreviation``
            columns.
        dataframe: Frame with a ``Feature`` column. It is not modified.

    Returns:
        pandas.DataFrame: The copy, where every ``Feature`` equal to a CSV
        description is replaced by that row's abbreviation. Features with
        no matching description keep their original value.
    """
    csv_data = pd.read_csv(csv_file)

    # Built straight from the two columns; last duplicate description wins.
    abbreviation_dict = dict(zip(csv_data['Description'], csv_data['Abbreviation']))

    result = dataframe.copy()
    # map() yields NaN for unknown features; fillna restores the original text.
    result['Feature'] = result['Feature'].map(abbreviation_dict).fillna(result['Feature'])
    return result

csv_file = 'Codes.csv'
codes_df = replace_features_with_abbreviations(csv_file, output_df)
codes_df

Unnamed: 0,Adposition,Feature,Feature_Type
0,with,RL,steering_wheel_config


In [None]:
import pandas as pd

def combine_columns(df):
    """Merge ``Adposition`` and ``Feature`` into one ``Adp+Feature`` column.

    Args:
        df: Frame with ``Adposition`` and ``Feature`` columns. It is not
            modified, so cells that still read those columns from the same
            frame keep working when re-run.

    Returns:
        pandas.DataFrame: A new frame without ``Adposition`` and
        ``Feature``, with ``Adp+Feature`` (the two values joined by a
        single space) appended as the last column.
    """
    combined = df['Adposition'] + ' ' + df['Feature']
    result = df.drop(columns=['Adposition', 'Feature'])
    result['Adp+Feature'] = combined
    return result

result_df = combine_columns(codes_df)
result_df

Unnamed: 0,Feature_Type,Adp+Feature
0,steering_wheel_config,with RL


### 10. Convert the Dataframe into a Dictionary of Features

In [None]:
import pandas as pd

def combine_features(dataframe):
    """Group the ``Adp+Feature`` strings by their ``Feature_Type``.

    Rows are visited in order. Strings that share a feature type are
    concatenated with a single space between them.

    Returns:
        dict: Feature type mapped to its combined ``Adp+Feature`` string.
    """
    grouped = {}
    for _, record in dataframe.iterrows():
        kind = record['Feature_Type']
        term = record['Adp+Feature']
        if kind not in grouped:
            grouped[kind] = term
            continue
        grouped[kind] = grouped[kind] + " " + term
    return grouped
    
feature_dict = combine_features(result_df)
feature_dict

{'steering_wheel_config': 'with RL'}

### 11. Translate the Adpositions into Boolean Operators

In [None]:
import re

def replace_words(input_dict):
    """Translate connecting words in each value into operator symbols.

    The mapping is ``and``/``with`` -> ``+``, ``without`` -> ``-``,
    ``or`` -> ``/``, ``and not`` -> ``+-`` and ``or not`` -> ``/-``. Words
    are matched case-insensitively and only as whole words.

    Args:
        input_dict: Mapping whose values are strings.

    Returns:
        dict: A new mapping with the same keys and the translated strings.

    Note:
        Every whole-word occurrence is translated, wherever it sits in the
        string, so a feature code spelled like one of these words would be
        translated as well.
    """
    operator_map = {
        "and": "+",
        "with": "+",
        "without": "-",
        "or": "/",
        "and not": "+-",
        "or not": "/-"
    }

    # A single pattern with the longest alternatives first. Substituting
    # "and" before "and not" would leave "+ not" behind and the two-word
    # operators would never be produced.
    alternatives = sorted(operator_map, key=len, reverse=True)
    pattern = re.compile(
        r'\b(?:' + '|'.join(r'\s+'.join(re.escape(part) for part in word.split()) for word in alternatives) + r')\b',
        re.IGNORECASE,
    )

    def _to_operator(match):
        # Normalise case and inner whitespace before the lookup.
        return operator_map[' '.join(match.group(0).lower().split())]

    return {key: pattern.sub(_to_operator, value) for key, value in input_dict.items()}
    
boolean_dict = replace_words(feature_dict)
boolean_dict

{'steering_wheel_config': '+ RL'}

In [None]:
def _group_around_first_or(value):
    """Put '(' before the term left of the first '/' and ')' at the end."""
    slash = value.index('/')
    terms_before = value[:slash].split()
    if not terms_before:
        # Nothing precedes the '/', so there is no term to open a group on.
        return value

    # Locate the last term before the '/' by position. Replacing by text
    # would also hit other occurrences of the same characters.
    start = value.rfind(terms_before[-1], 0, slash)
    opened = value[:start] + '(' + value[start:]

    # Close the group after the last non-whitespace character.
    trimmed = opened.rstrip()
    return trimmed + ')' + opened[len(trimmed):]


def process_dictionary(input_dict):
    """Parenthesise values that mix '/' with '+' or '-'.

    For each such value, '(' is inserted before the term immediately left
    of the first '/', and ')' is appended after the last non-whitespace
    character. For example ``'+ A / B'`` becomes ``'+ (A / B)'``. Values
    without '/' or without '+'/'-' are copied unchanged, as are values
    where nothing precedes the first '/'.

    Args:
        input_dict: Mapping whose values are strings.

    Returns:
        dict: A new mapping with the same keys and the processed strings.
    """
    output_dict = {}

    for key, value in input_dict.items():
        if '/' in value and ('+' in value or '-' in value):
            value = _group_around_first_or(value)
        output_dict[key] = value

    return output_dict

fin_bool_dict = process_dictionary(boolean_dict)
fin_bool_dict

{'steering_wheel_config': '+ RL'}

In [None]:
def combine_values(dictionary):
    """Concatenate all values of ``dictionary`` into one formula string.

    The values are joined in iteration order with no separator.

    Returns:
        dict: ``{'booleanFormulas': [joined_string]}``.
    """
    formula = ''
    for fragment in dictionary.values():
        formula += fragment
    return {'booleanFormulas': [formula]}

bool_form_dict = combine_values(fin_bool_dict)
bool_form_dict

{'booleanFormulas': ['+ RL']}

### 12. Combine all 3 dictionaries into the Final Request Body

In [None]:
def combine_and_split_dictionaries(*dicts):
    """Merge dictionaries into one request body per model type code.

    The first dictionary may carry a ``'modelTypeCodes'`` list. When that
    list has more than one entry, one body is produced per code, each with a
    single-element ``'modelTypeCodes'`` list. Otherwise a single body is
    produced. The key/value pairs of the remaining dictionaries are then
    added to every body, with later dictionaries overriding earlier ones.

    Args:
        *dicts: The dictionaries to merge, with the model dictionary first.

    Returns:
        list[dict]: The request bodies, or ``[]`` when called without
        arguments. Bodies are built from deep copies, so the input
        dictionaries are never modified and the bodies share no lists.
    """
    import copy  # local import: no import cell precedes this function's cell

    if not dicts:
        return []

    base, extras = dicts[0], dicts[1:]
    # A missing key (no model recognised) is treated like an empty list.
    model_type_codes = base.get('modelTypeCodes', [])

    result = []
    if len(model_type_codes) > 1:
        for code in model_type_codes:
            body = copy.deepcopy(base)
            body['modelTypeCodes'] = [code]
            result.append(body)
    else:
        result.append(copy.deepcopy(base))

    for body in result:
        for extra in extras:
            body.update(copy.deepcopy(extra))

    return result

request_body = combine_and_split_dictionaries(fin_model_dict, bool_form_dict,date_dict)
request_body

[{'modelTypeCodes': ['21CF'],
  'booleanFormulas': ['+ RL'],
  'dates': ['2022-10-01']},
 {'modelTypeCodes': ['11CF'],
  'booleanFormulas': ['+ RL'],
  'dates': ['2022-10-01']}]