In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

pd.set_option('display.max_columns', None)
Data = pd.read_csv('../data/data_with_merged_cols_business_consulting.csv')
Data

Unnamed: 0,Location,Time,Category,Que/Ans,Urgent,High,Verified,Additional,Frequent
0,"Kegworth, Leicestershire",1w ago,Business Consulting,Have you used business consulting services bef...,0,0,1,1,0
1,"London, TW9",1w ago,Business Consulting,Have you used business consulting services bef...,1,0,1,1,0
2,"London, KT3",1w ago,Business Consulting,Have you used business consulting services bef...,1,0,0,1,0
3,"Sheffield, S13",1w ago,Business Consulting,Have you used business consulting services bef...,1,0,1,1,0
4,"Cobham, KT11",1w ago,Business Consulting,Have you used business consulting services bef...,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...
382,"Kensington, Greater London",1d ago,Business Consulting,Have you used business consulting services bef...,0,0,1,0,1
383,", Wembley, HA9",1d ago,Business Consulting,Have you used business consulting services bef...,1,0,1,1,0
384,"Broxbourne, EN10",22h ago,Business Consulting,Have you used business consulting services bef...,0,0,0,0,1
385,"Rotherham, S65",18h ago,Business Consulting,Have you used business consulting services bef...,0,0,1,0,0


In [2]:
Data_F = Data.copy()
Data_F

Unnamed: 0,Location,Time,Category,Que/Ans,Urgent,High,Verified,Additional,Frequent
0,"Kegworth, Leicestershire",1w ago,Business Consulting,Have you used business consulting services bef...,0,0,1,1,0
1,"London, TW9",1w ago,Business Consulting,Have you used business consulting services bef...,1,0,1,1,0
2,"London, KT3",1w ago,Business Consulting,Have you used business consulting services bef...,1,0,0,1,0
3,"Sheffield, S13",1w ago,Business Consulting,Have you used business consulting services bef...,1,0,1,1,0
4,"Cobham, KT11",1w ago,Business Consulting,Have you used business consulting services bef...,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...
382,"Kensington, Greater London",1d ago,Business Consulting,Have you used business consulting services bef...,0,0,1,0,1
383,", Wembley, HA9",1d ago,Business Consulting,Have you used business consulting services bef...,1,0,1,1,0
384,"Broxbourne, EN10",22h ago,Business Consulting,Have you used business consulting services bef...,0,0,0,0,1
385,"Rotherham, S65",18h ago,Business Consulting,Have you used business consulting services bef...,0,0,1,0,0


In [3]:
import re

# Define all possible question patterns and their desired column headers
questions = {
    "Have you used business consulting services before\\?": "Have you used business consulting services before?",
    "How long has the business been running\\?": "How long has the business been running?",
    "What is the business's annual turnover/sales\\?": "What is the business annual turnover/sales?",
    "How many employees do you have\\?": "How many employees do you have?",
    "What industry is your business in\\?": "What industry is your business in?",
    "Which type\\(s\\) of consulting are you interested in\\?": "Which type(s) of consulting are you interested in?",
    "What are your goals for this service\\?": "What are your goals for this service?",
    "How long do you need a consultant\\?": "How long do you need a consultant?",
    "How would you like to work with the consultant\\?": "How would you like to work with the consultant?",

}

# Function to extract answers for each row
def extract_answers(row):
    extracted = {v: None for v in questions.values()}
    pattern = "(" + "|".join(questions.keys()) + ")"
    parts = re.split(pattern, row)

    i = 1
    while i < len(parts) - 1:
        q = parts[i].strip()
        a = parts[i + 1].strip()
        for k, v in questions.items():
            if re.fullmatch(k, q):
                if extracted[v] is None:
                    extracted[v] = a
                elif v == "Additional Details":
                    # Append multiple Additional Details if present
                    extracted[v] += " | " + a
        i += 2

    return pd.Series(extracted)

# Apply the function to your DataFrame
result_df = Data['Que/Ans'].apply(extract_answers)
# Clean encoding artifacts (like Â, â€¦, etc.)
result_df = result_df.replace({r'[^\x00-\x7F]+': ''}, regex=True)

# Preview the cleaned DataFrame
result_df.head()

Unnamed: 0,Have you used business consulting services before?,How long has the business been running?,What is the business annual turnover/sales?,How many employees do you have?,What industry is your business in?,Which type(s) of consulting are you interested in?,What are your goals for this service?,How long do you need a consultant?,How would you like to work with the consultant?
0,No,Start-up/not started yet,,,Services,Mentoring,Clarification on start-up requirements,I'm not sure,Onsite
1,No,Start-up/not started yet,,,Services,Company strategy,"Increase marketing ROI, Increase sales/lead ge...",I'm not sure,Remotely Additional details Quality Addition...
2,No,20 What is the business&#039;s annual turnove...,,25 or more,Retail/consumer goods,extand,Business growth,Over several weeks,Onsite Additional details we want to expand o...
3,No,Less than 1 year What is the business&#039;s ...,,,Online Video Producer,Finance &amp; planning,Streamlined operations/administration,I'm not sure,Remotely Additional details Hi! I&#039;m Leo....
4,No,5 years or more What is the business&#039;s a...,,10 - 24,Construction,Finance &amp; planning,Cost reduction/efficiency improvements,I'm not sure,I would like to discuss this with the pro Add...


In [4]:
Data = pd.concat([Data, result_df], axis=1)

In [5]:
columns_to_drop = [
    'Time','Que/Ans','Category','Additional Details'
]

# Drop the columns (if they exist in the DataFrame)
Data = Data.drop(columns=[col for col in columns_to_drop if col in Data.columns])

pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', None)   # Show all rows


df = pd.DataFrame(Data)
df.to_csv('../data/sample_data_for_prediction_business_consulting.csv', index=False)

df


Unnamed: 0,Location,Urgent,High,Verified,Additional,Frequent,Have you used business consulting services before?,How long has the business been running?,What is the business annual turnover/sales?,How many employees do you have?,What industry is your business in?,Which type(s) of consulting are you interested in?,What are your goals for this service?,How long do you need a consultant?,How would you like to work with the consultant?
0,"Kegworth, Leicestershire",0,0,1,1,0,No,Start-up/not started yet,,,Services,Mentoring,Clarification on start-up requirements,I'm not sure,Onsite
1,"London, TW9",1,0,1,1,0,No,Start-up/not started yet,,,Services,Company strategy,"Increase marketing ROI, Increase sales/lead ge...",I'm not sure,Remotely Additional details Quality Addition...
2,"London, KT3",1,0,0,1,0,No,20 What is the business&#039;s annual turnove...,,25 or more,Retail/consumer goods,extand,Business growth,Over several weeks,Onsite Additional details we want to expand o...
3,"Sheffield, S13",1,0,1,1,0,No,Less than 1 year What is the business&#039;s ...,,,Online Video Producer,Finance &amp; planning,Streamlined operations/administration,I'm not sure,Remotely Additional details Hi! I&#039;m Leo....
4,"Cobham, KT11",0,0,1,1,1,No,5 years or more What is the business&#039;s a...,,10 - 24,Construction,Finance &amp; planning,Cost reduction/efficiency improvements,I'm not sure,I would like to discuss this with the pro Add...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,"Kensington, Greater London",0,0,1,0,1,No,Start-up/not started yet,,,Restaurant/food,"Business plans, Company strategy, Finance &amp...","Business growth, Increase sales/lead generation",I'm not sure,Remotely
383,", Wembley, HA9",1,0,1,1,0,No,Start-up/not started yet,,,ecommerce,"Business plans, Market research &amp; surveys,...",Increase sales/lead generation,For ongoing engagement,Remotely Additional details WANT AN ECOMMERCE...
384,"Broxbourne, EN10",0,0,0,0,1,No,all What is the business&#039;s annual turnov...,,1 - 4,Services,"Business plans, Company strategy, Finance &amp...","Business growth, Cost reduction/efficiency imp...",I'm not sure,I would like to discuss this with the pro
385,"Rotherham, S65",0,0,1,0,0,No,Start-up/not started yet,,,Dog grooming,Help to set up,Business growth,I'm not sure,Onsite
