In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Render every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Read the merged-columns logo-design CSV and show it as the cell output.
Data = pd.read_csv(filepath_or_buffer='../data/data_with_merged_cols_logo_design.csv')
Data

Unnamed: 0,Location,Time,Category,Que/Ans,Urgent,High,Verified,Additional,Frequent
0,"London, SE9",46m ago,Logo Design,What type of organisation is this for? Persona...,0,0,1,1,0
1,"Whitby, YO22",9h ago,Logo Design,What type of organisation is this for? Busines...,0,0,1,0,0
2,"Coleford, GL16",10h ago,Logo Design,What type of organisation is this for? Busines...,0,0,1,1,0
3,"Luton, LU1",11h ago,Logo Design,What type of organisation is this for? Busines...,0,0,1,1,0
4,"Haverhill, CB9",13h ago,Logo Design,What type of organisation is this for? Busines...,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...
357,"Marlow, SL7",1w ago,Logo Design,What type of organisation is this for? Small O...,0,0,0,1,0
358,"Rayleigh, SS6",1w ago,Logo Design,What type of organisation is this for? Busines...,0,0,1,0,0
359,"Slough, SL2",1w ago,Logo Design,What type of organisation is this for? Busines...,0,0,0,1,1
360,Manchester,1w ago,Logo Design,What type of organisation is this for? Persona...,0,0,1,1,0


In [2]:
# Keep an independent (deep) copy of the frame as loaded; deep=True is the default of DataFrame.copy.
Data_F = Data.copy(deep=True)
Data_F

Unnamed: 0,Location,Time,Category,Que/Ans,Urgent,High,Verified,Additional,Frequent
0,"London, SE9",46m ago,Logo Design,What type of organisation is this for? Persona...,0,0,1,1,0
1,"Whitby, YO22",9h ago,Logo Design,What type of organisation is this for? Busines...,0,0,1,0,0
2,"Coleford, GL16",10h ago,Logo Design,What type of organisation is this for? Busines...,0,0,1,1,0
3,"Luton, LU1",11h ago,Logo Design,What type of organisation is this for? Busines...,0,0,1,1,0
4,"Haverhill, CB9",13h ago,Logo Design,What type of organisation is this for? Busines...,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...
357,"Marlow, SL7",1w ago,Logo Design,What type of organisation is this for? Small O...,0,0,0,1,0
358,"Rayleigh, SS6",1w ago,Logo Design,What type of organisation is this for? Busines...,0,0,1,0,0
359,"Slough, SL2",1w ago,Logo Design,What type of organisation is this for? Busines...,0,0,0,1,1
360,Manchester,1w ago,Logo Design,What type of organisation is this for? Persona...,0,0,1,1,0


In [3]:
import re

# Regex pattern for each known question -> column header that receives its answer.
# Only the literal trailing "?" is escaped. An extra unescaped "?" in front of it
# would act as a quantifier and make the preceding letter optional
# (e.g. "local?\\?" also accepts "loca?").
questions = {
    "What type of organisation is this for\\?": "What type of organisation is this for?",
    "Do you already have a logo\\?": "Do you already have a logo?",
    "How many logo designs are you looking for\\?": "How many logo designs are you looking for?",
    "How soon would you like the project to begin\\?": "How soon would you like the project to begin?",
    # The doubled "??" in this header is deliberate left untouched: it is the column name used in the output.
    "When do you need this service completed\\?": "When do you need this service completed??",
    "Can we help with any other business needs\\?": "Can we help with any other business needs?",
    "Do you want someone local\\?": "Do you want someone local?",
}


def extract_answers(row):
    """Split one 'Que/Ans' string into a Series holding one answer per known question.

    Parameters
    ----------
    row : str
        Text in which known questions are each followed by their answer.
        Any non-string value (e.g. NaN for an empty cell) is treated as
        containing no answers instead of raising inside ``re.split``.

    Returns
    -------
    pd.Series
        Indexed by the header values of ``questions``. A question that does
        not occur in ``row`` maps to ``None``; if a question occurs more than
        once, the first answer is kept.
    """
    extracted = {v: None for v in questions.values()}

    # Missing / non-text cells have nothing to split.
    if not isinstance(row, str):
        return pd.Series(extracted)

    # The capturing group makes re.split keep the matched question text, so
    # the result has the shape [prefix, q1, a1, q2, a2, ...].
    pattern = "(" + "|".join(questions.keys()) + ")"
    parts = re.split(pattern, row)

    for raw_question, raw_answer in zip(parts[1::2], parts[2::2]):
        q = raw_question.strip()
        a = raw_answer.strip()
        for k, v in questions.items():
            if re.fullmatch(k, q):
                if extracted[v] is None:
                    extracted[v] = a
                elif v == "Additional Details":
                    # Append multiple Additional Details if present.
                    # NOTE(review): no entry in `questions` currently maps to
                    # "Additional Details", so this branch is unreachable and
                    # "Additional details ..." text stays inside the preceding
                    # answer - confirm whether that is intended.
                    extracted[v] += " | " + a

    return pd.Series(extracted)

# Expand each 'Que/Ans' string into one column per question, then remove every
# run of non-ASCII characters (encoding artifacts like Â, â€¦, etc.) from the values.
result_df = (
    Data['Que/Ans']
    .apply(extract_answers)
    .replace({r'[^\x00-\x7F]+': ''}, regex=True)
)

# Show the first rows of the cleaned frame
result_df.head()

Unnamed: 0,What type of organisation is this for?,Do you already have a logo?,How many logo designs are you looking for?,How soon would you like the project to begin?,When do you need this service completed??,Can we help with any other business needs?,Do you want someone local?
0,Personal,No - I do not have a logo,1 concept,Less than 3 months,As soon as possible,None Additional details Froom my name Additi...,
1,Business,No - I do not have a logo,1 concept,It has already begun,As soon as possible,,
2,Business,No - I do not have a logo,1 concept,Less than 2 months,As soon as possible,Web Design,
3,Business,No - I do not have a logo,1 concept,I'm not sure,In the next few days,None Additional details I want to make a logo...,
4,Business,No - I do not have a logo,1 concept,Less than 3 months,In the next few weeks,None Additional details I install Kitchen wor...,


In [4]:
# Append the extracted answer columns to Data. Columns with the same names that
# are already present (i.e. this cell was executed before) are dropped first, so
# re-running the cell replaces them instead of adding duplicate columns.
Data = pd.concat(
    [Data.drop(columns=result_df.columns, errors='ignore'), result_df],
    axis=1,
)

In [5]:
# Columns to remove from Data before saving
columns_to_drop = ['Time', 'Que/Ans', 'Category', 'Additional Details']

# errors='ignore' skips any listed column that is not present in the DataFrame
Data = Data.drop(columns=columns_to_drop, errors='ignore')

pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', None)   # Show all rows

# Wrap Data in a DataFrame and write it to CSV without the index column
df = pd.DataFrame(Data)
df.to_csv('../data/sample_data_for_prediction_logo_design.csv', index=False)

df


Unnamed: 0,Location,Urgent,High,Verified,Additional,Frequent,What type of organisation is this for?,Do you already have a logo?,How many logo designs are you looking for?,How soon would you like the project to begin?,When do you need this service completed??,Can we help with any other business needs?,Do you want someone local?
0,"London, SE9",0,0,1,1,0,Personal,No - I do not have a logo,1 concept,Less than 3 months,As soon as possible,None Additional details Froom my name Additi...,
1,"Whitby, YO22",0,0,1,0,0,Business,No - I do not have a logo,1 concept,It has already begun,As soon as possible,,
2,"Coleford, GL16",0,0,1,1,0,Business,No - I do not have a logo,1 concept,Less than 2 months,As soon as possible,Web Design,
3,"Luton, LU1",0,0,1,1,0,Business,No - I do not have a logo,1 concept,I'm not sure,In the next few days,None Additional details I want to make a logo...,
4,"Haverhill, CB9",0,0,1,1,0,Business,No - I do not have a logo,1 concept,Less than 3 months,In the next few weeks,None Additional details I install Kitchen wor...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,"Marlow, SL7",0,0,0,1,0,Small Organisation,No - I do not have a logo,1 concept,Less than 3 months,I'm flexible,None Additional details . Hello design Addit...,
358,"Rayleigh, SS6",0,0,1,0,0,Business,No - I do not have a logo,1 concept,Less than 1 month,In next few months,,
359,"Slough, SL2",0,0,0,1,1,Business,No - I do not have a logo,2 designs,Less than 3 months,As soon as possible,None Additional details Logo required for sma...,
360,Manchester,0,0,1,1,0,Personal,No - I do not have a logo,3 designs,Less than 2 weeks,As soon as possible,None Additional details I&#039;m in search of...,
