In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [None]:
# Show every column when a DataFrame is rendered (None turns off column truncation).
pd.options.display.max_columns = None

In [None]:
# Read the raw logo-design leads export (relative path).
raw_leads_csv = '../data/logo_design_data.csv'
Data = pd.read_csv(raw_leads_csv)

In [None]:
# Render the freshly loaded frame for a visual check (display only, no state change).
Data

In [None]:
# Independent snapshot of the raw data; a later cell reads "Credits" and "Lead Id"
# from it after `Data` itself has been reshaped.
Data_F = Data.copy(deep=True)

In [None]:
# Render the snapshot copy (display only, no state change).
Data_F

In [None]:
import pandas as pd
import re

# Split the free-text 'Que/Ans' column of `Data` into one column per known question.

# Regex for each question as it appears in the text -> column header for its answer.
# The "?" that ends each question is escaped so it is matched literally; an unescaped
# "?" would instead make the preceding letter optional.
questions = {
    r"What type of organisation is this for\?": "What type of organisation is this for?",
    r"Do you already have a logo\?": "Do you already have a logo?",
    r"How many logo designs are you looking for\?": "How many logo designs are you looking for?",
    r"How soon would you like the project to begin\?": "How soon would you like the project to begin?",
    # The header keeps its doubled "??" because the feature list uses that exact name.
    r"When do you need this service completed\?": "When do you need this service completed??",
    r"Can we help with any other business needs\?": "Can we help with any other business needs?",
    r"Do you want someone local\?": "Do you want someone local?",
}

# Single alternation wrapped in a capture group so that re.split keeps the matched
# question text between the answers. Built once instead of once per row.
_QUESTION_SPLITTER = re.compile("(" + "|".join(questions) + ")")


def extract_answers(row):
    """Extract the answer to every known question from one 'Que/Ans' cell.

    Parameters
    ----------
    row : str or any
        Text made of known questions, each followed by its answer. Anything that
        is not a string (for example NaN or None from an empty cell) is treated
        as having no answers.

    Returns
    -------
    pandas.Series
        Indexed by the headers in `questions`. Each value is the stripped text
        that follows the question, or None when the question does not occur.
        When a question occurs more than once, the first answer is kept.
    """
    extracted = {header: None for header in questions.values()}

    # Empty cells arrive as NaN/None; re.split would raise TypeError on them.
    if not isinstance(row, str):
        return pd.Series(extracted)

    # With one capture group the split result alternates:
    # [leading text, question, answer, question, answer, ...]
    parts = _QUESTION_SPLITTER.split(row)
    for matched_question, answer in zip(parts[1::2], parts[2::2]):
        matched_question = matched_question.strip()
        for pattern, header in questions.items():
            if extracted[header] is None and re.fullmatch(pattern, matched_question):
                extracted[header] = answer.strip()

    return pd.Series(extracted)

# Parse every 'Que/Ans' cell into per-question answer columns.
answers_raw = Data['Que/Ans'].apply(extract_answers)

# Remove runs of non-ASCII characters (mis-encoded text) from the parsed answers.
result_df = answers_raw.replace(to_replace=r'[^\x00-\x7F]+', value='', regex=True)

# Preview the cleaned answers.
result_df.head()

# Optional: Save to Excel or CSV
# result_df.to_csv('cleaned_landscaping_data.csv', index=False)
# result_df.to_excel('cleaned_landscaping_data.xlsx', index=False)


In [None]:
# Append the parsed answer columns next to the original columns (aligned on index).
# Note: running this cell a second time appends the same columns again.
Data = pd.concat(objs=[Data, result_df], axis="columns")

In [None]:
# Render the frame with the parsed answer columns attached (display only).
Data

In [None]:
# Columns removed before modelling: identifiers, free text, badges and the raw
# 'Que/Ans' column that has already been parsed.
columns_to_drop = [
    'S.no', 'Lead Id', 'Response', 'Country', 'Additional Details',
    'Initial', 'Name', 'Time', 'Badge 1', 'Additional',
    'Badge 2', 'Badge 3', 'Badge 4', 'Category', 'Que/Ans',
]

# errors="ignore" skips any listed column that is not present in the frame.
Data = Data.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# List the columns that remain after the drop (display only).
Data.columns

In [None]:
# Shorter working name for the prepared frame (same object, not a copy).
df = Data

# Column the model predicts.
target = "Credits"

# Model inputs: location, the lead flags, and the parsed question answers.
features = [
    'Location',
    'Urgent',
    'High',
    'Verified',
    'Frequent',
    'What type of organisation is this for?',
    'Do you already have a logo?',
    'How many logo designs are you looking for?',
    'How soon would you like the project to begin?',
    'When do you need this service completed??',
    'Can we help with any other business needs?',
    'Do you want someone local?',
]


In [None]:
# Keep only the rows that have a target value; the original row labels are retained.
has_target = df[target].notna()
df = df[has_target]

In [None]:

# Split the feature columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
feature_frame = df[features]
cat_features = list(feature_frame.select_dtypes(include='object').columns)
num_features = list(feature_frame.select_dtypes(include=['int64', 'float64']).columns)

In [None]:

# Missing categorical values become an explicit "Unknown" level.
df[cat_features] = df[cat_features].fillna(value="Unknown")

# Missing numerical values are replaced by that column's median.
numeric_medians = df[num_features].median()
df[num_features] = df[num_features].fillna(value=numeric_medians)

In [None]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap both splits as CatBoost Pools so the categorical columns are declared once.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)


In [None]:

# Hyper-parameters for the CatBoost regressor, gathered in one place.
model_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=8,
    loss_function='RMSE',
    cat_features=cat_features,
    verbose=100,  # log progress every 100 iterations
)
model = CatBoostRegressor(**model_params)

In [None]:
# Train model.
# Early stopping (and CatBoost's best-iteration selection) must not look at the test
# rows, otherwise the RMSE / R2 reported on X_test are optimistic. A validation split
# is therefore carved out of the training rows and used as eval_set; X_test / y_test
# stay unseen until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
fit_pool = Pool(X_fit, label=y_fit, cat_features=cat_features)
val_pool = Pool(X_val, label=y_val, cat_features=cat_features)

model.fit(fit_pool, eval_set=val_pool, early_stopping_rounds=50)

In [None]:
# Predict the target for the held-out test rows.
y_pred = model.predict(data=X_test)

In [None]:

# Score the test-set predictions: RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report both test-set metrics, one per line.
for metric_label, metric_value in (("RMSE", rmse), ("R2 Score", r2)):
    print(f"{metric_label}: {metric_value}")

In [None]:
# Feature columns of every modelled row (train and test together), used for scoring.
validated = df.loc[:, features]

In [None]:
# Score every row of the modelling frame `df` (not a separate new dataset).
# A plain DataFrame is passed, so no Pool / cat_features argument is given here.
scoring_frame = df[features]
NewDataPrediction = model.predict(scoring_frame)

In [None]:
import catboost
import numpy as np
import pandas as pd

# Per-row SHAP contributions for every row in `validated`: one column per feature
# plus a final column that is labelled "base_value" below.
shap_values = model.get_feature_importance(
    data=catboost.Pool(validated, cat_features=cat_features),
    type="ShapValues"
)

# Model prediction for the same rows, in the same order.
predicted_values = model.predict(validated)

# Build the SHAP table on validated's own index. The SHAP array is positional, and
# `validated` keeps the original row labels (with gaps where rows lacking a target
# were dropped). A default 0..n-1 index would make any later index-based merge with
# the raw data attach values from the wrong rows.
shap_columns = list(validated.columns) + ["base_value"]
shap_df = pd.DataFrame(shap_values, columns=shap_columns, index=validated.index)

# Positional assignment: predicted_values is an array ordered like shap_values.
shap_df["predicted_value"] = predicted_values

# Display the SHAP table together with the predictions.
shap_df




In [None]:
# Attach the original "Credits" and "Lead Id" to the SHAP table by matching row index
# (inner join, so only labels present on both sides are kept).
lead_columns = Data_F[["Credits", "Lead Id"]]
shap_df1 = pd.merge(shap_df, lead_columns, left_index=True, right_index=True)

In [None]:
# Render the merged SHAP / prediction / lead table (display only).
shap_df1

In [None]:
# Write the scored table to disk without the index column.
scores_csv = "../data/New_Scores_Logo_Design.csv"
shap_df1.to_csv(scores_csv, index=False)

In [None]:
# Persist the fitted model with joblib, then confirm on stdout.
model_path = "../model/logo_design.pkl"
joblib.dump(model, model_path)

print("Model saved successfully.")

In [None]:
# Save the filtered dataset
#df_results.to_csv("Scores_Actual_vs_Predicted.csv", index=False)