In [None]:
import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is passed, so library deprecation and pandas warnings are hidden too.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [None]:
# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

In [None]:
# Load the raw CSV export into the working frame `Data`.
Data = pd.read_csv(filepath_or_buffer='../data/artificial_grass_data.csv')

In [None]:
# Render the freshly loaded frame to inspect its columns and values.
Data

In [None]:
# Independent snapshot of the frame as loaded, taken before `Data` is rebound
# by later cells; it is read again when Credits / Lead Id are merged back.
Data_F = Data.copy(deep=True)

In [None]:
# Render the snapshot copy of the loaded frame.
Data_F

In [None]:
import pandas as pd
import re

# Cleaned question labels: maps a regex for each question, as it appears in the
# raw 'Que/Ans' text, to the column name its answer is stored under.
question_labels = {
    "What type of property is this for\\?": "What type of property is this for?",
    "How big is the area where you want artificial grass\\?": "How big is the area where you want artificial grass?",
    "What kind of preparation needs to be done before installing the turf\\?": "What kind of preparation needs to be done before installing the turf?",
    "What kind of artificial grass do you need\\?": "What kind of artificial grass do you need?",
    "When would you like the work to be done\\?": "When would you like the work to be done?",
    "How likely are you to hire a professional\\?": "How likely are you to hire a professional?",
    "Additional details[:]?": "Additional Details"
}

# Precompile full question matching pattern. The alternation is wrapped in a
# single capture group, so re.split() keeps the matched questions and returns
# [preamble, question, answer, question, answer, ...].
question_regex = "(" + "|".join(question_labels.keys()) + ")"
compiled_pattern = re.compile(question_regex, flags=re.IGNORECASE)

# Clean answer text
def clean_answer(text):
    """Strip leading non-word characters and surrounding whitespace from an answer.

    Non-string values (e.g. None or NaN) are returned unchanged.
    """
    return re.sub(r'^[^\w\d]*', '', text).strip() if isinstance(text, str) else text

# Extractor function
def extract_answers(row_text):
    """Parse one 'Que/Ans' cell into a Series with one entry per question label.

    Parameters
    ----------
    row_text : str or missing value
        Concatenated question/answer text. Questions are recognised
        case-insensitively using the patterns in ``question_labels``.

    Returns
    -------
    pandas.Series
        Indexed by the values of ``question_labels``. A label holds the cleaned
        answer of its first non-blank occurrence, or None when the question is
        absent or its answer is blank.
    """
    extracted = {label: None for label in question_labels.values()}

    # Missing cells arrive as NaN/None: nothing to split, every answer stays None.
    if not isinstance(row_text, str):
        return pd.Series(extracted)

    parts = compiled_pattern.split(row_text)

    # Pair by position instead of filtering blanks first: parts[0] is whatever
    # precedes the first question, odd slots are questions, even slots are the
    # text that follows each question. Text before the first question or a blank
    # answer therefore cannot shift the question/answer alignment.
    for question_text, answer_text in zip(parts[1::2], parts[2::2]):
        for q_pattern, q_label in question_labels.items():
            if re.fullmatch(q_pattern, question_text, flags=re.IGNORECASE):
                answer = clean_answer(answer_text)
                # Keep the first non-blank answer seen for a label.
                if answer and not extracted[q_label]:
                    extracted[q_label] = answer
                break

    return pd.Series(extracted)

# Apply to dataframe: extract_answers returns a Series per row, so .apply
# yields a DataFrame with one column per question label.
result_df = Data['Que/Ans'].apply(extract_answers)

# View result
result_df


In [None]:
# Append the parsed answer columns to the right of the raw columns (rows are
# aligned on index). Re-running this cell appends the same columns again.
Data = pd.concat(objs=[Data, result_df], axis="columns")

In [None]:
# Inspect the frame after the parsed answer columns were appended.
Data

In [None]:
# Columns left out of the sample-for-prediction CSV (the 'Credits' target is
# among them; 'Lead Id' and 'Location' are kept).
columns_to_drop = [
    'S.no', 'Response', 'Country', 'Additional Details',
    'Initial', 'Name', 'Time', 'Badge 1', 'Additional', 'Credits',
    'Badge 2', 'Badge 3', 'Badge 4', 'Category', 'Que/Ans',
]

# errors="ignore" skips any listed column that the frame does not contain.
Datak = Data.drop(columns=columns_to_drop, errors="ignore")

Datak.to_csv("../data/sample_data_for_prediction_artificial_grass.csv", index=False)

In [None]:
# Columns removed from the working frame before modelling; 'Credits' is not
# listed here, so it stays available as the target.
columns_to_drop = [
    'Location', 'S.no', 'Lead Id', 'Response', 'Country', 'Additional Details',
    'Initial', 'Name', 'Time', 'Badge 1', 'Additional',
    'Badge 2', 'Badge 3', 'Badge 4', 'Category', 'Que/Ans',
]

# errors="ignore" skips any listed column that the frame does not contain.
Data = Data.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# Inspect the frame after the listed columns were dropped.
Data

In [None]:
# Working frame for modelling (same object as `Data`, not a copy).
df = Data

# Column the regressor is trained to predict.
target = "Credits"

# Model inputs: four attribute columns taken from the CSV, followed by the
# questionnaire answers parsed out of 'Que/Ans'.
attribute_columns = ['Urgent', 'High', 'Verified', 'Frequent']
question_columns = [
    'What type of property is this for?',
    'How big is the area where you want artificial grass?',
    'What kind of preparation needs to be done before installing the turf?',
    'What kind of artificial grass do you need?',
    'When would you like the work to be done?',
    'How likely are you to hire a professional?',
]
features = attribute_columns + question_columns


In [None]:
# Keep only the rows whose target value is present.
df = df.dropna(axis=0, how="any", subset=[target])

In [None]:

# Split the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
feature_frame = df[features]
cat_features = list(feature_frame.select_dtypes(include=['object']).columns)
num_features = list(feature_frame.select_dtypes(include=['int64', 'float64']).columns)

print(cat_features)
print(num_features)

In [None]:

# Impute missing values: categorical gaps become the literal "Unknown",
# numerical gaps take the per-column median of the current frame.
categorical_filled = df[cat_features].fillna("Unknown")
numeric_medians = df[num_features].median()

df[cat_features] = categorical_filled
df[num_features] = df[num_features].fillna(numeric_medians)

print(df)

In [None]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a CatBoost Pool, declaring the categorical columns.
pool_kwargs = {"cat_features": cat_features}
train_pool = Pool(X_train, label=y_train, **pool_kwargs)
test_pool = Pool(X_test, label=y_test, **pool_kwargs)


In [None]:

# Hyper-parameters for the CatBoost regressor, collected in one place.
catboost_params = {
    "iterations": 500,          # upper bound on the number of boosting rounds
    "learning_rate": 0.05,
    "depth": 8,
    "loss_function": 'RMSE',
    "cat_features": cat_features,
    "verbose": 100,             # log every 100 iterations
}

model = CatBoostRegressor(**catboost_params)

In [None]:
# Train model.
# Early stopping (and CatBoost's best-iteration selection) must not be driven
# by the test split, otherwise the RMSE / R2 reported on X_test afterwards are
# optimistically biased. Carve a validation set out of the training rows and
# leave X_test / y_test untouched for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
fit_pool = Pool(X_fit, label=y_fit, cat_features=cat_features)
val_pool = Pool(X_val, label=y_val, cat_features=cat_features)

model.fit(fit_pool, eval_set=val_pool, early_stopping_rounds=50)

In [None]:
# Make predictions for the held-out test features.
y_pred = model.predict(X_test)

In [None]:

# Score the test-set predictions: RMSE is the square root of the mean squared
# error; R2 is the coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the test-set metrics computed in the previous cell.
print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

In [None]:
# Feature matrix for every row of df (training and test rows together).
validated = df.loc[:, features]

In [None]:
# Predict for every row of df (training and test rows alike). A plain
# DataFrame is passed rather than a Pool, so cat_features is not specified here.
NewDataPrediction = model.predict(df[features])

In [None]:
import catboost
import numpy as np
import pandas as pd

# Get SHAP values for every row of `validated`. The result has one column per
# feature plus a final column holding the base (expected) value.
shap_values = model.get_feature_importance(
    data=catboost.Pool(validated, cat_features=cat_features),
    type="ShapValues"
)

# Get predicted values
predicted_values = model.predict(validated)

# Convert SHAP values to DataFrame (include base value).
# Reuse validated's index: rows with a missing target were dropped earlier, so
# a fresh 0..n-1 index would no longer match the row labels of the loaded data,
# and the index-based merge with Data_F would attach the wrong Credits / Lead Id.
shap_columns = list(validated.columns) + ["base_value"]
shap_df = pd.DataFrame(shap_values, columns=shap_columns, index=validated.index)

# Add predicted values column (positional: one prediction per row of validated)
shap_df["predicted_value"] = predicted_values

# Display the SHAP values together with the prediction for each row
shap_df




In [None]:
# Attach the actual Credits and Lead Id from the snapshot of the loaded data.
# The join is on index labels (inner join by default), so rows only line up if
# shap_df's index carries the same row labels as Data_F.
# NOTE(review): confirm shap_df is indexed by the original row labels and not
# by a fresh 0..n-1 range; the two differ once rows have been dropped.
shap_df1 = shap_df.merge(Data_F[["Credits","Lead Id"]], left_index=True, right_index=True)

In [None]:
# Inspect the SHAP table with Credits and Lead Id attached.
shap_df1

In [None]:
# Write the SHAP / prediction table to disk without the index column.
shap_df1.to_csv(
    path_or_buf="../data/New_Scores_Artificial_Grass_Installation.csv",
    index=False,
)

In [None]:
# Persist the fitted regressor as a joblib pickle.
joblib.dump(value=model, filename="../../shared/models/artificial_grass.pkl")

print("Model saved successfully.")

In [None]:
# Save the filtered dataset
#df_results.to_csv("Scores_Actual_vs_Predicted.csv", index=False)