In [0]:
// Read the tab-separated labeled dataset into a Spark DataFrame.
// The first row is treated as the header and column types are inferred.
val labeledDf = spark.read
  .options(Map(
    "header"      -> "true",
    "inferSchema" -> "true",
    "delimiter"   -> "\t"
  ))
  .csv("file:///team5/data/LabeledFile.csv")

In [1]:
%python
# Print the installed XGBoost version.
import xgboost
print(xgboost.__version__)

In [2]:
%python
# `pickle` is imported in this cell so it runs on a fresh kernel; it previously
# relied on an import performed by a later cell of the notebook.
import pickle

# Print the pickle format version string reported by the pickle module.
print(pickle.format_version)

In [3]:
%sh
# Show the scikit-learn entry from the list of installed packages.
pip freeze | grep scikit-learn


In [4]:
%sh
# Write the full list of installed packages to a requirements file.
pip freeze > /team5/data/requirements.txt


In [5]:
%sh
# List the contents of the models directory.
ls /team5/data/models

In [6]:
%python
import pandas as pd

# Location of the tab-separated labeled dataset
file_path = 'file:///team5/data/LabeledFile.csv'

# Read it into a pandas DataFrame
labeled_data = pd.read_csv(file_path, sep='\t')

# Preview the first rows as a sanity check
preview = labeled_data.head()
print(preview)


In [7]:
%python
# Select the rows labeled as risky.
# The label is matched case-insensitively and ignoring surrounding whitespace:
# the training cell of this notebook compares the same column against "No"
# (capitalised), so an exact match on lowercase 'yes' selects no rows when the
# labels are capitalised consistently.
risky_mask = labeled_data['Risky'].astype(str).str.strip().str.lower() == 'yes'
risky_data = labeled_data[risky_mask]
print(risky_data)

In [8]:
%python
# Keep a single row per N_SOUSCRIP value: order rows by N_SOUSCRIP and year
# ascending, with 'Risky' descending inside each (N_SOUSCRIP, year) pair, then
# retain the first row of every N_SOUSCRIP.
sort_columns = ['N_SOUSCRIP', 'year', 'Risky']
sort_ascending = [True, True, False]
labeled_data = (
    labeled_data
    .sort_values(by=sort_columns, ascending=sort_ascending)
    .drop_duplicates(subset=['N_SOUSCRIP'], keep='first')
)

In [9]:
%python
# Count the distinct values held by the 'delegation' column
unique_delegations = labeled_data['delegation'].nunique()
print("Number of unique delegations: {}".format(unique_delegations))


In [10]:
%python
# Preview the first rows of labeled_data as it stands at this point in the notebook.
print(labeled_data.head())


In [11]:
%python
###################
# Train and evaluate a baseline XGBoost classifier on the selected features.
#
# Label encoding used from here on: y == 1 when Risky == "No" (reported as
# "No (Non-Risky)"), y == 0 otherwise (reported as "Yes (Risky)"). The class of
# interest, Risky, is therefore label 0.

# Import required libraries
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, fbeta_score
import matplotlib.pyplot as plt



# Features to retain (the last entry, "Risky", is the target column)
features_to_use = [
    "puissance", "age_objet_assuree", "valeur_venale", "valeur_neuve",
    "Charge_utile", "usage", "anciennete", "activite", "classe",
    "delegation", "age_client", "civilite", "Risky"
]

# Filter only the required features. The explicit copy makes the selection an
# independent frame, so the column assignments below do not write into a slice
# of the previous frame (pandas SettingWithCopyWarning).
labeled_data = labeled_data[features_to_use].copy()

# Define categorical and numerical columns
categorical_cols = ["usage", "activite", "delegation", "civilite"]
numerical_cols = ["puissance", "age_objet_assuree", "valeur_venale", "valeur_neuve", 
                  "Charge_utile", "anciennete","classe", "age_client"]

# Ensure all numerical columns are numeric; unparseable values become NaN
for col in numerical_cols:
    labeled_data[col] = pd.to_numeric(labeled_data[col], errors='coerce')

# Ensure categorical columns are properly set to 'category' dtype
for col in categorical_cols:
    labeled_data[col] = labeled_data[col].astype('category')

# Handle missing values: drop rows with a missing value in any feature column.
# NOTE(review): rows with a missing "Risky" value are not dropped here; the
# comparison below maps them to 0 (Risky). Confirm that this is intended.
labeled_data = labeled_data.dropna(subset=categorical_cols + numerical_cols)

# Define Features (X) and Target (y)
X = labeled_data[categorical_cols + numerical_cols]
y = (labeled_data["Risky"] == "No").astype(int)  # Binary: 1 for 'No', 0 for anything else

# Split the dataset (80/20, stratified on the label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# scale_pos_weight = (#label-0 rows) / (#label-1 rows) in the training split.
# It scales the weight of label-1 ("No") samples.
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Initialize XGBoost Model
xgb = XGBClassifier(
    n_estimators=100,
    random_state=42,
    tree_method='hist',
    enable_categorical=True,
    scale_pos_weight=scale_pos_weight
)

# Train the model
xgb.fit(X_train, y_train)

# Predictions: hard labels, and the predicted probability of label 1 ("No")
y_pred = xgb.predict(X_test)
y_pred_prob = xgb.predict_proba(X_test)[:, 1]

# Evaluation metrics
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=["Yes (Risky)", "No (Non-Risky)"]))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_prob):.4f}")


# Adjust threshold.
# NOTE(review): y_pred_prob is the probability of label 1 ("No"), so a
# threshold below 0.5 assigns more rows to Non-Risky and fewer to Risky.
# Confirm this is the intended direction.
threshold = 0.4
y_pred_adjusted = (y_pred_prob >= threshold).astype(int)
print("\nClassification Report with Adjusted Threshold:")
print(classification_report(y_test, y_pred_adjusted, target_names=["Yes (Risky)", "No (Non-Risky)"]))
# pos_label=0 scores the Risky class. fbeta_score defaults to pos_label=1,
# which under the encoding above is the Non-Risky class.
f2_score = fbeta_score(y_test, y_pred_adjusted, beta=2, pos_label=0)
print(f"F2-Score: {f2_score:.4f}")


###################

# Output recorded from an earlier run ("final"). The F2 figure in it was
# computed with fbeta_score's default pos_label=1, i.e. on the Non-Risky class.
# Classification Report:
#                 precision    recall  f1-score   support

#   Yes (Risky)       0.19      0.63      0.29       852
# No (Non-Risky)       0.98      0.86      0.92     17271

#       accuracy                           0.85     18123
#      macro avg       0.58      0.75      0.60     18123
#   weighted avg       0.94      0.85      0.89     18123

# ROC-AUC Score: 0.8332

# Classification Report with Adjusted Threshold:
#                 precision    recall  f1-score   support

#   Yes (Risky)       0.23      0.54      0.32       852
# No (Non-Risky)       0.98      0.91      0.94     17271

#       accuracy                           0.89     18123
#      macro avg       0.60      0.72      0.63     18123
#   weighted avg       0.94      0.89      0.91     18123

# F2-Score: 0.9231



#just like supposed


In [12]:
%python
from sklearn.metrics import precision_recall_curve, fbeta_score

# Precision/recall pairs over every threshold of the label-1 probability
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

# F2 of the hard predictions for the Risky class. The training cell encodes
# Risky as label 0 (y == 1 means Risky == "No"), so pos_label=0 is required;
# the default pos_label=1 would score the Non-Risky class instead.
f2_score = fbeta_score(y_test, y_pred, beta=2, pos_label=0)

print(f"F2-Score: {f2_score:.4f}")
# Outputs recorded from earlier runs (obtained without pos_label=0):
#F2-Score: 0.2554,most features
# F2-Score: 0.2888,selected ones only



#F2-Score: 0.8854


In [13]:
%python
############################################Final###########################
##################run,is kept
# Retrain with a different class weighting.
#
# The weight for label 1 is n / (2 * n_label1), computed on the training labels
# only so that held-out test labels do not influence training. Under the
# encoding y == 1 for Risky == "No", label 1 is the Non-Risky class, which the
# recorded report below shows to be the majority. The weight is therefore
# below 1 and scales the majority class down.
class_weights = {0: 1, 1: len(y_train) / (2 * y_train.sum())}

# Train the model with class weights
xgb = XGBClassifier(
    n_estimators=100,
    random_state=42,
    tree_method='hist',
    enable_categorical=True,
    scale_pos_weight=class_weights[1]  # Weight applied to label-1 ("No") samples
)
xgb.fit(X_train, y_train)

# Evaluate: hard labels, and the predicted probability of label 1 ("No")
y_pred = xgb.predict(X_test)
y_pred_prob = xgb.predict_proba(X_test)[:, 1]
print("Classification Report with Cost-Sensitive Learning:")
print(classification_report(y_test, y_pred, target_names=["Yes (Risky)", "No (Non-Risky)"]))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_prob):.4f}")

# Output recorded from an earlier run ("final"), when the weight was computed
# from the full label vector rather than the training labels:
# Classification Report with Cost-Sensitive Learning:
#                 precision    recall  f1-score   support

#   Yes (Risky)       0.37      0.28      0.32       852
# No (Non-Risky)       0.96      0.98      0.97     17271

#       accuracy                           0.94     18123
#      macro avg       0.67      0.63      0.64     18123
#   weighted avg       0.94      0.94      0.94     18123

# ROC-AUC Score: 0.8240

In [14]:
%python
from sklearn.metrics import precision_recall_curve, fbeta_score

# Precision/recall pairs over every threshold of the label-1 probability
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

# F2 of the hard predictions for the Risky class. The training cell encodes
# Risky as label 0 (y == 1 means Risky == "No"), so pos_label=0 is required;
# the default pos_label=1 would score the Non-Risky class instead.
f2_score = fbeta_score(y_test, y_pred, beta=2, pos_label=0)

print(f"F2-Score: {f2_score:.4f}")
# Output recorded from an earlier run (obtained without pos_label=0):
#F2-Score: 0.9742


In [15]:
%python
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Row/column order for the matrix: label 1 first, then label 0,
# i.e. reversed relative to ascending label order.
label_order = [1, 0]  # 1: No (Non-Risky), 0: Yes (Risky)
label_names = ["No (Non-Risky)", "Yes (Risky)"]

# Build the confusion matrix of the current hard predictions
cm = confusion_matrix(y_test, y_pred, labels=label_order)

# Render it with readable class names
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix (Reversed Axes)")
plt.show()


In [16]:
%python
import os
import pickle

# Persist the trained classifier with pickle.
# The file is written to the location the loading cells of this notebook read
# from. Previously it went to a relative path in the working directory while
# the printed message named a different location.
MODEL_PATH = "/team5/data/models/xgboost_final_model.pkl"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

with open(MODEL_PATH, "wb") as file:
    pickle.dump(xgb, file)

# Report the path that was actually written
print(f"Model saved as '{MODEL_PATH}'")

In [17]:
%python
# Another saving method: XGBoost's own JSON model format.
# - The leading comment now uses '#'; '//' is not a Python comment and made
#   the cell a syntax error.
# - The classifier saved is `xgb`; the name `model` is only assigned by a
#   later cell of this notebook.
# - A plain filesystem path is passed instead of a file:// URI.
#   NOTE(review): confirm whether your XGBoost version accepts file:// URIs
#   if the URI form is required.
xgb.save_model("/team5/data/xgboost_model.json")


In [18]:
%python
# Load the pickled model from disk and run it on one hand-written example row.
import pickle

import numpy as np

model_path = "/team5/data/models/xgboost_final_model.pkl"

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file;
# only load model files produced by this project.
try:
    # The context manager closes the file handle even if unpickling fails
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)
except FileNotFoundError as exc:
    # Stop here with a clear message instead of continuing without a model
    raise FileNotFoundError(
        f"Model file not found at '{model_path}'. "
        "Ensure 'xgboost_final_model.pkl' is in the correct directory."
    ) from exc
print("Model type:", type(model))

# NOTE(review): the column order below (classe placed between activite and
# delegation) differs from the order used by the training cell of this notebook
# (categorical_cols + numerical_cols: usage, activite, delegation, civilite,
# puissance, age_objet_assuree, valeur_venale, valeur_neuve, Charge_utile,
# anciennete, classe, age_client). Confirm which order the pickled model
# expects before trusting the prediction.
try:
    example_input = np.array([
        [
            0,      # 'usage': Encoded value for "VP"
            3,      # 'activite': Encoded value for "RETRAITE"
            4,      # 'classe': Numerical or encoded value
            15,     # 'delegation': Encoded index
            1,      # 'civilite': Encoded value for "Mme"
            6,      # 'puissance': Numerical value
            5,      # 'age_objet_assuree': Numerical value
            20000,  # 'valeur_venale': Numerical value
            25000,  # 'valeur_neuve': Numerical value
            1.5,    # 'Charge_utile': Numerical value
            2,      # 'anciennete': Numerical value
            45,     # 'age_client': Numerical value
        ]
    ])

    # Predict
    prediction = model.predict(example_input)
    print("Prediction:", prediction)

except Exception as e:
    # Prediction problems are reported rather than raised
    print(f"Error: {e}")

# Open question from the author: "1 => risky?"
# With the label encoding of the training cell in this notebook (y == 1 when
# Risky == "No"), a prediction of 1 means Non-Risky, assuming this pickle was
# produced by that training code. TODO confirm.

In [19]:
%python
# Load the pickled model from disk and run it on a second hand-written example
# row (higher 'puissance', lower 'valeur_venale', younger client).
import pickle

import numpy as np

model_path = "/team5/data/models/xgboost_final_model.pkl"

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file;
# only load model files produced by this project.
try:
    # The context manager closes the file handle even if unpickling fails
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)
except FileNotFoundError as exc:
    # Stop here with a clear message instead of continuing without a model
    raise FileNotFoundError(
        f"Model file not found at '{model_path}'. "
        "Ensure 'xgboost_final_model.pkl' is in the correct directory."
    ) from exc
print("Model type:", type(model))

# NOTE(review): the column order below (classe placed between activite and
# delegation) differs from the order used by the training cell of this notebook
# (categorical_cols + numerical_cols: usage, activite, delegation, civilite,
# puissance, age_objet_assuree, valeur_venale, valeur_neuve, Charge_utile,
# anciennete, classe, age_client). Confirm which order the pickled model
# expects before trusting the prediction.
try:
    example_input = np.array([
        [
            0,      # 'usage': Encoded value for "VP"
            3,      # 'activite': Encoded value for "RETRAITE"
            4,      # 'classe': Numerical or encoded value
            15,     # 'delegation': Encoded index
            1,      # 'civilite': Encoded value for "Mme"
            7,      # 'puissance': Numerical value
            5,      # 'age_objet_assuree': Numerical value
            2000,   # 'valeur_venale': Numerical value
            25000,  # 'valeur_neuve': Numerical value
            1.5,    # 'Charge_utile': Numerical value
            2,      # 'anciennete': Numerical value
            18,     # 'age_client': Numerical value
        ]
    ])

    # Predict
    prediction = model.predict(example_input)
    print("Prediction:", prediction)

except Exception as e:
    # Prediction problems are reported rather than raised
    print(f"Error: {e}")

In [20]:
%python
# Pair each training column with the importance reported by the fitted model
feature_names = categorical_cols + numerical_cols
feature_importances = xgb.feature_importances_

# Text listing, one line per feature
print("\nFeature Importances:")
for idx in range(min(len(feature_names), len(feature_importances))):
    print(f"Feature: {feature_names[idx]}, Importance: {feature_importances[idx]:.4f}")

# Horizontal bar chart of the same values
plt.figure(figsize=(10, 6))
plt.barh(feature_names, feature_importances, color='skyblue')
plt.title("Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()


In [21]:
%python
################################### def randomsearch
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import logging

# Candidate values for each hyper-parameter. For scale_pos_weight the search
# chooses between no reweighting (1) and the label-0 / label-1 count ratio of
# the training split.
param_grid = dict(
    max_depth=[3, 5, 7, 9],
    min_child_weight=[1, 3, 5],
    gamma=[0, 0.1, 0.2, 0.3],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 100, 200],
    scale_pos_weight=[1, len(y_train[y_train == 0]) / len(y_train[y_train == 1])],
)

# Base estimator for the search.
# NOTE: this rebinds `xgb` to a new, unfitted classifier.
xgb = XGBClassifier(tree_method='hist', enable_categorical=True, random_state=42)

# Search configuration: 20 sampled combinations, 2-fold cross-validation,
# ranked by ROC-AUC, using all available cores.
search_settings = {
    'scoring': 'roc_auc',
    'n_iter': 20,
    'cv': 2,
    'verbose': 1,
    'random_state': 42,
    'n_jobs': -1,
}
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    **search_settings
)




In [22]:
%python
# Draw a small stratified subsample for the hyper-parameter search; the class
# imbalance is kept.
# The subsample is taken from the training split only, so rows of the held-out
# test set never take part in hyper-parameter selection. Previously it was
# drawn from the full X / y, which include the test rows.
###########################for finding params
from sklearn.model_selection import train_test_split

# Stratified sampling: 5% goes to *_train_sub, the remaining 95% to *_test_sub
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(
    X_train, y_train, test_size=0.95, random_state=42, stratify=y_train
)


In [23]:
%python
# Run the randomized search on the subsample and keep the winning combination
best_params = random_search.fit(X_train_sub, y_train_sub).best_params_
print("Best Parameters:", best_params)

# Output recorded from an earlier run:
# Fitting 2 folds for each of 20 candidates, totalling 40 fits
# Best Parameters: {'subsample': 0.8, 'scale_pos_weight': 1, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 1.0}

In [24]:
%python
import logging
# parallel_backend is imported from joblib directly: the re-export under
# sklearn.utils was deprecated in scikit-learn 1.3 and later removed.
from joblib import parallel_backend
################################# Re-run of the best-parameter search; same as the previous cell but with logs
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# Run the search under joblib's threading backend
with parallel_backend('threading'):
    logger.info("Starting RandomizedSearchCV...")
    random_search.fit(X_train_sub, y_train_sub)
    logger.info("Finished RandomizedSearchCV.")

# Best parameters
best_params = random_search.best_params_
logger.info("Best Parameters: %s", best_params)

# Output recorded from an earlier run:
#INFO:root:Starting RandomizedSearchCV...
#Fitting 2 folds for each of 20 candidates, totalling 40 fits
#INFO:root:Finished RandomizedSearchCV.
#INFO:root:Best Parameters: {'subsample': 0.8, 'scale_pos_weight': 65.33240863587022, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 9, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
#yay...



In [25]:
%python
########################################
# Fit a classifier with a fixed, hand-entered hyper-parameter set and evaluate
# it on the held-out test split.
# NOTE(review): these values match neither of the "Best Parameters" outputs
# recorded in the search cells of this notebook; confirm where they come from.
tuned_params = dict(
    subsample=0.8,
    scale_pos_weight=1,
    n_estimators=100,
    min_child_weight=5,
    max_depth=3,
    learning_rate=0.2,
    gamma=0.3,
    colsample_bytree=1.0,
    tree_method='hist',  # Use histogram-based method
    enable_categorical=True,
    random_state=42,
)
xgb_tuned = XGBClassifier(**tuned_params)

xgb_tuned.fit(X_train, y_train)

# Hard labels, and the predicted probability of label 1
y_pred = xgb_tuned.predict(X_test)
y_pred_prob = xgb_tuned.predict_proba(X_test)[:, 1]

report = classification_report(y_test, y_pred, target_names=["Yes (Risky)", "No (Non-Risky)"])
auc = roc_auc_score(y_test, y_pred_prob)
print("Classification Report:")
print(report)
print(f"ROC-AUC Score: {auc:.4f}")

In [26]:
%python
from sklearn.metrics import fbeta_score

# F2 of the hard predictions for the Risky class. The training cell encodes
# Risky as label 0 (y == 1 means Risky == "No"), so pos_label=0 is required;
# the default pos_label=1 would score the Non-Risky class instead.
f2_score = fbeta_score(y_test, y_pred, beta=2, pos_label=0)
print(f"F2-Score: {f2_score:.4f}")

In [27]:
%python
import joblib

# Save the tuned model with joblib. The path is relative to the interpreter's
# working directory.
joblib.dump(xgb_tuned, 'skeyenote-0.9.1/notebook/xgb_model.pkl')

# To load it back (kept commented out). The line previously started with '//',
# which is not a Python comment and made the cell a syntax error.
# xgb_loaded = joblib.load('skeyenote-0.9.1/notebook/xgb_model.pkl')
