In [0]:
%python
# Third-party imports shared by the rest of the notebook.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Absolute import: a notebook cell has no parent package, so the relative form
# `from .preprocessing import ...` fails with ImportError.
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [1]:
%python
# Report which scikit-learn version this interpreter session has loaded.
import sklearn

sklearn_version = sklearn.__version__
print(sklearn_version)


In [2]:
%sh
# Show the version of the python3 executable found on the shell's PATH.
python3 --version



In [3]:
%sh
# List every entry under /usr/bin whose name starts with "python".
ls /usr/bin/python*


In [4]:
%sh
# Upgrade scikit-learn to the newest release pip can resolve.
# NOTE(review): no version is pinned, so results may differ between runs, and
# `pip` on the shell PATH may belong to a different Python than the notebook
# interpreter -- confirm, and consider pinning an explicit version.
pip install scikit-learn --upgrade

In [5]:
%python
# Print the scikit-learn version again, after the upgrade cell.
# NOTE: Python caches imported modules, so if sklearn was already imported in
# this interpreter session the previously loaded version is reported until the
# interpreter is restarted.
import sklearn

sklearn_version = sklearn.__version__
print(sklearn_version)


In [6]:
# Read the labeled dataset into a DataFrame; the file is parsed as tab-separated
file_path = 'file:///team5/data/LabeledFile.csv'  # Replace with your actual file path
labeled_data = pd.read_csv(file_path, sep='\t')

# Preview the top rows as a quick sanity check on the load
print("First few rows of the dataset before processing:")
preview_rows = labeled_data.head()
print(preview_rows)





In [7]:
%python
# Keep a single row per N_SOUSCRIP value: rows are ordered by N_SOUSCRIP and
# year ascending, then Risky descending, and the first row of each N_SOUSCRIP
# group is the one retained.
sort_keys = ['N_SOUSCRIP', 'year', 'Risky']
sort_ascending = [True, True, False]
labeled_data = (
    labeled_data
    .sort_values(by=sort_keys, ascending=sort_ascending)
    .drop_duplicates(subset=['N_SOUSCRIP'], keep='first')
)

In [8]:
# Feature columns grouped by type; both lists are reused by later cells.
numeric_cols = [
    "puissance",
    "age_objet_assuree",
    "valeur_venale",
    "valeur_neuve",
    "Charge_utile",
    "anciennete",
    "classe",
    "age_client",
]
categorical_cols = ["usage", "activite", "delegation", "civilite"]

# Discard every row that is missing a value in any numeric or categorical feature
required_cols = numeric_cols + categorical_cols
labeled_data = labeled_data.dropna(subset=required_cols)

In [9]:
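# # NOTE: every line of this cell is commented out, so nothing here executes and
# # `predefined_mappings` is not defined by it. The mappings are kept for reference only.
# # Predefined mappings for categorical columns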
# predefined_mappings = {
#     # "usage": {
#     #     "VP": 0, "u1": 1, "moto": 2, "taxi": 3, "U2": 4, "engin": 5, "autre": 6,
#     #     "louage": 7, "transport_rural": 8, "taxi_collectif": 9
#     # },
#     # "activite": {
#     #     "EDUCATION_FORMATION": 0, "PROFESSIONS_MEDICALES": 1, "EMPLOYE": 2, "RETRAITE": 3, 
#     #     "ACTIVITES_COMMERCIALES": 4, "AGRICULTURE": 5, "RESIDENT_A_L'ETRANGER": 6, "ARTISAN": 7, 
#     #     "CORPS_ACTIFS": 8, "INGENIEUR": 9, "CHAUFFEUR": 10, "PARAMEDICAL": 11, "OUVRIER": 12,
#     #     "TAXI_LOUAGE_TRASPORT_RURAL": 13, "ARCHITECTURE_BTP_IMMOBILIER": 14, "TECHNICIEN": 15,
#     #     "GERANT_DIRIGEANT": 16, "PROFESSIONNEL_CONSULTANT_EXPERT": 17, "METIERS_LEGAUX": 18,
#     #     "INFORMATIQUE": 19, "DIRECTEUR": 20, "TOURISME": 21, "AUTO_ECOLE": 22,
#     #     "ACTIVITES_SPORTIVES": 23, "ACTIVITES_ARTISTIQUES": 24, "TRANSPORT_AEREEN": 25, "ETAT": 26,
#     #     "TRANSPORT": 27, "ACTIVITES_FINACIAIRES_ET_BANCAIRES": 28, "JOURNALISME": 29, "DIPLOMATIE": 30,
#     #     "ASSOCIATIONS_ONG": 31, "SANS_PROFESSION": 32, "ACTIVITES_INDUSTRIELLES": 33
#     # },

#     # "civilite": {
#     #     "Mr": 0, "Mme": 1, "Entreprise": 2, "mult_CT": 3, "Org": 4, "Couple": 5,
#     #     "Etablissement": 6
#     # }
# }

# # Encode categorical columns using predefined mappings
# for col, mapping in predefined_mappings.items():
#     if col in labeled_data.columns:
#         # Replace using mapping and fill unmatched values with -1
#         labeled_data[col] = labeled_data[col].map(mapping).fillna(-1).astype(int)

# # Display the first few rows after encoding
# print("First few rows of the dataset after encoding:")
# print(labeled_data.head())


In [10]:
%python
# Standardize the numeric feature columns of labeled_data in place.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# that happens later in the notebook, so test rows influence the scaling
# statistics -- confirm this is acceptable.
scaler = StandardScaler()
scaler.fit(labeled_data[numeric_cols])
labeled_data[numeric_cols] = scaler.transform(labeled_data[numeric_cols])

In [11]:
# Assemble the model inputs: categorical features first, then numeric ones.
# .copy() keeps later edits to X from touching labeled_data.
X = labeled_data[categorical_cols + numeric_cols].copy()
y = labeled_data['Risky']  # Replace 'Risky' with the actual target column name

# No categorical mapping is applied here. `predefined_mappings` is not defined
# anywhere in the notebook (its definition is commented out), so referencing it
# raises NameError; the categorical columns are integer-encoded after the split.

# The numeric columns are not re-scaled here: they were already standardized on
# labeled_data with `scaler`. A second StandardScaler fitted on standardized
# values would leave X unchanged and replace `scaler` with an object that
# cannot transform raw inputs.

# Encode the target column as integers
label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(y)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1400)

# Display the shapes of the train and test sets to confirm the split
print(f"Training set shape: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Test set shape: X_test: {X_test.shape}, y_test: {y_test.shape}")


In [12]:
from sklearn.preprocessing import LabelEncoder

# Work on copies so the column assignments below never write into a view of X
X_train = X_train.copy()
X_test = X_test.copy()

# Integer-encode each categorical column. Every encoder is fitted on the
# training split only, and the resulting label -> code mapping is reused for the
# test split so that a category receives the same code in both. Fitting a
# separate encoder on the test split would assign codes independently and make
# the two splits inconsistent. Labels that occur only in the test split map to -1.
category_encoders = {}
for col in categorical_cols:
    if col not in X_train.columns:
        continue
    label_encoder = LabelEncoder()
    # Convert to string before encoding so every value is compared as text
    X_train[col] = label_encoder.fit_transform(X_train[col].astype(str))
    # classes_ is ordered by code, so its position is the code of each label
    code_by_label = {label: code for code, label in enumerate(label_encoder.classes_)}
    X_test[col] = X_test[col].astype(str).map(code_by_label).fillna(-1).astype(int)
    category_encoders[col] = label_encoder  # kept so the same encoding can be reapplied

# Train the RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200, max_depth=14, random_state=1400)
rf.fit(X_train, y_train)

print("Random Forest model trained successfully!")

# Predict the held-out test set
y_pred = rf.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.4f}")


In [13]:
%python
# Predict class labels for X_test with the trained random forest `rf`;
# later metric cells read the result from `y_pred`.
y_pred = rf.predict(X_test)

In [15]:
%python
# Pair each feature name with the importance reported by the fitted forest.
# The name order (categorical first, then numeric) mirrors how X was assembled.
feature_names = categorical_cols + numeric_cols
feature_importances = rf.feature_importances_

# Text listing, one feature per line
print("\nFeature Importances:")
for feat, score in zip(feature_names, feature_importances):
    print(f"Feature: {feat}, Importance: {score:.4f}")

# Horizontal bar chart of the same values
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_names, feature_importances, color='skyblue')
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importances")
plt.show()


In [16]:
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# F1 score of the test-set predictions
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

# Draw the score as a single horizontal bar on a fixed 0-1 axis
fig, ax = plt.subplots(figsize=(6, 4))
ax.barh(['F1 Score'], [f1], color='blue')
ax.set_xlim(0, 1)
ax.set_xlabel('F1 Score')
ax.set_title('F1 Score of the Model')
plt.show()



In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import seaborn as sns

# Score the test-set predictions one metric at a time, printing each as soon as
# it is computed.
metric_scores = {}
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    metric_scores[metric_label] = metric_fn(y_test, y_pred)
    print(f"{metric_label}: {metric_scores[metric_label]:.4f}")

# Expose the individual values under their usual names
accuracy = metric_scores["Accuracy"]
precision = metric_scores["Precision"]
recall = metric_scores["Recall"]
f1 = metric_scores["F1 Score"]

# Confusion matrix: raw counts first, then a heatmap
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

class_labels = ["Negative", "Positive"]
fig_cm, ax_cm = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax_cm,
)
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")
ax_cm.set_title("Confusion Matrix")
plt.show()

# ROC curve and AUC, built from the probabilities in column 1 of predict_proba
positive_scores = rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

print(f"ROC AUC: {roc_auc:.4f}")

fig_roc, ax_roc = plt.subplots(figsize=(6, 5))
ax_roc.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
ax_roc.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate (Recall)")
ax_roc.set_title("Receiver Operating Characteristic (ROC)")
ax_roc.legend(loc="lower right")
plt.show()


In [18]:
from sklearn.metrics import classification_report

# Build the text classification report for the test-set predictions and print
# it under a heading.
class_names = ["Negative", "Positive"]
class_report = classification_report(y_test, y_pred, target_names=class_names)
print("\nClassification Report:", class_report, sep="\n")


In [19]:
%python
import os
import pickle

# Persist the trained random forest with pickle.
# The relative path means the file is written to the interpreter's current
# working directory.
# NOTE: unpickling executes arbitrary code -- only load this file from a trusted source.
model_path = "random_forest_final_model.pkl"
with open(model_path, "wb") as file:
    pickle.dump(rf, file)

# Report the location that was actually written; the earlier message named a
# file:///team5/data/ path that this cell never writes to.
print(f"Model saved as '{os.path.abspath(model_path)}'")