In [0]:
%python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np



In [1]:
%python
# Read the labeled dataset (tab-separated) into a DataFrame.
file_path = 'file:///team5/data/LabeledFile.csv'  # Replace with your actual file path
labeled_data = pd.read_csv(file_path, sep='\t')

# Quick sanity check: preview the first rows that were loaded.
preview = labeled_data.head()
print("First few rows of the dataset after dropping columns:")
print(preview)




In [2]:
%python
# Column groups used throughout the rest of the notebook.
numeric_cols = [ "puissance", "age_objet_assuree", 
    "valeur_venale", "valeur_neuve", "Charge_utile", 
    "anciennete", "classe", "age_client"
]
categorical_cols = ["usage", "activite", "delegation", "civilite"]

# Turn +/-inf into NaN FIRST. Doing this after the imputation (as before)
# left the resulting NaNs unfilled, and an inf inside a column also makes that
# column's mean inf/NaN, which would then be used as the fill value.
labeled_data = labeled_data.replace([np.inf, -np.inf], np.nan)

# Fill missing numeric values with the column mean (NaNs are skipped by mean()).
numeric_means = labeled_data[numeric_cols].mean()
labeled_data[numeric_cols] = labeled_data[numeric_cols].fillna(numeric_means)

# Fill missing categorical values with the most frequent value of each column
# (first row of mode() holds the top mode per column).
most_frequent = labeled_data[categorical_cols].mode().iloc[0]
labeled_data[categorical_cols] = labeled_data[categorical_cols].fillna(most_frequent)


In [3]:
%python
# Report how many distinct values each numeric column holds.
for column in numeric_cols:
    # Guard clause: report columns that are absent from the DataFrame and move on.
    if column not in labeled_data.columns:
        print(f"Column {column} not found in DataFrame")
        continue
    distinct_count = labeled_data[column].nunique()
    print(f"Distinct Count for {column}: {distinct_count}")

In [4]:
%python
# Report how many distinct values each categorical column holds.
for column in categorical_cols:
    # Guard clause: report columns that are absent from the DataFrame and move on.
    if column not in labeled_data.columns:
        print(f"Column {column} not found in DataFrame")
        continue
    distinct_count = labeled_data[column].nunique()
    print(f"Distinct Count for {column}: {distinct_count}")

In [5]:
%python
# For every categorical column, count the rows per distinct value and list the
# values from most to least frequent.
for col in categorical_cols:
    print(f"Processing column: {col}")

    rows_per_value = labeled_data.groupby(col).size().reset_index(name="count")
    result = rows_per_value.sort_values(by="count", ascending=False)

    # Show at most the 260 most frequent values of the current column.
    print(result.head(260))
    print("\n" + "="*50 + "\n")  # Visual separator between columns

In [6]:
%python
# For every numeric column, count the rows per distinct value and list the
# values from most to least frequent.
for col in numeric_cols:
    print(f"Processing column: {col}")

    rows_per_value = labeled_data.groupby(col).size().reset_index(name="count")
    result = rows_per_value.sort_values(by="count", ascending=False)

    # Show at most the 260 most frequent values of the current column.
    print(result.head(260))
    print("\n" + "="*50 + "\n")  # Visual separator between columns

In [7]:
%python
 # Encode categorical columns
# One LabelEncoder per categorical column, kept by column name for later reuse.
label_encoders = {name: LabelEncoder() for name in categorical_cols}

# Each column is overwritten in place with its integer codes; from here on the
# fitted encoder is the only way to get back to the original category labels.
for name, encoder in label_encoders.items():
    labeled_data[name] = encoder.fit_transform(labeled_data[name])


In [8]:
%python
# Fixed integer codes for the categorical columns whose categories are known in
# advance. Columns without an entry here (currently "delegation") keep their
# LabelEncoder codes. "classe" is listed in numeric_cols, so it has no mapping.
predefined_mappings = {
    "usage": {
        "VP": 0,
        "u1": 1,
        "moto": 2,
        "taxi": 3,
        "U2": 4,
        "engin": 5,
        "autre": 6,
        "louage": 7,
        "transport_rural": 8,
        "taxi_collectif": 9
    },
    "activite": {
        "EDUCATION_FORMATION": 0,
        "PROFESSIONS_MEDICALES": 1,
        "EMPLOYE": 2,
        "RETRAITE": 3,
        "ACTIVITES_COMMERCIALES": 4,
        "AGRICULTURE": 5,
        "RESIDENT_A_L'ETRANGER": 6,
        "ARTISAN": 7,
        "CORPS_ACTIFS": 8,
        "INGENIEUR": 9,
        "CHAUFFEUR": 10,
        "PARAMEDICAL": 11,
        "OUVRIER": 12,
        "TAXI_LOUAGE_TRASPORT_RURAL": 13,
        "ARCHITECTURE_BTP_IMMOBILIER": 14,
        "TECHNICIEN": 15,
        "GERANT_DIRIGEANT": 16,
        "PROFESSIONNEL_CONSULTANT_EXPERT": 17,
        "METIERS_LEGAUX": 18,
        "INFORMATIQUE": 19,
        "DIRECTEUR": 20,
        "TOURISME": 21,
        "AUTO_ECOLE": 22,
        "ACTIVITES_SPORTIVES": 23,
        "ACTIVITES_ARTISTIQUES": 24,
        "TRANSPORT_AEREEN": 25,
        "ETAT": 26,
        "TRANSPORT": 27,
        "ACTIVITES_FINACIAIRES_ET_BANCAIRES": 28,
        "JOURNALISME": 29,
        "DIPLOMATIE": 30,
        "ASSOCIATIONS_ONG": 31,
        "SANS_PROFESSION": 32,
        "ACTIVITES_INDUSTRIELLES": 33
    },
    "civilite": {
        "Mr": 0,
        "Mme": 1,
        "Entreprise": 2,
        "mult_CT": 3,
        "Org": 4,
        "Couple": 5,
        "Etablissement": 6
    }
    # Add a mapping for "delegation" if necessary
}

# Encode categorical columns using the predefined mappings.
#
# The columns may already hold LabelEncoder integer codes at this point (the
# label-encoding cell overwrites them in place). Mapping those integers through
# string-keyed dictionaries would turn every value into NaN, so such columns are
# first decoded back to their original labels with the fitted encoder.
#
# NOTE: this cell is meant to run once, right after the label-encoding cell.
# Re-running it on its own would try to decode already-mapped codes.
for col in categorical_cols:
    mapping = predefined_mappings.get(col)
    fitted_encoder = label_encoders.get(col)

    if mapping is None:
        # No predefined codes for this column: use label encoding, but only if
        # it has not been label-encoded yet. Re-fitting on the integer codes
        # would replace the string-aware encoder with one that only knows ints.
        if fitted_encoder is None:
            le = LabelEncoder()
            labeled_data[col] = le.fit_transform(labeled_data[col])
            label_encoders[col] = le
        continue

    raw_values = labeled_data[col]
    if fitted_encoder is not None and pd.api.types.is_integer_dtype(raw_values):
        # Recover the original category labels from the LabelEncoder codes.
        raw_values = pd.Series(
            fitted_encoder.inverse_transform(raw_values),
            index=labeled_data.index,
        )

    mapped_values = raw_values.map(mapping)

    # Series.map yields NaN for keys missing from the mapping; fail loudly
    # instead of silently feeding NaN features to the model.
    unmapped = raw_values[mapped_values.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column '{col}' contains values without a predefined mapping: "
            f"{sorted(map(str, unmapped))}"
        )

    labeled_data[col] = mapped_values.astype(int)


In [9]:
%python
# Standardize the numeric columns and write the scaled values back in place.
# NOTE(review): the scaler is fitted on every row of labeled_data, i.e. before
# the train/test split done further down - confirm this is intended.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(labeled_data[numeric_cols])
labeled_data[numeric_cols] = scaled_numeric

In [10]:
%python
# Per-column count of NaN values remaining in the numeric columns.
print(labeled_data[numeric_cols].isna().sum())


In [11]:
%python
# Feature matrix: categorical columns first, then numeric columns.
X = labeled_data[categorical_cols + numeric_cols]

# Target: integer-encode the 'Risky' column (replace 'Risky' with the actual
# target column name if it differs). The encoder is kept to decode predictions.
label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(labeled_data['Risky'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)


In [12]:
%python
# Fit a Random Forest (100 trees, depth capped at 10) on the training split.
# The fixed random_state makes the fitted forest repeatable.
forest_params = dict(n_estimators=100, max_depth=10, random_state=1234)
rf = RandomForestClassifier(**forest_params)
rf.fit(X_train, y_train)

print("Random Forest model trained successfully!")



In [13]:
%python
# Predicted (encoded) class for every row of the held-out test split.
y_pred = rf.predict(X_test)


In [14]:
%python
# Share of test rows whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")


In [15]:
%python
# Importance of each feature as reported by the fitted forest; the names follow
# the same column order that was used to build X.
feature_names = categorical_cols + numeric_cols
feature_importances = rf.feature_importances_

# Text listing of the importances.
print("\nFeature Importances:")
for idx in range(min(len(feature_names), len(feature_importances))):
    name = feature_names[idx]
    importance = feature_importances[idx]
    print(f"Feature: {name}, Importance: {importance:.4f}")

# Horizontal bar chart of the same values.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_names, feature_importances, color='skyblue')
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importances")
plt.show()


In [16]:
%python
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# F1 score of the test-set predictions (scikit-learn's default averaging).
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1:.4f}")

# Single horizontal bar on a fixed 0-1 axis so the score is easy to read off.
plt.figure(figsize=(6, 4))
plt.barh(['F1 Score'], [f1], color='blue')
plt.title('F1 Score of the Model')
plt.xlabel('F1 Score')
plt.xlim(0, 1)
plt.show()



In [17]:
%python
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve,
    auc,
)
import seaborn as sns

# --- Scalar metrics on the test split ---------------------------------------
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.4f}")

recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.4f}")

f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

# --- Confusion matrix --------------------------------------------------------
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Heatmap of the matrix; tick labels assume encoded class 0 is the negative
# class and class 1 the positive class.
class_labels = ["Negative", "Positive"]
fig_cm, ax_cm = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax_cm,
)
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")
ax_cm.set_title("Confusion Matrix")
plt.show()

# --- ROC curve and AUC -------------------------------------------------------
# Column 1 of predict_proba is the probability assigned to encoded class 1.
positive_scores = rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

print(f"ROC AUC: {roc_auc:.4f}")

fig_roc, ax_roc = plt.subplots(figsize=(6, 5))
ax_roc.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc)
# Diagonal reference line: performance of a random classifier.
ax_roc.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate (Recall)")
ax_roc.set_title("Receiver Operating Characteristic (ROC)")
ax_roc.legend(loc="lower right")
plt.show()


In [18]:
%python
# Function to make predictions with new data
def predict_new_data(rf, label_encoders, scaler, label_encoder_y, user_input_values,
                     category_mappings=None):
    """
    Make a prediction for one new record using the trained model.

    The feature layout is read from the fitted objects (``feature_names_in_``)
    instead of being hard-coded, so the record is always encoded, scaled and
    ordered exactly like the training data.

    Parameters:
    rf: Trained Random Forest model (fitted on a DataFrame)
    label_encoders: Dictionary of fitted LabelEncoders keyed by column name
    scaler: Fitted StandardScaler for numeric columns (fitted on a DataFrame)
    label_encoder_y: LabelEncoder for target variable
    user_input_values: Dictionary mapping every feature name to its raw value
    category_mappings: Optional dictionary ``{column: {category: code}}``.
        Columns listed here are encoded with the given mapping; all other
        categorical columns are encoded with ``label_encoders``.

    Returns:
    dict with 'prediction' (original target label) and 'probability'
    (highest class probability, in percent).

    Raises:
    ValueError: if the fitted objects carry no feature names, if a required
        feature is missing from the input, if a categorical value has no code,
        or if a categorical column has neither a mapping nor an encoder.
    """
    if not hasattr(rf, "feature_names_in_") or not hasattr(scaler, "feature_names_in_"):
        raise ValueError(
            "rf and scaler must have been fitted on DataFrames so that "
            "feature_names_in_ is available"
        )

    # Column layout as seen during training.
    feature_cols = list(rf.feature_names_in_)
    numeric_cols = list(scaler.feature_names_in_)
    categorical_cols = [c for c in feature_cols if c not in numeric_cols]

    # Convert input to a one-row DataFrame and make sure nothing is missing.
    input_df = pd.DataFrame([user_input_values])
    required_cols = feature_cols + [c for c in numeric_cols if c not in feature_cols]
    missing = [c for c in required_cols if c not in input_df.columns]
    if missing:
        raise ValueError(f"Missing input values for: {missing}")

    # Encode categorical variables (explicit mapping wins over label encoder).
    mappings = category_mappings or {}
    for col in categorical_cols:
        if col in mappings:
            encoded = input_df[col].map(mappings[col])
            if encoded.isna().any():
                raise ValueError(
                    f"Unknown category {input_df[col].iloc[0]!r} for column '{col}'"
                )
            input_df[col] = encoded.astype(int)
        elif col in label_encoders:
            # LabelEncoder.transform itself raises ValueError on unseen labels.
            input_df[col] = label_encoders[col].transform(input_df[col])
        else:
            raise ValueError(f"No mapping or label encoder available for column '{col}'")

    # Scale numeric variables (cast first so integer inputs become floats cleanly).
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols].astype(float))

    # Make prediction with the columns in training order.
    features = input_df[feature_cols]
    prediction = rf.predict(features)
    prediction_proba = rf.predict_proba(features)

    # Convert prediction back to original label
    prediction_label = label_encoder_y.inverse_transform(prediction)

    return {
        'prediction': prediction_label[0],
        'probability': max(prediction_proba[0]) * 100
    }

# Example usage (after model training). The keys must be the training features:
# the four categorical columns plus the eight numeric columns, in raw units.
user_input_values = {
    'usage': 'engin',
    'activite': 'EMPLOYE',
    # Any category known to the fitted encoder works; the first one is used
    # here only as a placeholder example value.
    'delegation': label_encoders['delegation'].classes_[0],
    'civilite': 'Mr',
    'puissance': 2,
    'age_objet_assuree': 6,
    'valeur_venale': 6,
    'valeur_neuve': 6,
    'Charge_utile': 4, 
    'anciennete': 3,
    'classe': 2,
    'age_client': 5
}

# Make prediction
result = predict_new_data(
    rf,
    label_encoders,
    scaler,
    label_encoder_y,
    user_input_values,
    category_mappings=predefined_mappings,
)

# Print results
print("\nPrediction for new data:")
print(f"Risk Category: {result['prediction']}")
print(f"Confidence: {result['probability']:.2f}%")
