In [0]:
%python
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
# Log the library versions in use so a run can be reproduced later
for library in (pd, sklearn):
    print(library.__version__)


In [1]:
%python
# Read the labeled dataset; fields are tab-separated even though the file
# carries a .csv extension
labeled_file_uri = "file:///team5/data/LabeledFile.csv"
data = pd.read_csv(labeled_file_uri, sep="\t")

# Preview the leading rows as a quick sanity check of the parse
preview = data.head()
print(preview)



In [2]:
%python
# Columns treated as categorical; each one is replaced by integer codes below
categorical_cols = ['N_OBJET_ASS', 'marque', 'carrosserie', 'energie', 'usage', 'gouvernorat', 
                    'activite', 'delegation', 'civilite', 'sexe', 'centre', 'direction_regionale', 
                    'type_vehicule', 'Type_renouvellement_police', 'fractionnement', 
                    'nombre_fractions', 'IsToutRisque']

# Cast each column to the 'category' dtype and keep only its integer codes.
# pandas gives missing values the code -1, the same sentinel used for the
# remaining NaNs further down.
# NOTE(review): running this cell a second time re-encodes the already encoded
# integers (an existing -1 becomes a regular category), so reload `data`
# before re-running.
for col in categorical_cols:
    data[col] = data[col].astype('category').cat.codes

# Map +/-inf to NaN, then fill every NaN with -1. `np` comes from the
# notebook's import cell. The frame is rebound rather than mutated with
# inplace=True so the cleaning reads as a single chain.
# NOTE(review): the fill covers the whole frame, including the last column
# that is later used as the target -- confirm the labels have no missing values.
data = data.replace([np.inf, -np.inf], np.nan).fillna(-1)

In [3]:
%python

# Separate predictors from the target: every column but the last goes into X,
# the last column alone goes into Y
predictors = data.iloc[:, :-1]
target = data.iloc[:, -1]
X = predictors.values
Y = target.values
# Print the number of null entries per column
print(data.isna().sum())


In [4]:
%python
# Hold out 20% of the rows for testing, stratifying on Y and fixing the seed
# so the partition is repeatable
split_options = {"test_size": 0.2, "random_state": 42, "stratify": Y}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

# Show the shapes of the training and testing feature matrices
(X_train.shape, X_test.shape)

In [5]:
%python
# Configure a 100-tree random forest with a fixed seed
forest_options = dict(n_estimators=100, random_state=42)
rf_model = RandomForestClassifier(**forest_options)

# Fit the forest on the training partition
rf_model.fit(X_train, Y_train)


In [6]:
%python
# Predict a label for every row of the held-out test features
Y_pred = rf_model.predict(X=X_test)


In [7]:
%python
# Score the test predictions with F1, precision and recall, each computed
# with the "weighted" averaging mode
f1, precision, recall = (
    scorer(Y_test, Y_pred, average="weighted")
    for scorer in (f1_score, precision_score, recall_score)
)

# Report the three metrics, one per line
print(
    f"F1 Score: {f1}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    sep="\n",
)
