In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# Load the dataset
file_path = 'data.csv'
data = pd.read_csv(file_path)

# Preprocess the data: Text feature and Category label
X = data['Text']
y = data['Category']

# Convert the text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X_transformed = vectorizer.fit_transform(X)

# Encode the labels (Trafficking = 1, Not Trafficking = 0)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_encoded, test_size=0.2, random_state=42)

# Train an XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print(f"Accuracy: {accuracy * 100:.2f}%")
# print("\nClassification Report:\n", report)

# Function to predict if a statement is trafficking-related
def predict_trafficking(statement):
    statement_transformed = vectorizer.transform([statement])
    prediction = model.predict(statement_transformed)
    prediction_label = label_encoder.inverse_transform(prediction)
    return prediction_label[0]

# Example usage:
statement = "Can I get some weed, maybe an ounce?"
print(f"Prediction for the statement: '{statement}' is: {predict_trafficking(statement)}")


Accuracy: 90.00%
Prediction for the statement: 'Can I get some weed, maybe an ounce?' is: Trafficking


Parameters: { "use_label_encoder" } are not used.

