In [4]:
import pandas as pd

test_df = pd.read_csv('ML-Engineer_Task/task_test.csv', index_col=0)
test_df = test_df.drop(test_df.columns[0], axis=1)
test_df

Unnamed: 0,ID,UserID,Origin,Destination,Time,Income,Comment,Created_at,Label
0,1008520,1003983,631,1068,76,3450000,قابل اعتماد و حرفهای\n,2024-04-13 03:18:20.374006,0
1,1009643,1003594,611,862,16,560000,عالی\n,2024-04-16 02:53:36.484247,0
2,1008644,1003679,1018,618,40,1140000,ضعیف\n,2024-04-13 09:43:47.217599,0
3,1009012,1004158,602,842,18,750000,خوب\n,2024-04-14 09:29:33.465792,0
4,1009925,1003757,589,602,33,760000,افتضاح\n,2024-04-16 15:59:35.316854,0
...,...,...,...,...,...,...,...,...,...
1495,1009905,1003920,617,612,39,790000,همکاران راننده همیشه به دقت مسیر را بررسی میک...,2024-04-16 14:51:39.327021,0
1496,1009734,1003813,842,839,7,230000,این راننده از مهارت رانندگی بالا برخوردار بود\n,2024-04-16 07:51:33.164525,0
1497,1009466,1004636,846,594,3,220000,تجربه لذتبخش بود\n,2024-04-15 11:05:10.520287,0
1498,1009589,1004038,603,593,28,800000,خیلی بد\n,2024-04-15 17:23:21.632577,0


In [5]:
import pandas as pd

def add_end_at_to_dataframe(df):
    """
    Adds an 'End_at' column to the dataframe by calculating the end time 
    based on the 'Created_at' time and 'Time' duration in minutes.
    """
    df['Created_at'] = pd.to_datetime(df['Created_at'])
    df['End_at'] = df['Created_at'] + pd.to_timedelta(df['Time'], unit='m')
    return df

def add_time_details(df, column):
    """
    Adds detailed time information columns to the dataframe based on the specified datetime column.
    Columns added: day of the week, hour, and minute.
    """
    df[column] = pd.to_datetime(df[column])
    df[f"Day_Of_Week_{column}"] = df[column].dt.dayofweek
    df[f"Hour_{column}"] = df[column].dt.hour
    df[f"Minute_{column}"] = df[column].dt.minute
    return df

def convert_day_number_to_persian(day_number):
    """
    Converts a day number (0-6) to its corresponding Persian name.
    """
    persian_days = ["دوشنبه", "سه شنبه", "چهارشنبه", "پنج شنبه", "جمعه", "شنبه", "یکشنبه"]
    return persian_days[day_number]

def is_weekend(day):
    """
    Checks if the given Persian day name corresponds to a weekend (Thursday or Friday).
    """
    return day in ["پنج شنبه", "جمعه"]

def calculate_average_traffic_quality(start_hour, end_hour, traffic_quality_index):
    """
    Calculates the average traffic quality index between the start and end hours.
    """
    if start_hour == end_hour:
        return traffic_quality_index[start_hour]
    else:
        if start_hour > 20 and end_hour < 6:
            hours = list(range(start_hour, 24)) + list(range(0, end_hour + 1))
        else:
            hours = list(range(start_hour, end_hour + 1))
        traffic_qualities = [traffic_quality_index[hour] for hour in hours]
        return sum(traffic_qualities) / len(traffic_qualities)

# Predefined traffic quality index for each hour of the day
traffic_quality_index = {
    0: 4,  1: 3,  2: 3,  3: 3,  4: 3,  5: 3,  6: 5,  7: 21,
    8: 40, 9: 31, 10: 40, 11: 40, 12: 40, 13: 42, 14: 43, 15: 45,
    16: 54, 17: 65, 18: 76, 19: 72, 20: 60, 21: 45, 22: 21, 23: 9,
    24: 4
}

# Apply functions to the dataframe
test_df = add_end_at_to_dataframe(test_df)
test_df = add_time_details(test_df, 'Created_at')
test_df = add_time_details(test_df, 'End_at')
test_df['Day_Of_Week_Created_at'] = test_df['Day_Of_Week_Created_at'].apply(convert_day_number_to_persian)
test_df['Day_Of_Week_End_at'] = test_df['Day_Of_Week_End_at'].apply(convert_day_number_to_persian)
test_df['Is_Weekend_Created_at'] = test_df['Day_Of_Week_Created_at'].apply(is_weekend)
test_df['Is_Weekend_End_at'] = test_df['Day_Of_Week_End_at'].apply(is_weekend)
test_df['Average_Traffic_Quality'] = test_df.apply(
    lambda x: calculate_average_traffic_quality(x['Hour_Created_at'], x['Hour_End_at'], traffic_quality_index), 
    axis=1
)

# Rename and reorder columns
test_df = test_df[[
    "ID", "UserID", "Origin", "Destination", "Time", "Income", "Comment", 
    "Hour_Created_at", "Minute_Created_at", "Hour_End_at", "Minute_End_at", 
    "Day_Of_Week_End_at", "Is_Weekend_End_at", "Average_Traffic_Quality", "Label"
]]
test_df.rename(columns={"Day_Of_Week_End_at": "Day_Of_Week", "Is_Weekend_End_at": "Is_Weekend"}, inplace=True)
test_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.rename(columns={"Day_Of_Week_End_at": "Day_Of_Week", "Is_Weekend_End_at": "Is_Weekend"}, inplace=True)


Unnamed: 0,ID,UserID,Origin,Destination,Time,Income,Comment,Hour_Created_at,Minute_Created_at,Hour_End_at,Minute_End_at,Day_Of_Week,Is_Weekend,Average_Traffic_Quality,Label
0,1008520,1003983,631,1068,76,3450000,قابل اعتماد و حرفهای\n,3,18,4,34,شنبه,False,3.0,0
1,1009643,1003594,611,862,16,560000,عالی\n,2,53,3,9,سه شنبه,False,3.0,0
2,1008644,1003679,1018,618,40,1140000,ضعیف\n,9,43,10,23,شنبه,False,35.5,0
3,1009012,1004158,602,842,18,750000,خوب\n,9,29,9,47,یکشنبه,False,31.0,0
4,1009925,1003757,589,602,33,760000,افتضاح\n,15,59,16,32,سه شنبه,False,49.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1009905,1003920,617,612,39,790000,همکاران راننده همیشه به دقت مسیر را بررسی میک...,14,51,15,30,سه شنبه,False,44.0,0
1496,1009734,1003813,842,839,7,230000,این راننده از مهارت رانندگی بالا برخوردار بود\n,7,51,7,58,سه شنبه,False,21.0,0
1497,1009466,1004636,846,594,3,220000,تجربه لذتبخش بود\n,11,5,11,8,دوشنبه,False,40.0,0
1498,1009589,1004038,603,593,28,800000,خیلی بد\n,17,23,17,51,دوشنبه,False,65.0,0


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased-sentiment-snappfood")
model = AutoModelForSequenceClassification.from_pretrained("HooshvareLab/bert-fa-base-uncased-sentiment-snappfood")

# Define a function to calculate sentiment score
def get_sentiment_score(comment):
    if comment.strip() == '':
        return 0.5  # Neutral score for empty comments
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    scores = outputs.logits.detach().numpy()
    probabilities = np.exp(scores) / (1 + np.exp(scores))
    sentiment_score = probabilities[0][1]  # Probability of negative sentiment
    return float(sentiment_score)

# Apply the sentiment score function to the comments
test_df['Sentiment_Score'] = test_df['Comment'].apply(get_sentiment_score)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Sentiment_Score'] = test_df['Comment'].apply(get_sentiment_score)


In [7]:
test_df.to_csv('ML-Engineer_Task/task_test_with_sentiment.csv')

In [7]:
import pandas as pd

day_of_week_one_hot = {
    "شنبه": [0, 0, 0, 0, 0, 0, 1],
    "یکشنبه": [0, 0, 0, 0, 0, 1, 0],
    "دوشنبه": [0, 0, 0, 0, 1, 0, 0],
    "سه شنبه": [0, 0, 0, 1, 0, 0, 0],
    "چهارشنبه": [0, 0, 1, 0, 0, 0, 0],
    "پنج شنبه": [0, 1, 0, 0, 0, 0, 0],
    "جمعه": [1, 0, 0, 0, 0, 0, 0]
}

test_df = pd.read_csv('ML-Engineer_Task/task_test_with_sentiment.csv', index_col=0)
test_df.drop(columns="Comment", inplace=True)
test_df["is_origin_and_destination_same"] = test_df["Origin"] == test_df["Destination"]

# Adding new columns for each day of the week
days = ["شنبه", "یکشنبه", "دوشنبه", "سه شنبه", "چهارشنبه", "پنج شنبه", "جمعه"]
for i, day in enumerate(days):
    test_df[f'Day_{i}'] = test_df['Day_Of_Week'].apply(lambda x: day_of_week_one_hot[x][i])

# Convert boolean columns to integers
test_df["is_origin_and_destination_same"] = test_df["is_origin_and_destination_same"].astype(int)
test_df["Is_Weekend"] = test_df["Is_Weekend"].astype(int)

test_df

Unnamed: 0,ID,UserID,Origin,Destination,Time,Income,Hour_Created_at,Minute_Created_at,Hour_End_at,Minute_End_at,...,Label,Sentiment_Score,is_origin_and_destination_same,Day_0,Day_1,Day_2,Day_3,Day_4,Day_5,Day_6
0,1008520,1003983,631,1068,76,3450000,3,18,4,34,...,0,0.058138,0,0,0,0,0,0,0,1
1,1009643,1003594,611,862,16,560000,2,53,3,9,...,0,0.059922,0,0,0,0,1,0,0,0
2,1008644,1003679,1018,618,40,1140000,9,43,10,23,...,0,0.550547,0,0,0,0,0,0,0,1
3,1009012,1004158,602,842,18,750000,9,29,9,47,...,0,0.080259,0,0,0,0,0,0,1,0
4,1009925,1003757,589,602,33,760000,15,59,16,32,...,0,0.912539,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1009905,1003920,617,612,39,790000,14,51,15,30,...,0,0.123307,0,0,0,0,1,0,0,0
1496,1009734,1003813,842,839,7,230000,7,51,7,58,...,0,0.087159,0,0,0,0,1,0,0,0
1497,1009466,1004636,846,594,3,220000,11,5,11,8,...,0,0.059502,0,0,0,0,0,1,0,0
1498,1009589,1004038,603,593,28,800000,17,23,17,51,...,0,0.932965,0,0,0,0,0,1,0,0


In [8]:
X = test_df.drop(columns=["ID", "Label", "Day_Of_Week"])
y = test_df["Label"]

In [9]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

def report_metrics(y_true, y_pred_prob):
    # Binarize the predicted probabilities to get the predicted labels
    threshold = 0.5
    y_pred = (y_pred_prob >= threshold).astype(int)
    
    # Calculate various metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred_prob)
    conf_matrix = confusion_matrix(y_true, y_pred)
    
    # Create a results dictionary
    metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC-AUC': roc_auc,
        'Confusion Matrix': conf_matrix
    }
    
    # Print the results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    
    return metrics

# Load the best model
best_model = xgb.XGBClassifier()
best_model.load_model('best_xgb_model.json')

# Example usage with your best model
y_pred_prob = best_model.predict_proba(X)[:, 1]
metrics = report_metrics(y, y_pred_prob)

Accuracy: 0.9353
Precision: 0.0886
Recall: 0.2188
F1 Score: 0.1261
ROC-AUC: 0.7573
Confusion Matrix:
[[1396   72]
 [  25    7]]
