# Hosting vLLM on server

```
python -m vllm.entrypoints.openai.api_server --model HuggingFaceTB/SmolLM3-3B --dtype bfloat16 --gpu-memory-utilization 0.85 --max-model-len 2048 --port 9999 --tensor-parallel-size 1 --enable-prefix-caching --trust-remote-code

```

* HuggingFaceTB/SmolLM3-3B
* microsoft/Phi-3.5-mini-instruct
* tiiuae/Falcon3-3B-Instruct
* Qwen/Qwen2.5-Omni-7B
* google/gemma-3n-E4B-it
* meta-llama/Llama-3.2-3B-Instruct
* meta-llama/Llama-3.1-8B-Instruct
* Qwen/Qwen2.5-7B-Instruct

In [1]:
import os
from tqdm import tqdm
import pandas as pd

from src.config import *
from src.genai import GenAIClassifier
from src.baseline import train_df
from src.utils import (calculate_language_wise_metrics, calculate_metrics, calculate_overall_metrics, \
create_comparison_table, print_results, generate_detailed_report, get_random_examples, get_misclassified_examples)

✓ All random seeds set to 42
✓ All random seeds set to 42
training files: ['train_en.csv', 'train_es.csv', 'train_it.csv']
Total training samples: 2988
CLASS DISTRIBUTION

Overall:
  Class 0 (NOT_RECLAMATORY): 2560 (85.7%)
  Class 1 (RECLAMATORY): 428 (14.3%)
  Total: 2988

Per Language:
  EN: Class 0=938, Class 1=88, Total=1026
  ES: Class 0=743, Class 1=133, Total=876
  IT: Class 0=879, Class 1=207, Total=1086




# Zero-Shot Learning: Chain-of-Thought Prompting

In [2]:
# Model under evaluation. Uncomment exactly one alternative to switch models.
# NOTE(review): presumably this has to match the `--model` passed to the vLLM
# server shown at the top of the notebook — confirm against src.genai.
model_name = "HuggingFaceTB/SmolLM3-3B"
# model_name = "microsoft/Phi-3.5-mini-instruct"
# model_name = "tiiuae/Falcon3-3B-Instruct"
# model_name = "Qwen/Qwen2.5-Omni-7B"
# model_name = "google/gemma-3n-E4B-it"
# model_name = "meta-llama/Llama-3.2-3B-Instruct"
# model_name = "meta-llama/Llama-3.1-8B-Instruct"
# model_name = "Qwen/Qwen2.5-7B-Instruct"

# Models recorded as running out of memory (OOM).
# NOTE(review): Qwen/Qwen2.5-7B-Instruct is also listed as a runnable candidate
# above and has rows in the results tables below, so its entry here looks stale.
# model_name = "Qwen/Qwen2.5-7B-Instruct"
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"
genai_classifier = GenAIClassifier(model_name=model_name)

### Prompt

In [3]:
# Show the zero-shot prompt that is built for the first training tweet.
print(
    genai_classifier.create_prompt_zero_shot(
        train_df.loc[0, "text"],
        train_df.loc[0, "lang"],
    )
)


            You are a linguistic expert specializing in LGBTQ+ discourse analysis.
            
            TASK: Classify whether LGBTQ+ terminology uses RECLAMATORY INTENT
            
            Strictly give final classification answer only, don't give any explanation.
            
            DEFINITIONS:
            - RECLAMATORY: Term used for identity affirmation, pride, empowerment, community solidarity
            - NOT_RECLAMATORY: Derogatory use, neutral mention, or unclear intent
            
            Tweet (Language: English): "I've never heard anyone use the word "faggot" and not sound like an insecure 13 year old. "
            
            ANALYSIS (Chain of Thought):
            
            1. TERM IDENTIFICATION: What LGBTQ+ terms appear?
            2. TONE & MARKERS: What emotional/linguistic indicators are present?
            3. CONTEXT: Is it community affirmation or external discourse?
            4. INTENT: Does it express reclamatory intent?
           

### Sample Query

In [4]:
# Sanity check: classify only the first training tweet with the zero-shot prompt.
result = genai_classifier.classify(
    train_df.loc[0, "text"],
    train_df.loc[0, "lang"],
    use_few_shot=False,
)
result

{'tweet': 'I\'ve never heard anyone use the word "faggot" and not sound like an insecure 13 year old. ',
 'language': 'en',
 'classification': 'NOT_RECLAMATORY',
 'classification_label': 0,
 'raw_response': '<think>\n\n</think>\nNOT_RECLAMATORY',
 'model': 'HuggingFaceTB/SmolLM3-3B',
 'prompt_type': 'zero-shot'}

### Running on all training samples

In [None]:
# Classify every training tweet with the zero-shot prompt and keep only the
# numeric label from each result.
predicted_labels = []
for tweet, lang in tqdm(
    zip(train_df["text"], train_df["lang"]),
    total=len(train_df),
    desc="Classifying",
):
    result = genai_classifier.classify(tweet, lang, use_few_shot=False)
    predicted_labels.append(result["classification_label"])


In [5]:
# CSV that accumulates one prediction column per model for the zero-shot CoT
# prompt (see the write cell below).
# NOTE(review): results_root is presumably provided by `from src.config import *`.
result_file = "train_cot_prompt.csv"
results_write_path = os.path.join(results_root, result_file)

In [None]:
# Load the existing per-model results table if there is one, otherwise start a
# new table from the gold labels; then add/overwrite this model's column.
if os.path.exists(results_write_path):
    result_df = pd.read_csv(results_write_path)
else:
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of train_df (avoids SettingWithCopyWarning).
    result_df = train_df[["id", "lang", "label"]].copy()

# Predictions are attached by position, so they must line up one-to-one with
# the rows of the table (an existing CSV is assumed to be in train_df order).
if len(predicted_labels) != len(result_df):
    raise ValueError(
        f"Got {len(predicted_labels)} predictions for {len(result_df)} rows in "
        f"{results_write_path}; re-run the classification cell before saving."
    )

result_df[model_name] = predicted_labels

result_df.to_csv(results_write_path, index=False)

### Analysis

In [6]:
# Reload the saved predictions. Every column other than the id/lang/label
# metadata holds one model's predicted labels.
result_df = pd.read_csv(results_write_path)
model_columns = [col for col in result_df.columns if col not in ("id", "lang", "label")]
# Display names are simply the column names themselves.
model_names = dict(zip(model_columns, model_columns))

In [7]:
df = result_df

# Overall and per-language metrics for every model column, printed together.
overall_metrics = calculate_overall_metrics(df, model_columns, model_names)
language_metrics = calculate_language_wise_metrics(df, model_columns, model_names)
print_results(overall_metrics, language_metrics)

# The first row of the comparison table is taken as the best model; map its
# display name back to the prediction column for the detailed report.
ranking = create_comparison_table(overall_metrics)
best_model = ranking["model"].iloc[0]
best_model_col = [col for col, shown_as in model_names.items() if shown_as == best_model][0]
generate_detailed_report(df, best_model_col, model_names)

OVERALL METRICS (All Languages)
 accuracy  precision   recall       f1                            model  n_samples
 0.589829   0.548119 0.589829 0.528832         HuggingFaceTB/SmolLM3-3B       2988
 0.608895   0.553459 0.608895 0.490063  microsoft/Phi-3.5-mini-instruct       2988
 0.554587   0.551341 0.554587 0.280033       tiiuae/Falcon3-3B-Instruct       2988
 0.539800   0.519664 0.539800 0.463701             Qwen/Qwen2.5-Omni-7B       2988
 0.537883   0.518870 0.537883 0.469295           google/gemma-3n-E4B-it       2988
 0.571904   0.538389 0.571904 0.516257 meta-llama/Llama-3.1-8B-Instruct       2988
 0.600805   0.550280 0.600805 0.507662         Qwen/Qwen2.5-7B-Instruct       2988


LANGUAGE-WISE METRICS

EN Language:
--------------------------------------------------------------------------------
                           model  accuracy  precision   recall       f1  n_samples
        HuggingFaceTB/SmolLM3-3B  0.672890   0.594967 0.672890 0.612180       1026
 microsoft/Phi-3.5-

# Few-Shot Learning: Chain-of-Thought Prompting

In [8]:
# The top-ranked zero-shot model (best_model from the ranking above) supplies
# the predictions used below to pick misclassified few-shot examples.
negative_samples_model_name = best_model
negative_samples_model_name

'HuggingFaceTB/SmolLM3-3B'

In [9]:
# Attach the zero-shot CoT predictions of the model chosen for negative
# sampling to the training data. The column kept here must be the one later
# used as the prediction column for misclassified examples
# (negative_samples_model_name), not the model currently being evaluated;
# otherwise the lookup fails whenever the two differ.
# NOTE: results_write_path must still point at the zero-shot CoT results file here.
cot_df = pd.read_csv(results_write_path)
cot_df = cot_df[["id", negative_samples_model_name]]
merged_df = pd.merge(train_df, cot_df, on="id", how="inner")

In [10]:
# Number of few-shot examples per class; the log output below reports the
# counts per language.
N_EXAMPLES_PER_CLASS = 2
# Column of merged_df holding the zero-shot predictions to compare with labels.
PREDICTION_COLUMN = negative_samples_model_name

# Example pool 1: examples returned by get_random_examples.
random_examples = get_random_examples(
    df=merged_df,
    n_examples_per_class=N_EXAMPLES_PER_CLASS,
)

# Example pool 2: examples selected via the zero-shot prediction column.
misclassified_examples = get_misclassified_examples(
    df=merged_df,
    prediction_col=PREDICTION_COLUMN,
    n_examples_per_class=N_EXAMPLES_PER_CLASS,
)

EN: 2 class 0 + 2 class 1 = 4 total
ES: 2 class 0 + 2 class 1 = 4 total
IT: 2 class 0 + 2 class 1 = 4 total
EN: 2 misclassified class 0 + 2 misclassified class 1 = 4 total
ES: 2 misclassified class 0 + 2 misclassified class 1 = 4 total
IT: 2 misclassified class 0 + 2 misclassified class 1 = 4 total


### CoT + Random Examples

In [11]:
# Few-shot classifier for the same model, given the examples returned by
# get_random_examples.
classifier_random = GenAIClassifier(model_name=model_name, examples_dict=random_examples)

In [12]:
# Show the few-shot prompt that is built for the first training tweet.
print(
    classifier_random.create_prompt_few_shot(
        train_df.loc[0, "text"],
        train_df.loc[0, "lang"],
    )
)


            You are a linguistic expert specializing in LGBTQ+ discourse analysis.
            
            TASK: Classify whether LGBTQ+ terminology uses RECLAMATORY INTENT
            
            Strictly give final classification answer only, don't give any explanation.
            
            DEFINITIONS:
            - RECLAMATORY: Term used for identity affirmation, pride, empowerment, community solidarity
            - NOT_RECLAMATORY: Derogatory use, neutral mention, or unclear intent
            
            EXAMPLES:
            The following are some examples of LGBTQ+ tweets in English:
            
            Tweet: "Maybe you're really just joking, but I don't get why other people on this site like to mock those who identify as other genders. The "I identify as a car" jokes are as blatant and tired as nigger jokes. And for the most part, it's not your caricature coal miner from the rust belt. It's so-called liberal progressives who believe in things like gay rights/mar

In [13]:
# Sanity check: few-shot classification of the first training tweet using the
# random example pool.
result = classifier_random.classify(
    q_tweet=train_df.loc[0, "text"],
    q_language=train_df.loc[0, "lang"],
    use_few_shot=True,
)
result

{'tweet': 'I\'ve never heard anyone use the word "faggot" and not sound like an insecure 13 year old. ',
 'language': 'en',
 'classification': 'RECLAMATORY',
 'classification_label': 1,
 'raw_response': '<think>\n\n</think>\nClassification: RECLAMATORY',
 'model': 'HuggingFaceTB/SmolLM3-3B',
 'prompt_type': 'few-shot'}

In [None]:
# Classify every training tweet with the few-shot prompt (random examples) and
# keep only the numeric label from each result.
predicted_labels = []
for tweet, lang in tqdm(
    zip(train_df["text"], train_df["lang"]),
    total=len(train_df),
    desc="Classifying",
):
    result = classifier_random.classify(tweet, lang, use_few_shot=True)
    predicted_labels.append(result["classification_label"])

In [14]:
# Results file for the few-shot run with random examples; the file name records
# the number of examples per class. This rebinds results_write_path, which
# earlier pointed at the zero-shot CoT results file.
results_write_path = os.path.join(results_root, f"train_fewshot_prompt_random_{N_EXAMPLES_PER_CLASS}.csv")

In [None]:
# Load the existing per-model results table if there is one, otherwise start a
# new table from the gold labels; then add/overwrite this model's column.
if os.path.exists(results_write_path):
    result_df = pd.read_csv(results_write_path)
else:
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of train_df (avoids SettingWithCopyWarning).
    result_df = train_df[["id", "lang", "label"]].copy()

# Predictions are attached by position, so they must line up one-to-one with
# the rows of the table (an existing CSV is assumed to be in train_df order).
if len(predicted_labels) != len(result_df):
    raise ValueError(
        f"Got {len(predicted_labels)} predictions for {len(result_df)} rows in "
        f"{results_write_path}; re-run the classification cell before saving."
    )

result_df[model_name] = predicted_labels

result_df.to_csv(results_write_path, index=False)

In [15]:
# Read the random-example few-shot results from the same location the write
# cell uses (under results_root) instead of a hard-coded relative path, so the
# reader and writer cannot drift apart.
random_df = pd.read_csv(
    os.path.join(results_root, f"train_fewshot_prompt_random_{N_EXAMPLES_PER_CLASS}.csv")
)
# NOTE: missing values (e.g. predictions that were never produced) are filled
# with 0 and are therefore scored as class 0.
random_df = random_df.fillna(0)

# Every column other than the id/lang/label metadata is a model's predictions;
# display names are the column names themselves.
model_columns = [col for col in random_df.columns if col not in ("id", "lang", "label")]
model_names = {col: col for col in model_columns}

In [16]:
df = random_df

# Overall and per-language metrics for every model column, printed together.
overall_metrics = calculate_overall_metrics(df, model_columns, model_names)
language_metrics = calculate_language_wise_metrics(df, model_columns, model_names)
print_results(overall_metrics, language_metrics)

# The first row of the comparison table is taken as the best model; map its
# display name back to the prediction column for the detailed report.
ranking = create_comparison_table(overall_metrics)
best_model = ranking["model"].iloc[0]
best_model_col = [col for col, shown_as in model_names.items() if shown_as == best_model][0]
generate_detailed_report(df, best_model_col, model_names)

OVERALL METRICS (All Languages)
 accuracy  precision   recall       f1                            model  n_samples
 0.571326   0.536609 0.571326 0.423819 meta-llama/Llama-3.1-8B-Instruct       2988
 0.598009   0.548177 0.598009 0.489374         Qwen/Qwen2.5-7B-Instruct       2988
 0.544166   0.564862 0.544166 0.228566         HuggingFaceTB/SmolLM3-3B       2988


LANGUAGE-WISE METRICS

EN Language:
--------------------------------------------------------------------------------
                           model  accuracy  precision   recall       f1  n_samples
meta-llama/Llama-3.1-8B-Instruct  0.656244   0.549387 0.656244 0.435194       1026
        Qwen/Qwen2.5-7B-Instruct  0.728072   0.587301 0.728072 0.584289       1026
        HuggingFaceTB/SmolLM3-3B  0.621366   0.551500 0.621366 0.309072       1026

ES Language:
--------------------------------------------------------------------------------
                           model  accuracy  precision   recall       f1  n_samples
meta-ll

### Misclassified Zero-Shot Examples + CoT

In [17]:
# Few-shot classifier for the same model, given the examples returned by
# get_misclassified_examples.
classifier_misclass = GenAIClassifier(model_name=model_name, examples_dict=misclassified_examples)

In [18]:
# Sanity check: few-shot classification of the first training tweet using the
# misclassified example pool.
result = classifier_misclass.classify(
    q_tweet=train_df.loc[0, "text"],
    q_language=train_df.loc[0, "lang"],
    use_few_shot=True,
)
result

{'tweet': 'I\'ve never heard anyone use the word "faggot" and not sound like an insecure 13 year old. ',
 'language': 'en',
 'classification': 'RECLAMATORY',
 'classification_label': 1,
 'raw_response': '<think>\n\n</think>\nClassification: RECLAMATORY',
 'model': 'HuggingFaceTB/SmolLM3-3B',
 'prompt_type': 'few-shot'}

In [None]:
# Classify every training tweet with the few-shot prompt (misclassified
# examples) and keep only the numeric label from each result.
predicted_labels = []
for tweet, lang in tqdm(
    zip(train_df["text"], train_df["lang"]),
    total=len(train_df),
    desc="Classifying",
):
    result = classifier_misclass.classify(tweet, lang, use_few_shot=True)
    predicted_labels.append(result["classification_label"])

In [19]:
# Results file for the few-shot run with misclassified examples; the file name
# records the number of examples per class. This rebinds results_write_path
# again, replacing the random-example results path.
results_write_path = os.path.join(results_root, f"train_fewshot_prompt_misclassy_{N_EXAMPLES_PER_CLASS}.csv")

In [None]:
# Load the existing per-model results table if there is one, otherwise start a
# new table from the gold labels; then add/overwrite this model's column.
if os.path.exists(results_write_path):
    result_df = pd.read_csv(results_write_path)
else:
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of train_df (avoids SettingWithCopyWarning).
    result_df = train_df[["id", "lang", "label"]].copy()

# Predictions are attached by position, so they must line up one-to-one with
# the rows of the table (an existing CSV is assumed to be in train_df order).
if len(predicted_labels) != len(result_df):
    raise ValueError(
        f"Got {len(predicted_labels)} predictions for {len(result_df)} rows in "
        f"{results_write_path}; re-run the classification cell before saving."
    )

result_df[model_name] = predicted_labels

result_df.to_csv(results_write_path, index=False)

In [20]:
# Read the misclassified-example few-shot results from the same location the
# write cell uses (under results_root) instead of a hard-coded relative path,
# so the reader and writer cannot drift apart.
misclass_df = pd.read_csv(
    os.path.join(results_root, f"train_fewshot_prompt_misclassy_{N_EXAMPLES_PER_CLASS}.csv")
)
# NOTE: missing values (e.g. predictions that were never produced) are filled
# with 0 and are therefore scored as class 0.
misclass_df = misclass_df.fillna(0)

# Every column other than the id/lang/label metadata is a model's predictions;
# display names are the column names themselves.
model_columns = [col for col in misclass_df.columns if col not in ("id", "lang", "label")]
model_names = {col: col for col in model_columns}

In [21]:
df = misclass_df

# Overall and per-language metrics for every model column, printed together.
overall_metrics = calculate_overall_metrics(df, model_columns, model_names)
language_metrics = calculate_language_wise_metrics(df, model_columns, model_names)
print_results(overall_metrics, language_metrics)

# The first row of the comparison table is taken as the best model; map its
# display name back to the prediction column for the detailed report.
ranking = create_comparison_table(overall_metrics)
best_model = ranking["model"].iloc[0]
best_model_col = [col for col, shown_as in model_names.items() if shown_as == best_model][0]
generate_detailed_report(df, best_model_col, model_names)

OVERALL METRICS (All Languages)
 accuracy  precision   recall       f1                            model  n_samples
 0.551639   0.526700 0.551639 0.409892 meta-llama/Llama-3.1-8B-Instruct       2988
 0.622795   0.560868 0.622795 0.515542         Qwen/Qwen2.5-7B-Instruct       2988
 0.523091   0.544956 0.523091 0.198224         HuggingFaceTB/SmolLM3-3B       2988


LANGUAGE-WISE METRICS

EN Language:
--------------------------------------------------------------------------------
                           model  accuracy  precision   recall       f1  n_samples
meta-llama/Llama-3.1-8B-Instruct  0.654124   0.548378 0.654124 0.448546       1026
        Qwen/Qwen2.5-7B-Instruct  0.745675   0.611808 0.745675 0.630595       1026
        HuggingFaceTB/SmolLM3-3B  0.560780   0.538394 0.560780 0.222398       1026

ES Language:
--------------------------------------------------------------------------------
                           model  accuracy  precision   recall       f1  n_samples
meta-ll