<a href="https://colab.research.google.com/github/smitparmar1807/Emoji_Prediction_for_Text/blob/main/Emoji_Prediction_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install necessary packages (no torchvision)
# torch provides the tensors and the Dataset base class used in the cells below.
!pip install torch
# transformers supplies DistilBERT and Trainer; scikit-learn supplies LabelEncoder.
# NOTE(review): `datasets` is installed but not imported in the visible cells — confirm it is needed.
!pip install transformers datasets scikit-learn


# Step 2: Create Dataset (write CSV inside Colab)
# Ten (text, emoji) training pairs in CSV form; the first line is the header.
csv_data = """text,label
I love you,❤️
I am very happy,😄
Feeling sad today,😢
Let's party tonight,🎉
I'm so hungry,🍔
Feeling sleepy,😴
Good morning everyone,🌞
It’s raining outside,🌧️
Congratulations on your achievement,🏆
I feel sick,🤒
"""

# Save CSV
# The data contains emojis and a curly apostrophe, so the encoding is pinned to
# UTF-8 instead of relying on the platform default (which may be unable to
# encode them). UTF-8 is also what pandas.read_csv assumes when reading back.
with open('emoji_dataset.csv', 'w', encoding='utf-8') as f:
    f.write(csv_data)

# Step 3: Imports
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

# Step 4: Load Dataset
# Read the CSV written above into a DataFrame and echo it for inspection.
df = pd.read_csv('emoji_dataset.csv')
print("Sample Dataset:\n", df)

# Step 5: Encode Labels (Emojis → Numbers)
# fit() learns the distinct emoji classes; transform() maps each row's emoji
# to its integer class index.
le = LabelEncoder()
le.fit(df['label'])
df['label_encoded'] = le.transform(df['label'])

# Step 6: Tokenization
# Pretrained tokenizer matching the DistilBERT checkpoint used for the model.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

class EmojiDataset(Dataset):
    """Torch dataset pairing tokenized texts with their integer labels.

    The texts are tokenized once, at construction time, with the module-level
    ``tokenizer`` (truncation and padding enabled).
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and keep ``labels`` for index-based lookup."""
        self.labels = labels
        self.encodings = tokenizer(texts, padding=True, truncation=True)

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        # One tensor per encoding field, sliced to the requested example.
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Build the training dataset from the raw texts and their encoded labels.
dataset = EmojiDataset(df['text'].tolist(), df['label_encoded'].tolist())

# Step 7: Model Setup
# Record the class-index <-> emoji mapping in the model config so that it is
# saved together with the model; otherwise predictions from a reloaded model
# can only be decoded while the in-memory LabelEncoder `le` still exists.
# LabelEncoder assigns each class its position in `le.classes_`.
id2label = {idx: str(emoji) for idx, emoji in enumerate(le.classes_)}
label2id = {emoji: idx for idx, emoji in id2label.items()}
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id,
)

# Step 8: Training Arguments (Updated)
# Output/logging locations first, then the schedule, then checkpointing.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none",  # Disable WandB logging
)


# Step 9: Trainer
# Only a training set is supplied; no evaluation dataset is configured.
trainer = Trainer(args=training_args, model=model, train_dataset=dataset)

# Step 10: Train the Model
trainer.train()

# Step 11: Save Model and Tokenizer
# Both are written to the same directory so they can be reloaded together.
model.save_pretrained('./emoji_model')
tokenizer.save_pretrained('./emoji_model')


Sample Dataset:
                                   text label
0                           I love you    ❤️
1                      I am very happy     😄
2                    Feeling sad today     😢
3                  Let's party tonight     🎉
4                        I'm so hungry     🍔
5                       Feeling sleepy     😴
6                Good morning everyone     🌞
7                 It’s raining outside    🌧️
8  Congratulations on your achievement     🏆
9                          I feel sick     🤒


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,2.229
20,1.9455


('./emoji_model/tokenizer_config.json',
 './emoji_model/special_tokens_map.json',
 './emoji_model/vocab.txt',
 './emoji_model/added_tokens.json',
 './emoji_model/tokenizer.json')

In [None]:
# Load model and tokenizer
# Both are restored from the directory written at the end of training.
tokenizer = DistilBertTokenizerFast.from_pretrained('./emoji_model')
model = DistilBertForSequenceClassification.from_pretrained('./emoji_model')

# Predict function
def predict_emoji(text):
    """Return the emoji predicted for ``text``.

    Tokenizes ``text`` with the module-level ``tokenizer``, runs it through the
    module-level ``model`` and decodes the highest-scoring class index with the
    module-level label encoder ``le``.

    Args:
        text: Input sentence to classify.

    Returns:
        The emoji label of the top-scoring class.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Keep the inputs on the same device as the model so the forward pass
    # also works when the model has been moved to a GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1)
    # .cpu() is a no-op for CPU tensors and required before .numpy() otherwise.
    emoji = le.inverse_transform(prediction.cpu().numpy())[0]
    return emoji

# Test Predictions
# Print the predicted emoji for a few sample sentences, one per line.
for sample_text in (
    "I am feeling super happy today!",
    "I'm so tired.",
    "Congratulations!",
):
    print(predict_emoji(sample_text))


😄
🍔
🏆
