<a href="https://colab.research.google.com/github/ramkumardeepak774/Text_Classification_Flair-vs-Fasttext/blob/main/text_Classification_fasttext_vs_flair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Classification of emails using fastText**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are uploaded interactively through the Colab file picker (see the upload cell below).

In [2]:
import os

# Show the directory the kernel is running in; relative file names used by
# this notebook are resolved against it.
working_directory = os.getcwd()
print(working_directory)

/content


In [None]:
from google.colab import files
# Each upload() call opens Colab's file picker; the object it returns is
# indexed by uploaded file name when the CSVs are parsed afterwards.
# First prompt: select the training CSV.
email_train = files.upload()
# Second prompt: select the test CSV.
email_test = files.upload()

In [None]:
import io


def _uploaded_csv_to_frame(uploads, file_name):
    """Parse the uploaded bytes stored under ``file_name`` into a DataFrame."""
    return pd.read_csv(io.BytesIO(uploads[file_name]))


email_train_data = _uploaded_csv_to_frame(email_train, "idfc-email-train.csv")
email_test_data = _uploaded_csv_to_frame(email_test, "idfc-email-test.csv")

In [None]:
# Preview both splits. A notebook cell only renders its final expression, so
# display() is called explicitly to show the training preview as well as the
# test preview instead of silently dropping the first one.
display(email_train_data.head(), email_test_data.head())

In [None]:
# Number of rows (emails) in the training split.
print(email_train_data.shape[0])


In [None]:
# List the distinct SubCategory values present in the training split.
print(email_train_data["SubCategory"].unique())

In [None]:
# Convert string labels to integer codes for classification.
# The code table is learned from the training split only and then reused for
# the test split, so a given SubCategory maps to the same integer in both
# frames. Factorizing each split independently numbers the categories by order
# of first appearance, which generally differs between the two files.
train_label_codes, subcategory_index = pd.factorize(email_train_data["SubCategory"])
email_train_data["labels"] = train_label_codes
# get_indexer yields -1 for values that are absent from the training
# categories (and for missing values), matching the sentinel pd.factorize
# uses for NaN.
email_test_data["labels"] = subcategory_index.get_indexer(email_test_data["SubCategory"])



In [None]:
# Shuffle both splits. DataFrame.sample returns a new frame, so the result has
# to be assigned back; calling it as a bare statement leaves the original row
# order untouched. A fixed random_state keeps the order reproducible across
# runs, and the index is rebuilt afterwards so it is a clean 0..n-1 range.
email_train_data = email_train_data.sample(frac=1, random_state=42).reset_index(drop=True)
email_test_data = email_test_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Give the email text column a shorter, space-free name in both splits.
description_rename = {'Subject Description': 'Description'}
for frame in (email_train_data, email_test_data):
    frame.rename(columns=description_rename, inplace=True)

In [None]:
# Prepare the training data in fastText's supervised format:
#   __label__<SubCategory> <Description>
# The column is built in one vectorized expression. Writing it row by row with
# `frame.label_format[i] = ...` is chained assignment, which pandas warns about
# and which does not write back to the frame when copy-on-write is enabled.
# Whitespace runs in the description (including embedded newlines) are
# collapsed to single spaces because fastText expects one example per line.
# NOTE(review): if SubCategory values contain spaces, fastText will read only
# the first token as the label — confirm against the data.
train_descriptions = (
    email_train_data["Description"]
    .map(str)  # same text as str(value), including "nan" for missing cells
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)
email_train_data["label_format"] = (
    "__label__" + email_train_data["SubCategory"].map(str) + " " + train_descriptions
)

In [None]:
# Prepare the test data in fastText's supervised format:
#   __label__<SubCategory> <Description>
# The column is built in one vectorized expression. Writing it row by row with
# `frame.label_format[i] = ...` is chained assignment, which pandas warns about
# and which does not write back to the frame when copy-on-write is enabled.
# Whitespace runs in the description (including embedded newlines) are
# collapsed to single spaces because fastText expects one example per line.
# NOTE(review): if SubCategory values contain spaces, fastText will read only
# the first token as the label — confirm against the data.
test_descriptions = (
    email_test_data["Description"]
    .map(str)  # same text as str(value), including "nan" for missing cells
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)
email_test_data["label_format"] = (
    "__label__" + email_test_data["SubCategory"].map(str) + " " + test_descriptions
)

In [None]:
# Inspect the first five rows of the test split after the preparation steps.
email_test_data.head(n=5)

In [None]:
# Write one fastText example per line as plain text. Series.to_csv is avoided
# because CSV quoting wraps any line containing a comma or a double quote in
# quotes; such a line then starts with `"__label__...` and fastText no longer
# recognises the label prefix.
with open('fasttext_train.txt', 'w', encoding='utf-8') as train_out:
    for example in email_train_data.label_format:
        # split()/join collapses embedded newlines and whitespace runs so each
        # example stays on a single line.
        train_out.write(" ".join(str(example).split()) + "\n")


In [None]:
# Write one fastText example per line as plain text. Series.to_csv is avoided
# because CSV quoting wraps any line containing a comma or a double quote in
# quotes; such a line then starts with `"__label__...` and fastText no longer
# recognises the label prefix.
with open('fasttext_test.txt', 'w', encoding='utf-8') as test_out:
    for example in email_test_data.label_format:
        # split()/join collapses embedded newlines and whitespace runs so each
        # example stays on a single line.
        test_out.write(" ".join(str(example).split()) + "\n")


In [None]:
!pip install fasttext

In [None]:
import fasttext

In [None]:
# Train the supervised fastText classifier on the prepared training file.
fasttext_hyperparams = dict(epoch=50, lr=0.05, label_prefix='__label__', dim=300)
model = fasttext.train_supervised('fasttext_train.txt', **fasttext_hyperparams)

In [None]:
# Evaluate the trained model on the test file; the result is shown as the
# cell output.
test_metrics = model.test('fasttext_test.txt')
test_metrics

In [None]:
# Score the model on the file it was trained on and on the test file.
result = model.test('fasttext_train.txt')
validation = model.test('fasttext_test.txt')

# Report element 1 of each result side by side (training file first, then
# test file).
text_line = f"accuracy:{result[1]},validation:{validation[1]}\n"
print(text_line)

**Classification using Flair**

In [None]:
# The Flair experiment reuses the same data set; show the first five training rows.
email_train_data.head(n=5)

In [None]:
# First five rows of the test split used for the Flair experiment.
email_test_data.head(n=5)

In [None]:
# Show the column names currently present on the training frame.
email_train_data.columns

In [None]:
!pip install flair

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab filesystem at /content/drive.
# NOTE(review): no cell visible in this notebook reads from or writes to
# /content/drive — confirm the mount is still needed.
drive.mount('/content/drive')

In [None]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus


In [None]:
from flair.models import TextClassifier
from flair.data import Sentence
from flair.datasets import ClassificationCorpus


In [None]:
# Where the data lives: the working folder plus the names of the original
# train/test CSV files.
data_folder = "/content"
train_file, test_file = "idfc-email-train.csv", "idfc-email-test.csv"

In [None]:
# Column position -> column name for the two CSV fields.
column_name_map = dict(enumerate(("SubCategory", "Subject Description")))


In [None]:
# Load the corpus from the fastText-formatted text files
# ("__label__<class> <text>" per line), the layout ClassificationCorpus reads.
# column_name_map is intentionally not passed: it belongs to
# CSVClassificationCorpus, and as the second positional argument it would
# occupy the label_type parameter and clash with the label_type keyword below
# (TypeError: multiple values for argument).
# No dev_file is given, so Flair is left to derive the dev split itself.
corpus: Corpus = ClassificationCorpus(
    data_folder,
    train_file='fasttext_train.txt',
    test_file='fasttext_test.txt',
    label_type='SubCategory',
)

In [None]:
# 2. name of the label type to predict; this is the same string used as
#    label_type when the corpus is constructed
label_type = 'SubCategory'

# 3. create the label dictionary for that label type from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)


In [None]:
import torch
from torch.optim.lr_scheduler import OneCycleLR

from flair.data import Corpus
from flair.datasets import TREC_6
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

In [None]:
# 4. build document-level embeddings from a pretrained transformer checkpoint;
#    fine_tune=True is passed so the transformer is trained along with the classifier
transformer_name = 'distilbert-base-uncased'
document_embeddings = TransformerDocumentEmbeddings(transformer_name, fine_tune=True)

In [None]:
# 5. create the text classifier from the document embeddings, the label
#    dictionary and the label type it should predict
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type=label_type)

In [None]:
# 6. initialize the trainer for the classifier and corpus.
#    NOTE: no optimizer is passed here, so this line does not select AdamW;
#    the optimizer is whatever the training call uses.
trainer = ModelTrainer(classifier, corpus)

In [None]:
# Fine-tune the transformer-based classifier.
# The document embeddings are created with fine_tune=True, so the transformer
# weights are updated during training. A learning rate of 0.5 is orders of
# magnitude too large for that and destabilises the pretrained weights.
# ModelTrainer.fine_tune applies Flair's transformer fine-tuning setup
# (AdamW with a warmup / linear-decay schedule) and is used here with a small
# learning rate.
# Training artefacts, including final-model.pt, are written to /content.
trainer.fine_tune('/content',
                  learning_rate=5.0e-5,
                  mini_batch_size=32,
                  max_epochs=10)

In [None]:
# Reload the classifier from the final-model.pt file in the training output
# folder and try it on a single hand-written example.
classifier = TextClassifier.load('/content/final-model.pt')

# create example sentence
sentence = Sentence('i am request to credit cards')

# predict the class; the prediction is read back from the sentence below
classifier.predict(sentence)

# print the labels attached to the sentence
print(sentence.labels)