**Text classification** is a supervised learning task where the goal is to assign predefined categories to text documents.
This process involves transforming text data into numerical features that machine learning algorithms can process.
Vectorization converts textual data into a numeric representation. This can be done with GloVe vectors, TF-IDF vectors, paid embedding services such as OpenAI embeddings, or free embedding
techniques such as Hugging Face embeddings, BM25, etc.
The numeric vectors are then fed into the model. In this notebook, we primarily focus on lightweight algorithms provided by scikit-learn and CatBoost.

In [1]:
import pandas as pd
# Read the raw dataset from disk and preview its first three rows.
df = pd.read_csv('data.csv')
df.head(3)


In [None]:
# Number of rows in the dataset (first entry of the frame's shape).
df.shape[0]

9348

In [None]:
# Number of distinct values in the 'classname' column, i.e. how many classes there are.
df['classname'].nunique()

27

In [None]:
# Distinct class names, in order of first appearance, as a plain Python list.
# Using the raw per-row `.values` array here would make the id lookups below
# enumerate every row instead of every class, and a NumPy array has no
# `.index()` method, which the label-encoding step relies on.
labels = df['classname'].unique().tolist()

In [None]:
# Lookup tables between class names and integer ids.
# dict.fromkeys() de-duplicates while keeping first-appearance order, so the
# ids are contiguous (0 .. n_classes - 1) even if `labels` holds one entry
# per row rather than one entry per class.
label2id = {name: idx for idx, name in enumerate(dict.fromkeys(labels))}
# Derive the reverse table from the forward one so the two can never disagree.
id2label = {idx: name for name, idx in label2id.items()}

In [1]:
# Class name -> integer id, built once. `labels` may be a NumPy array (which
# has no `.index()` method) and may contain one entry per row, so it is
# de-duplicated here while preserving first-appearance order.
_label_positions = {name: idx for idx, name in enumerate(dict.fromkeys(labels))}


def label_num_value(x):
    """Return the integer id of class name ``x``.

    The id is the position of ``x`` among the distinct entries of ``labels``
    in order of first appearance, i.e. what ``list.index`` would return on
    the de-duplicated list.

    Raises:
        ValueError: if ``x`` is not a known class name (same exception type
            that ``list.index`` raises for a missing element).
    """
    try:
        return _label_positions[x]
    except KeyError:
        raise ValueError(f"{x!r} is not in labels") from None

# Encode every row's class name as its integer id.
df['labels_num']=df['classname'].apply(label_num_value)

In [None]:
# Expose the 'Text_Data' column under the name 'Classify_Text', which is the
# column the modelling cells read as the input text.
df['Classify_Text']=df['Text_Data']

In [1]:
# Working frame: document identifier, input text and encoded label only.
work_df = df[[
    'Document No.',
    'Classify_Text',
    'labels_num',
]]

In [None]:
# Missing-value count per column of the working frame.
work_df.isna().sum()

Document No.     0
Classify_Text    0
labels_num       0
dtype: int64

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier


# Prepare the data: raw text as features, encoded class ids as targets
X = work_df['Classify_Text'].values
y = work_df['labels_num'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the pre-trained GloVe vectors into a {word: float32 vector} dict.
# The GloVe embeddings file must already be downloaded next to the notebook.
glove_path = 'glove.6B.100d.txt'
embeddings_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Bag-of-words counts. These count matrices (not the GloVe vectors) are the
# features the classifier below is trained and evaluated on.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Map each vocabulary term to its GloVe vector; terms without one stay zero.
# CountVectorizer feature indices run from 0 to len(vocabulary_) - 1, so the
# matrix needs exactly one row per term (no extra padding row). That keeps
# its row count equal to the column count of the count matrices.
word_index = vectorizer.vocabulary_
# Take the vector width from the loaded file; fall back to 100 (the width
# implied by the file name) if nothing was loaded.
embedding_dim = len(next(iter(embeddings_index.values()))) if embeddings_index else 100
embedding_matrix = np.zeros((len(word_index), embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# NOTE(review): embedding_matrix is built but not passed to the classifier in
# this cell; the model is fitted on X_train_counts only.

# Define the CatBoost classifier
classifier = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, loss_function='MultiClass')

# Train the classifier on the count features
classifier.fit(X_train_counts, y_train)

# Make predictions
y_pred = classifier.predict(X_test_counts)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


0:	learn: 3.1033339	total: 189ms	remaining: 3m 8s
1:	learn: 3.0409378	total: 299ms	remaining: 2m 29s
2:	learn: 2.9572699	total: 392ms	remaining: 2m 10s
3:	learn: 2.8641626	total: 488ms	remaining: 2m 1s
4:	learn: 2.7921128	total: 568ms	remaining: 1m 53s
5:	learn: 2.7483173	total: 663ms	remaining: 1m 49s
6:	learn: 2.6892638	total: 749ms	remaining: 1m 46s
7:	learn: 2.6432299	total: 826ms	remaining: 1m 42s
8:	learn: 2.6082361	total: 918ms	remaining: 1m 41s
9:	learn: 2.5913509	total: 1.01s	remaining: 1m 40s
10:	learn: 2.5698567	total: 1.1s	remaining: 1m 39s
11:	learn: 2.5502658	total: 1.19s	remaining: 1m 37s
12:	learn: 2.5280677	total: 1.27s	remaining: 1m 36s
13:	learn: 2.5109727	total: 1.36s	remaining: 1m 35s
14:	learn: 2.4878770	total: 1.48s	remaining: 1m 37s
15:	learn: 2.4705591	total: 1.56s	remaining: 1m 35s
16:	learn: 2.4455308	total: 1.65s	remaining: 1m 35s
17:	learn: 2.4306994	total: 1.74s	remaining: 1m 34s
18:	learn: 2.4170253	total: 1.84s	remaining: 1m 34s
19:	learn: 2.4061458	tota

Random Forest


In [None]:
# Random Forest baseline on the bag-of-words count features
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# fit() returns the fitted estimator, so training and prediction can be chained
rf_y_pred = rf_classifier.fit(X_train_counts, y_train).predict(X_test_counts)

# Share of test documents whose predicted class matches the true class
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Random Forest Accuracy:", rf_accuracy)



Random Forest Accuracy: 0.704812834224599


Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Support Vector Machine with scikit-learn's default settings, trained on the
# bag-of-words count features. The model and its results get their own names
# so they do not overwrite the CatBoost model or the Random Forest results.
svm_classifier = SVC()

# Train the SVM classifier
svm_classifier.fit(X_train_counts, y_train)

# Make predictions with the SVM
svm_y_pred = svm_classifier.predict(X_test_counts)

# Calculate accuracy of the SVM
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print("Support Vector Machine Accuracy:", svm_accuracy)

Support Vector Machine Accuracy: 0.6326203208556149


Adaboost

In [None]:
# AdaBoost baseline on the bag-of-words count features
from sklearn.ensemble import AdaBoostClassifier

ab_classifier = AdaBoostClassifier(n_estimators=100, random_state=42)

# fit() returns the fitted estimator, so training and prediction can be chained
ab_y_pred = ab_classifier.fit(X_train_counts, y_train).predict(X_test_counts)

# Share of test documents whose predicted class matches the true class
ab_accuracy = accuracy_score(y_test, ab_y_pred)
print("AdaBoost Accuracy:", ab_accuracy)



AdaBoost Accuracy: 0.14759358288770053


#Tfidf

In [None]:

# Features (raw text) and targets (encoded class ids) from the working frame
X = work_df['Classify_Text'].values
y = work_df['labels_num'].values

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# 87% / 13% split.
# NOTE(review): this rebinds y_train / y_test, which earlier cells paired with
# X_train_counts / X_test_counts from a different (80/20) split.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, random_state=42
)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

# Learn the vocabulary on the training texts and build both document-term
# matrices. NOTE(review): these are raw term counts, not TF-IDF weights.
vect = CountVectorizer()
x_train_dtm = vect.fit_transform(x_train)
x_test_dtm = vect.transform(x_test)

8132 8132
1216 1216


Random Forest with Tfidf

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Stratified 80/20 split so every class keeps its proportion in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF features, vocabulary capped by max_features (adjust as needed)
vectorizer = TfidfVectorizer(max_features=7000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Random Forest on the TF-IDF features
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# fit() returns the fitted estimator, so training and prediction can be chained
rf_y_pred = rf_classifier.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Share of test documents whose predicted class matches the true class
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Random Forest Accuracy:", rf_accuracy)


Random Forest Accuracy: 0.681283422459893


CatBoost with Tfidf


In [None]:
# Define the CatBoost classifier
classifier = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, loss_function='MultiClass')

# Train the classifier on the TF-IDF features
classifier.fit(X_train_tfidf, y_train)

# Make predictions on the TF-IDF test matrix. It must come from the same
# vectorizer and the same split as the training matrix; the count-based test
# matrix has a different feature space and belongs to a different split than
# the current y_test.
y_pred = classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("catboost Accuracy:", accuracy)

0:	learn: 3.1069746	total: 331ms	remaining: 5m 30s
1:	learn: 2.9635623	total: 567ms	remaining: 4m 42s
2:	learn: 2.8918775	total: 802ms	remaining: 4m 26s
3:	learn: 2.8293932	total: 1.04s	remaining: 4m 19s
4:	learn: 2.7733406	total: 1.26s	remaining: 4m 11s
5:	learn: 2.7200574	total: 1.5s	remaining: 4m 8s
6:	learn: 2.6676995	total: 1.71s	remaining: 4m 3s
7:	learn: 2.6452845	total: 1.94s	remaining: 3m 59s
8:	learn: 2.6010912	total: 2.15s	remaining: 3m 56s
9:	learn: 2.5699269	total: 2.37s	remaining: 3m 54s
10:	learn: 2.5350950	total: 2.58s	remaining: 3m 51s
11:	learn: 2.5110986	total: 2.8s	remaining: 3m 50s
12:	learn: 2.4902261	total: 3.01s	remaining: 3m 48s
13:	learn: 2.4655603	total: 3.22s	remaining: 3m 46s
14:	learn: 2.4448873	total: 3.44s	remaining: 3m 45s
15:	learn: 2.4234539	total: 3.65s	remaining: 3m 44s
16:	learn: 2.4095446	total: 3.86s	remaining: 3m 43s
17:	learn: 2.3914418	total: 4.08s	remaining: 3m 42s
18:	learn: 2.3759511	total: 4.29s	remaining: 3m 41s
19:	learn: 2.3559637	total

Adaboost with Tfidf

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with its default settings on the TF-IDF features
adaboost_clf = AdaBoostClassifier()

# fit() returns the fitted estimator, so training and prediction can be chained
y_pred = adaboost_clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Share of test documents whose predicted class matches the true class
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



Accuracy: 0.32245989304812833


In [None]:
#GradientBoost with Tfidf

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting over depth-1 trees (stumps) on the TF-IDF features
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)

# fit() returns the fitted estimator, so training and prediction can be chained
y_pred = clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Share of test documents whose predicted class matches the true class
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.3962566844919786
