In [11]:
# Install the necessary libraries
!pip install pandas
!pip install openpyxl



In [12]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [13]:
# Load both Excel workbooks (text summaries first, then the training set)
text_summary_data, training_data = (
    pd.read_excel(workbook)
    for workbook in ('text_summary_datasets_v2.xlsx', 'training_data_v2.xlsx')
)

In [14]:
# Preview the first five rows to sanity-check the loaded columns
training_data.head(n=5)

Unnamed: 0,Index,Category,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,...,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767,dim_768
0,0,0,-0.882278,-0.647234,0.050173,-0.448188,-0.175582,0.125284,-0.335781,-0.396106,...,0.56847,-0.326577,0.026089,-0.407658,-0.162295,-0.121949,-0.386429,0.135763,0.516049,0.731324
1,1,0,-0.702665,-0.462591,0.162085,-0.029182,-0.280842,0.047459,0.109864,-0.54021,...,0.24944,-0.422917,0.03382,-0.226271,-0.324386,-0.036914,-0.588373,-0.344278,0.329853,0.195897
2,2,0,-0.327257,-0.397209,0.035037,-0.064671,-0.435734,0.535525,0.134867,-0.213102,...,0.710835,-0.19151,-0.068998,-0.262279,-0.214397,0.095195,-0.503536,0.142249,0.206015,0.182094
3,3,0,-0.793734,-0.470964,-0.278644,-0.292047,-0.565868,0.546791,0.604674,-0.06919,...,0.387789,-0.596448,-0.291108,-0.320205,-0.362207,0.179917,-0.600026,-0.200465,0.776508,-0.155819
4,4,0,-0.27284,-0.556684,0.001737,-0.229906,-0.495732,0.176596,-0.141926,-0.352247,...,0.660122,-0.534026,0.347033,-0.279629,-0.397189,0.226515,-0.547098,0.431136,0.102714,0.422797


In [18]:
# Take "Category" as the target (y); every remaining column except "Index"
# becomes a feature (x).
y = training_data["Category"]
x = training_data.drop(["Index", "Category"], axis=1)

In [20]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified, so class proportions may differ
# between the train and test sets — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [21]:
## Random Forest Classifier
# Build a 100-tree forest (seeded for repeatable results) and fit it on the
# training split; fit() returns the fitted estimator itself.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split
rf_y_pred = rf_clf.predict(X_test)

# Score the predictions against the true test labels
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_conf_matrix = confusion_matrix(y_test, rf_y_pred)  # rows: true class, columns: predicted class
rf_class_report = classification_report(y_test, rf_y_pred)

# Print the full report in a single call, one item per line
print(
    "Random Forest Classifier",
    f"Accuracy: {rf_accuracy}",
    "Confusion Matrix:",
    rf_conf_matrix,
    "Classification Report:",
    rf_class_report,
    sep="\n",
)

Random Forest Classifier
Accuracy: 0.9
Confusion Matrix:
[[11  0  0  0]
 [ 1 17  2  0]
 [ 0  1 14  0]
 [ 0  2  0 12]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       0.85      0.85      0.85        20
           2       0.88      0.93      0.90        15
           3       1.00      0.86      0.92        14

    accuracy                           0.90        60
   macro avg       0.91      0.91      0.91        60
weighted avg       0.90      0.90      0.90        60



In [22]:
## Support Vector Machine Classifier
# Fit a linear-kernel SVM on the training split; fit() returns the fitted
# estimator itself. random_state only influences probability estimates, so it
# has no effect here while probability is left at its default (False).
svm_clf = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split
svm_y_pred = svm_clf.predict(X_test)

# Score the predictions against the true test labels
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_conf_matrix = confusion_matrix(y_test, svm_y_pred)  # rows: true class, columns: predicted class
svm_class_report = classification_report(y_test, svm_y_pred)

# Print the full report in a single call, one item per line
print(
    "\nSupport Vector Machine Classifier",
    f"Accuracy: {svm_accuracy}",
    "Confusion Matrix:",
    svm_conf_matrix,
    "Classification Report:",
    svm_class_report,
    sep="\n",
)


Support Vector Machine Classifier
Accuracy: 0.9833333333333333
Confusion Matrix:
[[11  0  0  0]
 [ 1 19  0  0]
 [ 0  0 15  0]
 [ 0  0  0 14]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.95      0.97        20
           2       1.00      1.00      1.00        15
           3       1.00      1.00      1.00        14

    accuracy                           0.98        60
   macro avg       0.98      0.99      0.98        60
weighted avg       0.98      0.98      0.98        60



**Random Forest Classifier**

Category 0:
* Precision: 0.92 indicates that when the classifier predicts a data point as belonging to this category, it is correct 92% of the time.
* Recall: 1.00 suggests that the classifier effectively captures all true instances of this category.
* F1-score: 0.96, the harmonic mean of precision and recall, indicates strong overall performance on this category.

Category 1:
* Precision: 0.85 signifies that when the classifier predicts this category, it is correct 85% of the time.
* Recall: 0.85 implies that the classifier captures 85% of the true instances of this category.
* F1-score: 0.85 reflects a balanced performance for precision and recall.

Category 2:
* Precision: 0.88 indicates that predictions for this category are correct 88% of the time.
* Recall: 0.93 suggests the classifier captures 93% of the true instances of this category.
* F1-score: 0.90 shows a strong balance between precision and recall.

Category 3:
* Precision: 1.00 indicates that every data point the classifier predicted as this category truly belongs to it (no false positives).
* Recall: 0.86 suggests the classifier captures 86% of the true instances.
* F1-score: 0.92 balances precision and recall effectively.


**Support Vector Machine Classifier**

Category 0:
* Precision: 0.92 indicates that when the classifier predicts a data point as belonging to this category, it is correct 92% of the time.
* Recall: 1.00 suggests that the classifier effectively captures all true instances of this category.
* F1-score: 0.96, the harmonic mean of precision and recall, indicates strong overall performance on this category.

Category 1:
* Precision: 1.00 signifies that when the classifier predicts this category, it is correct 100% of the time.
* Recall: 0.95 implies that the classifier captures 95% of the true instances of this category.
* F1-score: 0.97 reflects a balanced performance for precision and recall.

Category 2:
* Precision: 1.00 indicates that every data point the classifier predicted as this category truly belongs to it (no false positives).
* Recall: 1.00 suggests the classifier captures all true instances of this category.
* F1-score: 1.00 shows an excellent balance between precision and recall.

Category 3:
* Precision: 1.00 indicates that every data point the classifier predicted as this category truly belongs to it (no false positives).
* Recall: 1.00 suggests the classifier captures all true instances of this category.
* F1-score: 1.00 balances precision and recall perfectly.

**Summary**
* Random Forest Classifier: Overall, the Random Forest classifier performs well, with high precision, recall, and F1-scores across all categories (accuracy 0.90). Its weakest results are on Category 1, where precision and recall are both 0.85, and on Category 3, where recall drops to 0.86.
* Support Vector Machine Classifier: The SVM classifier shows exceptional performance, with near-perfect precision, recall, and F1-scores across all categories. This indicates that SVM is highly effective for this particular dataset.
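
Comparing both classifiers, the SVM outperforms the Random Forest classifier in overall accuracy (0.98 vs. 0.90) and matches or exceeds it on every per-category metric, making it the better choice for this dataset. Note that these figures come from a single 60-sample test split, so the gap should be confirmed with cross-validation before drawing firm conclusions.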