In [None]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split

: 

In [None]:
# Step 2: Downloading nltk resources
# 'punkt' is the tokenizer data older NLTK releases load for word_tokenize;
# newer releases look for 'punkt_tab' instead, so both are fetched to keep the
# notebook runnable regardless of the installed NLTK version.
# 'stopwords' backs the English stop-word list used during preprocessing.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

: 

In [None]:
# Step 3: Setting random seed
# Seed NumPy's global random generator with a fixed value.
SEED = 0
np.random.seed(SEED)

: 

In [None]:
# Step 4: Loading data and preprocessing
# English stop words, held in a set for the membership tests done per token.
stop_words = set(stopwords.words('english'))
# Factory reports read from the Excel workbook (path is relative to the
# working directory).
data = pd.read_excel('factoryReports.xlsx')

: 

In [None]:
def preprocess_text(text):
    """Normalize one piece of text: tokenize, lowercase, drop stop words.

    Parameters
    ----------
    text : str or scalar
        Raw text. A missing scalar (``None``, ``NaN``, ``pd.NA``) yields an
        empty string; any other non-string scalar is converted with ``str``
        before tokenizing.

    Returns
    -------
    str
        The surviving lowercase tokens joined by single spaces.

    Notes
    -----
    Relies on the module-level ``stop_words`` set and on ``word_tokenize``.
    """
    if not isinstance(text, str):
        # Empty spreadsheet cells arrive as NaN/None; word_tokenize cannot
        # handle them, so treat them as "no text" instead of crashing.
        if pd.isna(text):
            return ''
        text = str(text)

    # Lowercase before the stop-word check so capitalized stop words are
    # filtered too (same order of operations as a two-pass version).
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    return ' '.join(token for token in lowered_tokens if token not in stop_words)

# Run both the report text and the category labels through the same
# normalizer, overwriting each column in place.
for column in ('Description', 'Category'):
    data[column] = data[column].apply(preprocess_text)


: 

In [None]:
# Step 6: Applying TFIDF vectorizer and splitting data
y = data['Category']

# Split the raw text *before* vectorizing so the held-out reports never
# influence the learned vocabulary or IDF weights (avoids train/test leakage).
# The split depends only on the number of rows and random_state, so the
# train/test row partition is the same as when splitting a feature matrix.
text_train, text_test, y_train, y_test = train_test_split(
    data['Description'], y, test_size=0.2, random_state=42
)

# Fit on the training reports only; the test reports are merely projected
# into that feature space.
tfidf_vectorizer = TfidfVectorizer()
x_train = tfidf_vectorizer.fit_transform(text_train)
x_test = tfidf_vectorizer.transform(text_test)

# Full-corpus matrix kept under its original name for any cell that uses X.
X = tfidf_vectorizer.transform(data['Description'])

: 

In [None]:
# Step 7: Training Random Forest Classifier
# A forest of 100 trees with a pinned random_state.
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
# Left as the cell's last expression so the fitted estimator is displayed.
rf_classifier.fit(X=x_train, y=y_train)

: 

In [None]:
# Step 8: Making predictions
# Predicted category for every report in the held-out split.
y_pred = rf_classifier.predict(X=x_test)

: 

In [None]:
# Step 9: Calculating accuracy
# Share of held-out reports whose predicted category equals the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

: 

In [None]:
# Plotting confusion matrix
# Use one explicit, sorted label list for both the matrix and the axis ticks.
# Series.unique() returns labels in order of first appearance, which does not
# match the sorted order confusion_matrix uses by default and would mislabel
# the rows/columns. Passing `labels=` also keeps the matrix square over every
# category even if one is absent from the test split.
class_labels = sorted(data['Category'].unique())
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# scikit-learn's built-in rendering of the matrix.
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_labels)
disp.plot()

# Per-class precision / recall / F1 summary.
print(classification_report(y_test, y_pred))

# Hand-drawn rendering of the same matrix (blue heatmap, red cell counts).
fig, ax = plt.subplots(figsize=(8, 6))
image = ax.imshow(conf_matrix, cmap='Blues', interpolation='nearest')
ax.set_title('Confusion Matrix')
fig.colorbar(image, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
tick_positions = np.arange(len(class_labels))
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_labels, rotation=45)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_labels)
# imshow puts column index on x and row index on y, hence text(j, i, ...).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(j, i, conf_matrix[i, j], ha='center', va='center', color='red')
plt.show()

: 