## Analyze data (Predict Category based on Symptoms)

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from  sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

In [3]:
# load only symptoms and Category Data
data = pd.read_excel("Train.xlsx")
data.rename(columns={'Subject':'Symptoms'}, inplace=True)
data.head()

Unnamed: 0,Symptoms,Category
0,Print error,Printer
1,Fwd: Digital Standees - Cr CTA,Data not clear
2,Guidance required in MS Office,MS office
3,Acquaint new users with Service Desk & various...,Application/Software
4,Crown Project office Printer not working,Printer


In [4]:
data.shape

(34438, 2)

In [5]:
X = data["Symptoms"]
y = data["Category"]

In [6]:
Tf_idf = TfidfVectorizer()
le = LabelEncoder()
XX = Tf_idf.fit_transform(X)
yy = le.fit_transform(y)

In [7]:
print('Symptoms Size : ', XX.shape)

Symptoms Size :  (34438, 8089)


In [8]:
## Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(XX, yy, test_size=0.2, random_state=42)

In [9]:
# Model Define
model = {
    "LogisticRegression": LogisticRegression(),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier()
    # "SVC": SVC(),  # Support Vector Classifier
}

In [10]:
accuracy_results = {}

# Loop through the models and evaluate each one
for name, model_instance in model.items():
    # Train the model
    model_instance.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model_instance.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results[name] = accuracy
    print(f"{name}: Accuracy = {accuracy:.4f}")

LogisticRegression: Accuracy = 0.8898
DecisionTree: Accuracy = 0.8875
RandomForest: Accuracy = 0.8992


In [11]:
# Find the model with the highest accuracy
best_model_name = max(accuracy_results, key=accuracy_results.get)
best_accuracy = accuracy_results[best_model_name]
best_model = model[best_model_name]

print(f"\nBest Model: {best_model_name} with Accuracy = {best_accuracy:.4f}")


Best Model: RandomForest with Accuracy = 0.8992


In [12]:
# Define hyperparameters for grid search
param_grid = {
    'LogisticRegression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    },
    'DecisionTree': {
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'RandomForest': {
        'n_estimators': [25, 50, 100, 150],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [3, 6, 9],
        'max_leaf_nodes': [3, 6, 9]
    }
}

In [13]:
best_models = {}
for name, model_instance in model.items():
    print(f"Performing GridSearchCV for {name}")
    grid_search = GridSearchCV(model_instance, param_grid[name], cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Best model from grid search
    best_model = grid_search.best_estimator_
    best_models[name] = best_model
    print(f'Best Model for {name}: {best_model}')
    print(f'Best Model Accuracy for {name}: {best_model.score(X_test, y_test):.4f}')


Performing GridSearchCV for LogisticRegression




Best Model for LogisticRegression: LogisticRegression(C=10, penalty='l1', solver='liblinear')
Best Model Accuracy for LogisticRegression: 0.9059
Performing GridSearchCV for DecisionTree
Best Model for DecisionTree: DecisionTreeClassifier()
Best Model Accuracy for DecisionTree: 0.8878
Performing GridSearchCV for RandomForest
Best Model for RandomForest: RandomForestClassifier(max_depth=9, max_leaf_nodes=9, n_estimators=25)
Best Model Accuracy for RandomForest: 0.3859


In [14]:
# Find the model with the highest accuracy
best_model_name = max(best_models, key=lambda name: best_models[name].score(X_test, y_test))
best_accuracy = best_models[best_model_name].score(X_test, y_test)

print(f"\nBest Model: {best_model_name} with Accuracy = {best_accuracy:.4f}")


Best Model: LogisticRegression with Accuracy = 0.9059


### Fun with model with manually example 😁

In [58]:
new_subject = ["mic not working"]
new_subject_transformed = Tf_idf.transform(new_subject)
# predicted_category = best_model.predict(new_subject_transformed)
predicted_category = best_models[best_model_name].predict(new_subject_transformed)
predicted_category_label = le.inverse_transform(predicted_category)

print(f"The predicted category is: {predicted_category_label[0]}")

The predicted category is: Hardware


### Prediction on real data

In [18]:
new_data = pd.read_excel("New_Data.xlsx")
new_data.columns

Index(['Incident', 'Customer', 'Product', 'Status Code', 'Assignee',
       'Severity', 'Created By', 'Subject', 'Created', 'Updated',
       'Raised By Branch', 'Raised By Branch Region'],
      dtype='object')

In [19]:
new_data.head()

Unnamed: 0,Incident,Customer,Product,Status Code,Assignee,Severity,Created By,Subject,Created,Updated,Raised By Branch,Raised By Branch Region
0,ID1A,HFC IT,Service Request,80. Resolved,HFCITSECUNDERABAD,Non critical issue,MOHANC,601. Service Request-IceWarp- Not able to logi...,2024-09-26 10:34:37.157,2024-09-25 23:12:17.655,VIZAG,AP + TELNGANA
1,ID4F,HFC IT,Service Request,.,HFCITINDORE,Non critical issue,RIYAP,511. Service Request-Unable to access emails |...,2024-09-26 11:42:10.000,2024-09-26 11:42:10.471,INDORE,MP + CG
2,H7BT,HFC IT,Service Request,80. Resolved,HFCITSECUNDERABAD,Non critical issue,LINGAMPALLYSK,125. Service Request-Hardware Issue | 11-Sep-2024,2024-09-11 11:05:08.466,2024-09-25 23:09:45.919,NALGONDA,AP + TELNGANA
3,ID3Z,HFC IT,Service Request,10. Open,XHFCITJALGAON,Non critical issue,HIMANSHUR,142. Service Request-Printer Configure | 26-Se...,2024-09-26 11:33:57.801,2024-09-26 11:36:22.492,MANDSAUR,
4,ID2S,HFC IT,Service Request,80. Resolved,HFCITCHENNAI,Non critical issue,GOWTHAMANR,112. Service Request-Citrix | 26-Sep-2024,2024-09-26 11:04:22.549,2024-09-26 11:30:11.920,TRICHY,TAMILNADU


In [20]:
newtf = Tf_idf.transform(new_data['Subject'])

In [21]:
# Will predict both method
pred = best_models[best_model_name].predict(newtf)
newpred = best_models[best_model_name].predict(newtf)

In [22]:
# Inverse encode data
predicted = le.inverse_transform(pred)
newpredicted = le.inverse_transform(newpred)

In [40]:
# making dictionary for dataframe
data = {
    "Symptom": new_data["Subject"],
    "Predicted": predicted,
    "2nd Predicted": newpredicted
}


In [41]:
# save predected data in dataframe and show tope 5
df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,Symptom,Predicted,2nd Predicted
0,601. Service Request-IceWarp- Not able to logi...,Application/Software,Application/Software
1,511. Service Request-Unable to access emails |...,Access Request,Access Request
2,125. Service Request-Hardware Issue | 11-Sep-2024,Hardware,Hardware
3,142. Service Request-Printer Configure | 26-Se...,Printer,Printer
4,112. Service Request-Citrix | 26-Sep-2024,Application/Software,Application/Software


In [42]:
# Will save only Grid Search CV method prediction because it's have better accuracy
# new_data["Predicted Category"] = predicted
new_data["GSCV Predicted Category"] = newpredicted

In [43]:
new_data.to_excel("Predict_cat.xlsx", index_label=False)