In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd


import pandas as pd
# Load the labelled intent examples: column 'text' holds the utterance, 'label' the intent.
df = pd.read_excel(r'D:\DA\Innotrat LABS\Intent or Context Analysis\new_generated_context.xlsx')
texts = df['text']
labels = df['label']

# Hold out 20% of the rows for the final accuracy check (fixed seed keeps the split repeatable).
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Learn the TF-IDF vocabulary from the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Hyperparameter combinations tried by the grid search.
param_grid = {
    'max_depth': [None,1,3,5],
    'min_samples_split': [2,5,10]
}

# Seed the tree: scikit-learn permutes the candidate features at every split, so an
# unseeded tree can report different best parameters and accuracy on each run.
classifier = DecisionTreeClassifier(random_state=42)

# 10-fold cross-validated search, scored by accuracy.
grid_search = GridSearchCV(classifier, param_grid, cv = 10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Score the search result on the held-out test split.
predictions = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)

Best Parameters: {'max_depth': None, 'min_samples_split': 10}
Accuracy: 0.3181818181818182


In [2]:
# Sample query about quality handling / CAPA processes.
new_text = (
    'How do you handle quality issues or non-conforming PCBs during the manufacturing process? '
    'Can you share your corrective action and preventive action (CAPA) processes '
    'for maintaining high-quality standards?'
)

# Encode with the already-fitted vectorizer, then classify with the grid-search result.
test_X = vectorizer.transform([new_text])
predictions = grid_search.predict(test_X)
print(predictions)

['How to fabricate']


In [3]:
# Sample query asking for a quotation and a manufacturing timeline.
new_text_2 = (
    'I am interested in manufacturing PCBs for my project. '
    'Attached are the gerber and design files. '
    'Kindly provide a quotation for approximately 100 units. '
    'Also, please let me know the estimated manufacturing timeline'
)

# Encode with the already-fitted vectorizer, then classify with the grid-search result.
test_X_2 = vectorizer.transform([new_text_2])
predictions = grid_search.predict(test_X_2)
print(predictions)

['Who can help me fabricate']


In [4]:
# Short, subject-line style query.
new_text_3 = "Request for PCB Manufacturing - Urgent Project"

# Encode with the already-fitted vectorizer, then classify with the grid-search result.
test_X_3 = vectorizer.transform([new_text_3])
predictions = grid_search.predict(test_X_3)
print(predictions)

['Help me fabricate']


In [5]:
# Sample query about an urgent PCB project.
new_text_4 = (
    'Hi Team, I need a PCB project which I need urgent delivery. '
    'I am sharing my details kindly let me know the next process'
)

# Encode with the already-fitted vectorizer, then classify with the grid-search result.
test_X_4 = vectorizer.transform([new_text_4])
predictions = grid_search.predict(test_X_4)
print(predictions)

['Can you fabricate']


# Final Model Version 1.0

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import nltk
from nltk.corpus import stopwords
# stopwords.words() raises LookupError when the NLTK 'stopwords' corpus is not
# installed; download it once and retry so the cell also runs on a fresh machine.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Load the labelled intent examples: column 'text' holds the utterance, 'label' the intent.
df = pd.read_excel(r'D:\DA\Innotrat LABS\Intent or Context Analysis\new_generated_context.xlsx')
texts = df['text']
labels = df['label']

# Lower-case both texts and labels so differently-cased variants collapse together.
texts = texts.str.lower()
labels = labels.str.lower()

# Drop English stop words (whitespace tokenisation; punctuation stays attached to words).
texts = texts.apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# Hold out 20% of the rows for the final accuracy check (fixed seed keeps the split repeatable).
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Bag-of-words counts (not TF-IDF): learn the vocabulary from the training texts only.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Hyperparameter combinations tried by the grid search.
param_grid = {
    'max_depth': [None,1,3,5],
    'min_samples_split': [2,5,10]
}

# Seed the tree: scikit-learn permutes the candidate features at every split, so an
# unseeded tree can report different best parameters and accuracy on each run.
classifier = DecisionTreeClassifier(random_state=42)

# 10-fold cross-validated search, scored by accuracy.
grid_search = GridSearchCV(classifier, param_grid, cv = 10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Score the search result on the held-out test split.
predictions = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)

Best Parameters: {'max_depth': None, 'min_samples_split': 10}
Accuracy: 0.4318181818181818


# Model and Vectorizer Saving

In [8]:
import joblib

# Persist only the best estimator found by the search, not the whole search object.
joblib.dump(value=grid_search.best_estimator_, filename='decision_tree_model.pkl')

['decision_tree_model.pkl']

In [9]:
# The fitted vectorizer must be saved too: the model only understands its vocabulary.
joblib.dump(value=vectorizer, filename='vectorizer_context.pkl')

['vectorizer_context.pkl']

# Model and Vectorizer Loading

In [None]:
import joblib

# joblib.load unpickles its input, so only load files this notebook wrote itself.
# Restore the fitted vectorizer and the trained classifier from disk.
vectorizer = joblib.load(filename='vectorizer_context.pkl')
model = joblib.load(filename='decision_tree_model.pkl')

In [2]:
# Sample query about quality handling / CAPA processes.
new_text = 'How do you handle quality issues or non-conforming PCBs during the manufacturing process? Can you share your corrective action and preventive action (CAPA) processes for maintaining high-quality standards?'

# Predict with the model restored from decision_tree_model.pkl instead of the in-memory
# grid_search, so this cell works on a kernel that only ran the loading cell.
test_X = vectorizer.transform([new_text])
predictions = model.predict(test_X)
print(predictions)

['how to fabricate']


In [3]:
# Sample query asking for a quotation and a manufacturing timeline.
new_text_2 = 'I am interested in manufacturing PCBs for my project. Attached are the gerber and design files. Kindly provide a quotation for approximately 100 units. Also, please let me know the estimated manufacturing timeline'

# Predict with the model restored from decision_tree_model.pkl instead of the in-memory
# grid_search, so this cell works on a kernel that only ran the loading cell.
test_X_2 = vectorizer.transform([new_text_2])
predictions = model.predict(test_X_2)
print(predictions)

['i need this design to fabricate']


In [4]:
# Short, subject-line style query.
new_text_3 = 'Request for PCB Manufacturing - Urgent Project'

# Predict with the model restored from decision_tree_model.pkl instead of the in-memory
# grid_search, so this cell works on a kernel that only ran the loading cell.
test_X_3 = vectorizer.transform([new_text_3])
predictions = model.predict(test_X_3)
print(predictions)

['who can help me fabricate']


In [5]:
# Sample query about an urgent PCB project.
new_text_4 = 'Hi Team, I need a PCB project which I need urgent delivery. I am sharing my details kindly let me know the next process'

# Predict with the model restored from decision_tree_model.pkl instead of the in-memory
# grid_search, so this cell works on a kernel that only ran the loading cell.
test_X_4 = vectorizer.transform([new_text_4])
predictions = model.predict(test_X_4)
print(predictions)

['who can help me fabricate']


In [6]:
# Sample order-status query.
new_text_5 = 'Hi Team, My order number is 1234. I want to know the status of my order'
# Predict with the model restored from decision_tree_model.pkl instead of the in-memory
# grid_search, so this cell works on a kernel that only ran the loading cell.
test_X_5 = vectorizer.transform([new_text_5])
predictions = model.predict(test_X_5)
print(predictions)

['who can help me fabricate']


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer #CountVectorizer 
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the data
df = pd.read_excel(r'D:\DA\Innotrat LABS\Intent or Context Analysis\new_generated_context.xlsx')
texts, labels = df['text'], df['label']

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: vocabulary learnt on the training texts, reused for the test texts
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# SVM with scikit-learn's default settings
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

# Held-out accuracy
predictions = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.36363636363636365


In [3]:
# Sample query about quality handling / CAPA processes.
new_text = (
    'How do you handle quality issues or non-conforming PCBs during the manufacturing process? '
    'Can you share your corrective action and preventive action (CAPA) processes '
    'for maintaining high-quality standards?'
)

# Encode with the fitted TF-IDF vectorizer and classify with the SVM.
test_X = vectorizer.transform([new_text])
predictions = svm_classifier.predict(test_X)
print(predictions)

['How to fabricate']


In [4]:
# Sample query asking for a quotation and a manufacturing timeline.
new_text_2 = (
    'I am interested in manufacturing PCBs for my project. '
    'Attached are the gerber and design files. '
    'Kindly provide a quotation for approximately 100 units. '
    'Also, please let me know the estimated manufacturing timeline'
)

# Encode with the fitted TF-IDF vectorizer and classify with the SVM.
test_X_2 = vectorizer.transform([new_text_2])
predictions = svm_classifier.predict(test_X_2)
print(predictions)

['Help me fabricate']


In [5]:
# Short, subject-line style query.
new_text_3 = "Request for PCB Manufacturing - Urgent Project"

# Encode with the fitted TF-IDF vectorizer and classify with the SVM.
test_X_3 = vectorizer.transform([new_text_3])
predictions = svm_classifier.predict(test_X_3)
print(predictions)

['Who can help me fabricate']


In [6]:
# Sample query about an urgent PCB project.
new_text_4 = (
    'Hi Team, I need a PCB project which I need urgent delivery. '
    'I am sharing my details kindly let me know the next process'
)

# Encode with the fitted TF-IDF vectorizer and classify with the SVM.
test_X_4 = vectorizer.transform([new_text_4])
predictions = svm_classifier.predict(test_X_4)
print(predictions)

['Who can help me fabricate']


In [7]:
# Sample order-status query.
new_text_5 = (
    'Hi Team, My order number is 1234. '
    'I want to know the status of my order'
)
# Encode with the fitted TF-IDF vectorizer and classify with the SVM.
test_X_5 = vectorizer.transform([new_text_5])
predictions = svm_classifier.predict(test_X_5)
print(predictions)

['Who can help me fabricate']


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer #CountVectorizer 
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd

# Read the data
df = pd.read_excel(r'D:\DA\Innotrat LABS\Intent or Context Analysis\new_generated_context.xlsx')
texts, labels = df['text'], df['label']

# Candidate SVM settings explored by the search
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Feature extractor and base estimator
vectorizer = TfidfVectorizer()
classifier = SVC()

# TF-IDF vocabulary is learnt on the training texts, then reused for the test texts
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# 5-fold cross-validated search, scored by accuracy
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Held-out accuracy of the search result
predictions = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)

Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.4090909090909091


In [4]:
# Sample query about quality handling / CAPA processes.
new_text = (
    'How do you handle quality issues or non-conforming PCBs during the manufacturing process? '
    'Can you share your corrective action and preventive action (CAPA) processes '
    'for maintaining high-quality standards?'
)

# Encode with the fitted TF-IDF vectorizer and classify with the tuned SVM.
test_X = vectorizer.transform([new_text])
predictions = grid_search.predict(test_X)
print(predictions)

['How to fabricate']


In [5]:
# Sample query asking for a quotation and a manufacturing timeline.
new_text_2 = (
    'I am interested in manufacturing PCBs for my project. '
    'Attached are the gerber and design files. '
    'Kindly provide a quotation for approximately 100 units. '
    'Also, please let me know the estimated manufacturing timeline'
)

# Encode with the fitted TF-IDF vectorizer and classify with the tuned SVM.
test_X_2 = vectorizer.transform([new_text_2])
predictions = grid_search.predict(test_X_2)
print(predictions)

['Help me fabricate']


In [6]:
# Short, subject-line style query.
new_text_3 = "Request for PCB Manufacturing - Urgent Project"

# Encode with the fitted TF-IDF vectorizer and classify with the tuned SVM.
test_X_3 = vectorizer.transform([new_text_3])
predictions = grid_search.predict(test_X_3)
print(predictions)

['Who can help me fabricate']


In [7]:
# Sample query about an urgent PCB project.
new_text_4 = (
    'Hi Team, I need a PCB project which I need urgent delivery. '
    'I am sharing my details kindly let me know the next process'
)

# Encode with the fitted TF-IDF vectorizer and classify with the tuned SVM.
test_X_4 = vectorizer.transform([new_text_4])
predictions = grid_search.predict(test_X_4)
print(predictions)

['Help me fabricate']


In [8]:
# Sample order-status query.
new_text_5 = (
    'Hi Team, My order number is 1234. '
    'I want to know the status of my order'
)
# Encode with the fitted TF-IDF vectorizer and classify with the tuned SVM.
test_X_5 = vectorizer.transform([new_text_5])
predictions = grid_search.predict(test_X_5)
print(predictions)

['Who can help me fabricate']


In [2]:
from sklearn.feature_extraction.text import  CountVectorizer #TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd

# Read the data
df = pd.read_excel(r'D:\DA\Innotrat LABS\Intent or Context Analysis\new_generated_context.xlsx')
texts, labels = df['text'], df['label']

# Candidate SVM settings explored by the search
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Feature extractor (raw token counts) and base estimator
vectorizer = CountVectorizer()
classifier = SVC()

# Vocabulary is learnt on the training texts, then reused for the test texts
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# 5-fold cross-validated search, scored by accuracy
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Held-out accuracy of the search result
predictions = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)

Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy: 0.3181818181818182


In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
import pandas as pd
import nltk
from nltk.corpus import stopwords
# stopwords.words() raises LookupError when the NLTK 'stopwords' corpus is not
# installed; download it once and retry so the cell also runs on a fresh machine.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Load the dataset: column 'text' holds the utterance, 'label' the intent.
df = pd.read_excel(r'D:\DA\Innotrat LABS\Intent or Context Analysis\new_generated_context.xlsx')
texts = df['text']
labels = df['label']

# Lower-case both texts and labels so differently-cased variants collapse together.
texts = texts.str.lower()
labels = labels.str.lower()

# Drop English stop words (whitespace tokenisation; punctuation stays attached to words).
texts = texts.apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# Hold out 20% of the rows for the final accuracy check (fixed seed keeps the split repeatable).
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Define the tokenizer function
def tokenizer(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``.

    Passed to ``TfidfVectorizer`` as its ``tokenizer`` callable.
    """
    tokens = word_tokenize(text)
    return tokens

# Define the pipeline
# Vectorizer and classifier live in one Pipeline, so every cross-validation fold
# re-fits the TF-IDF vocabulary on its own training part only.
# token_pattern=None: a custom tokenizer makes the default token_pattern unused, and
# scikit-learn warns about that on every fit unless the pattern is explicitly disabled.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)),
    ('classifier', SVC())
])

# Hyperparameter grid; the '<step>__<param>' keys address the pipeline steps by name.
param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto']
}

# 5-fold cross-validated search over the whole pipeline, scored by accuracy.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# The pipeline takes raw text, so the held-out texts are passed in unvectorized.
predictions = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)


Best Parameters: {'classifier__C': 1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear', 'vectorizer__ngram_range': (1, 1)}
Accuracy: 0.45454545454545453


In [2]:
# Sample query about quality handling / CAPA processes.
new_text = (
    'How do you handle quality issues or non-conforming PCBs during the manufacturing process? '
    'Can you share your corrective action and preventive action (CAPA) processes '
    'for maintaining high-quality standards?'
)
# The best pipeline accepts raw text directly; no separate vectorizing step is needed.
best_model = grid_search.best_estimator_
predicted_label = best_model.predict([new_text])
print("Predicted Label:", predicted_label)


Predicted Label: ['how to fabricate']


In [3]:
# Sample query asking for a quotation and a manufacturing timeline.
new_text_2 = (
    'I am interested in manufacturing PCBs for my project. '
    'Attached are the gerber and design files. '
    'Kindly provide a quotation for approximately 100 units. '
    'Also, please let me know the estimated manufacturing timeline'
)
# The best pipeline accepts raw text directly; no separate vectorizing step is needed.
best_model = grid_search.best_estimator_
predicted_label = best_model.predict([new_text_2])
print("Predicted Label:", predicted_label)

Predicted Label: ['who can help me fabricate']


In [11]:
# Short, subject-line style query.
new_text_3 = "Request for PCB Manufacturing - Urgent Project"
# The best pipeline accepts raw text directly; no separate vectorizing step is needed.
best_model = grid_search.best_estimator_
predicted_label = best_model.predict([new_text_3])
print("Predicted Label:", predicted_label)

Predicted Label: ['Who can help me fabricate']


In [12]:
# Sample query about an urgent PCB project.
new_text_4 = (
    'Hi Team, I need a PCB project which I need urgent delivery. '
    'I am sharing my details kindly let me know the next process'
)
# The best pipeline accepts raw text directly; no separate vectorizing step is needed.
best_model = grid_search.best_estimator_
predicted_label = best_model.predict([new_text_4])
print("Predicted Label:", predicted_label)

Predicted Label: ['Who can help me fabricate']


In [13]:
# Sample order-status query.
new_text_5 = (
    'Hi Team, My order number is 1234. '
    'I want to know the status of my order'
)
# The best pipeline accepts raw text directly; no separate vectorizing step is needed.
best_model = grid_search.best_estimator_
predicted_label = best_model.predict([new_text_5])
print("Predicted Label:", predicted_label)

Predicted Label: ['Who can help me fabricate']
