In [1]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
# Make sure the NLTK stop-word corpus is available, then keep the English
# list as a set so membership checks during preprocessing are cheap.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load dataset
# The CSV location can be overridden through the DATASET_CSV environment
# variable so the notebook is runnable on machines where the original
# author's absolute path does not exist; without the variable the original
# path is used unchanged.
DEFAULT_DATASET_CSV = r"C:\Users\Dell\Desktop\Artificial Inteligence\pythonProject\dataset.csv"
dataset_path = os.environ.get("DATASET_CSV", DEFAULT_DATASET_CSV)
df = pd.read_csv(dataset_path)

In [4]:
# Function for text preprocessing
def preprocess_text(text, stopword_set=None):
    """Normalise one raw document into a space-separated string of tokens.

    Steps, applied in this order: lower-case the text, delete words of one or
    two word-characters, delete digit runs, delete every character that is
    neither a word character nor whitespace, and finally drop stop words.

    Parameters
    ----------
    text : str or any
        Raw document. ``None`` and float NaN are treated as an empty
        document; any other non-string value is converted with ``str()``.
    stopword_set : collection of str, optional
        Words to discard after cleaning. When omitted, the module-level
        ``stop_words`` set is used.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives.
    """
    # Guard against non-string cells: calling .lower() on None / NaN / numbers
    # would raise AttributeError before any later str() coercion could help.
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
            return ''
        text = str(text)

    # Resolved lazily so the global is only required when no set is passed in.
    words_to_drop = stop_words if stopword_set is None else stopword_set

    text = text.lower()
    # Short-word removal runs before digits/punctuation are stripped, so the
    # word boundaries seen here are those of the original (lower-cased) text.
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'\d+', '', text)
    # Punctuation is deleted, not replaced by a space: "real-world" -> "realworld".
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(word for word in text.split() if word not in words_to_drop)

In [5]:
# Clean every document in the "text" column with preprocess_text and store
# the result back in the same column.
cleaned_text = df["text"].apply(preprocess_text)
df["text"] = cleaned_text

In [6]:
# Replace the class names in "label" with integer codes; the fitted encoder
# is kept in `le` so the codes can be mapped back to names later.
le = LabelEncoder()
le.fit(df["label"])
df["label"] = le.transform(df["label"])

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
text_train, text_test, target_train, target_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Materialise both text splits as plain lists whose items are all str
text_train = list(map(str, text_train))
text_test = list(map(str, text_test))

In [9]:
# Two-step pipeline: TF-IDF features (unigrams + bigrams) feeding a
# linear-kernel SVM. The step names are the prefixes used by the search grid.
tfidf_step = ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_df=0.75))
svm_step = ('classifier', SVC(C=1.0, kernel='linear'))
pipeline = Pipeline(steps=[tfidf_step, svm_step])

In [10]:
# Hyper-parameter grid for the search: 3 x 2 x 3 = 18 combinations.
# Keys follow the "<step name>__<parameter>" convention of the pipeline.
parameters = dict(
    vectorizer__max_df=[0.5, 0.75, 1.0],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    classifier__C=[0.1, 1, 10],
)

In [11]:
# 5-fold cross-validated search over `parameters`, fitted on the training split
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(text_train, target_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [12]:
# Best model from grid search: keep the estimator the search selected so the
# following cells can predict with it directly.
best_model = grid_search.best_estimator_

In [13]:
# Predict labels for the training split first, then the test split
predictions_train, predictions_test = (
    best_model.predict(split) for split in (text_train, text_test)
)

In [14]:
# Accuracy on the training split
accuracy = accuracy_score(y_true=target_train, y_pred=predictions_train)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [15]:
# Accuracy on the held-out test split
accuracy = accuracy_score(y_true=target_test, y_pred=predictions_test)
print("Accuracy:", accuracy)

Accuracy: 0.99


In [16]:
# Function to classify new text
def classify_text(new_text, model):
    """Predict the label of a single raw text.

    The text is cleaned with ``preprocess_text`` and passed to
    ``model.predict`` as a one-element list.

    Parameters
    ----------
    new_text : str
        Raw, unprocessed text to classify.
    model : object
        Any object exposing ``predict(list_of_texts)``.

    Returns
    -------
    The first (and only) element of the model's prediction.
    """
    cleaned = preprocess_text(new_text)
    return model.predict([cleaned])[0]

In [17]:
# Lookup from encoded integer label (position in le.classes_) back to the
# original class label
label_map = dict(enumerate(le.classes_))

In [18]:
# Classify new texts: sample 1
new_text1 = "Our agency excels in delivering real business value through data science"
prediction1 = classify_text(model=best_model, new_text=new_text1)
print("Prediction for new_text1:", label_map[prediction1])

Prediction for new_text1: Agencies


In [19]:
# Classify new texts: sample 2
new_text2 = "Any recommendations for top data science conferences or meetups?"
prediction2 = classify_text(model=best_model, new_text=new_text2)
print("Prediction for new_text2:", label_map[prediction2])

Prediction for new_text2: Students


In [20]:
# Classify new texts: sample 3
new_text3 = "Just started working on a project involving data science for predictive maintenance. The insights are promising!"
prediction3 = classify_text(model=best_model, new_text=new_text3)
print("Prediction for new_text3:", label_map[prediction3])

Prediction for new_text3: Freelancers


In [21]:
# Classify new texts: sample 4
new_text4 = "Looking for advice on the best online data science courses that offer hands-on projects. Any recommendations?"
prediction4 = classify_text(model=best_model, new_text=new_text4)
print("Prediction for new_text4:", label_map[prediction4])

Prediction for new_text4: Students


In [22]:
# Classify new texts: sample 5
new_text5 = "Our data science firm specializes in providing actionable insights to drive your business forward. Contact us today!"
prediction5 = classify_text(model=best_model, new_text=new_text5)
print("Prediction for new_text5:", label_map[prediction5])

Prediction for new_text5: Agencies


In [23]:
# Classify new texts: sample 6
new_text6 = "Taking a data science certification and the real-world case studies included are incredibly insightful."
prediction6 = classify_text(model=best_model, new_text=new_text6)
print("Prediction for new_text6:", label_map[prediction6])

Prediction for new_text6: Courses
