In [1]:
# importing libraries

import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
# Download the NLTK stop-word corpus (nltk reports "already up-to-date" when it is present)
nltk.download('stopwords')
# Keep the English stop words in a set; preprocess_text tests every token for membership in it
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load dataset.
# The CSV location can be overridden through the DATASET_CSV environment variable so the
# notebook is runnable on other machines; the original local path stays the default.
DATASET_CSV = os.environ.get(
    "DATASET_CSV",
    r"C:\Users\Dell\Desktop\Artificial Inteligence\pythonProject\dataset.csv",
)
df = pd.read_csv(DATASET_CSV)

In [4]:
# Function for text preprocessing
def preprocess_text(text, custom_stop_words=None):
    """Normalise one document before TF-IDF vectorisation.

    Steps, in order: lower-case, drop standalone tokens of one or two word
    characters, drop digit runs, drop punctuation, then drop stop words.

    Parameters
    ----------
    text : str or scalar
        Raw document. ``None``/NaN (for example an empty CSV cell) yields ``""``;
        any other non-string scalar is converted with ``str()`` first.
    custom_stop_words : collection of str, optional
        Words to filter out. When omitted, the notebook-level ``stop_words``
        set is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # Missing cells arrive from pandas as NaN/None; calling .lower() on them
        # used to raise AttributeError.
        if text is None or pd.isna(text):
            return ""
        text = str(text)

    words_to_drop = stop_words if custom_stop_words is None else custom_stop_words

    text = text.lower()
    text = re.sub(r'\b\w{1,2}\b', '', text)  # standalone 1-2 character tokens
    text = re.sub(r'\d+', '', text)          # digit runs, also inside longer tokens
    text = re.sub(r'[^\w\s]', '', text)      # punctuation; "_" counts as \w and is kept
    return ' '.join(word for word in text.split() if word not in words_to_drop)

In [5]:
# Apply preprocessing to text data.
# The cleaned strings overwrite the "text" column, so the raw text is no longer held in df afterwards.
df["text"] = df["text"].apply(preprocess_text)

In [6]:
# Encode labels as integers (overwrites the "label" column).
# `le` is kept around: le.classes_ is used later to turn predicted integers back into class names.
le = LabelEncoder()
df["label"] = le.fit_transform(df["label"])

In [7]:
# Split data into training and testing sets: 20% is held out for testing,
# and random_state is fixed so the same split is produced on every run.
text_train, text_test, target_train, target_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

In [8]:
# Coerce every document to str and materialise both splits as plain Python lists
text_train = list(map(str, text_train))
text_test = list(map(str, text_test))

# Model training

## XGBoost

In [9]:
# Define a pipeline with TfidfVectorizer and XGBoost.
# `use_label_encoder` is no longer passed: the installed XGBoost reports it as unused
# ('Parameters: { "use_label_encoder" } are not used.'), and the labels are already
# integer-encoded with LabelEncoder before training.
pipeline_XGB = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.75, ngram_range=(1, 2))),
    ('classifier', XGBClassifier(eval_metric='mlogloss'))
])

In [10]:
# Hyper-parameter grid searched for the XGBoost pipeline.
# Keys follow the "<pipeline step>__<parameter>" naming used by the pipeline steps.
parameters_XGB = dict(
    vectorizer__max_df=[0.5, 0.75, 1.0],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    classifier__max_depth=[3, 5, 7],
    classifier__n_estimators=[50, 100, 200],
    classifier__learning_rate=[0.01, 0.1, 0.2],
)

In [11]:
# 5-fold cross-validated search over every combination in parameters_XGB
grid_search_XGB = GridSearchCV(
    estimator=pipeline_XGB,
    param_grid=parameters_XGB,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_XGB.fit(text_train, target_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


Parameters: { "use_label_encoder" } are not used.



In [12]:
# Best model from grid search: keep the XGBoost pipeline that GridSearchCV selected
best_model_XGB = grid_search_XGB.best_estimator_

In [13]:
# Predictions on both splits, so training and test accuracy can be compared
predictions_train_XGB = best_model_XGB.predict(text_train)
predictions_test_XGB = best_model_XGB.predict(text_test)

In [14]:
# Accuracy on the training split (the data the XGBoost pipeline was fitted on)
accuracy_train_XGB = accuracy_score(target_train, predictions_train_XGB)
print("Accuracy:", accuracy_train_XGB)

Accuracy: 1.0


In [15]:
# Accuracy on the held-out test split for the XGBoost pipeline
accuracy_test_XGB = accuracy_score(target_test, predictions_test_XGB)
print("Accuracy:", accuracy_test_XGB)

Accuracy: 0.97


In [16]:
# Accumulates one [model name, training accuracy %, testing accuracy %] row per model.
result = []


def Final_result(model_name, acc_score_training, acc_score_testing):
    """Record a model's accuracies and return the running comparison table.

    Parameters
    ----------
    model_name : str
        Label shown in the "Model Name" column; also the key used to detect
        a model that was already recorded.
    acc_score_training, acc_score_testing : float
        Accuracies as fractions; they are stored multiplied by 100.

    Returns
    -------
    pandas.DataFrame
        One row per recorded model, in first-recorded order. A table is
        returned on every call, including calls that repeat a model name.

    Notes
    -----
    The earlier membership test compared ``model_name`` with the stored row
    lists, which never matched, so re-running a cell appended a duplicate row.
    A repeated name now replaces that model's row with the latest scores.
    Column names are kept exactly as before (including the "accurary"
    spelling) so code that selects columns by name keeps working.
    """
    row = [model_name, acc_score_training * 100, acc_score_testing * 100]
    recorded_names = [existing[0] for existing in result]
    if model_name in recorded_names:
        result[recorded_names.index(model_name)] = row
    else:
        result.append(row)
    return pd.DataFrame(
        result,
        columns=['Model Name', 'accurary score for training', 'accuracy score for testing'],
    )

In [17]:
# Add the XGBoost scores to the comparison table and display it
Final_result(
    model_name="XGBoost",
    acc_score_training=accuracy_train_XGB,
    acc_score_testing=accuracy_test_XGB,
)

Unnamed: 0,Model Name,accurary score for training,accuracy score for testing
0,XGBoost,100.0,97.0


## Random Forest

In [18]:
# TF-IDF features feeding a Random Forest classifier (seeded with random_state=42)
pipeline_RF = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_df=0.75)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [19]:
# Hyper-parameter grid searched for the Random Forest pipeline.
# Keys follow the "<pipeline step>__<parameter>" naming used by the pipeline steps.
parameters_RF = dict(
    vectorizer__max_df=[0.5, 0.75, 1.0],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5, 10],
)

In [20]:
# 5-fold cross-validated search over every combination in parameters_RF
grid_search_RF = GridSearchCV(
    estimator=pipeline_RF,
    param_grid=parameters_RF,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_RF.fit(text_train, target_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [21]:
# Best model from grid search: keep the Random Forest pipeline that GridSearchCV selected
best_model_RF = grid_search_RF.best_estimator_

In [22]:
# Predictions on both splits, so training and test accuracy can be compared
predictions_train_RF = best_model_RF.predict(text_train)
predictions_test_RF = best_model_RF.predict(text_test)

In [23]:
# Accuracy on the training split (the data the Random Forest pipeline was fitted on)
accuracy_train_RF = accuracy_score(target_train, predictions_train_RF)
print("Accuracy:", accuracy_train_RF)

Accuracy: 1.0


In [24]:
# Accuracy on the held-out test split for the Random Forest pipeline
accuracy_test_RF = accuracy_score(target_test, predictions_test_RF)
print("Accuracy:", accuracy_test_RF)

Accuracy: 0.985


In [25]:
# Add the Random Forest scores to the comparison table and display it
Final_result(
    model_name="Random Forest",
    acc_score_training=accuracy_train_RF,
    acc_score_testing=accuracy_test_RF,
)

Unnamed: 0,Model Name,accurary score for training,accuracy score for testing
0,XGBoost,100.0,97.0
1,Random Forest,100.0,98.5


## Support Vector Machine

In [26]:
# TF-IDF features feeding a linear-kernel support vector classifier
pipeline_SVM = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_df=0.75)),
    ('classifier', SVC(C=1.0, kernel='linear')),
])

In [27]:
# Hyper-parameter grid searched for the SVM pipeline.
# Keys follow the "<pipeline step>__<parameter>" naming used by the pipeline steps.
parameters_SVC = dict(
    vectorizer__max_df=[0.5, 0.75, 1.0],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    classifier__C=[0.1, 1, 10],
)

In [28]:
# 5-fold cross-validated search over every combination in parameters_SVC
grid_search_SVM = GridSearchCV(
    estimator=pipeline_SVM,
    param_grid=parameters_SVC,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_SVM.fit(text_train, target_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [29]:
# Best model from grid search: keep the SVM pipeline that GridSearchCV selected
best_model_SVM = grid_search_SVM.best_estimator_

In [30]:
# Predictions on both splits, so training and test accuracy can be compared
predictions_train_SVM = best_model_SVM.predict(text_train)
predictions_test_SVM = best_model_SVM.predict(text_test)

In [31]:
# Accuracy on the training split (the data the SVM pipeline was fitted on)
accuracy_train_SVM = accuracy_score(target_train, predictions_train_SVM)
print("Accuracy:", accuracy_train_SVM)

Accuracy: 1.0


In [32]:
# Accuracy on the held-out test split for the SVM pipeline
accuracy_test_SVM = accuracy_score(target_test, predictions_test_SVM)
print("Accuracy:", accuracy_test_SVM)

Accuracy: 0.99


In [33]:
# Add the SVM scores to the comparison table and display it
Final_result(
    model_name="Suport Vector machine",
    acc_score_training=accuracy_train_SVM,
    acc_score_testing=accuracy_test_SVM,
)

Unnamed: 0,Model Name,accurary score for training,accuracy score for testing
0,XGBoost,100.0,97.0
1,Random Forest,100.0,98.5
2,Suport Vector machine,100.0,99.0


## Multinomial Naive Bayes

In [34]:
# TF-IDF features feeding a Multinomial Naive Bayes classifier
pipeline_NB = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_df=0.75)),
    ('classifier', MultinomialNB()),
])

In [35]:
# Hyper-parameter grid searched for the Naive Bayes pipeline.
# Keys follow the "<pipeline step>__<parameter>" naming used by the pipeline steps.
parameters_NB = dict(
    vectorizer__max_df=[0.5, 0.75, 1.0],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    classifier__alpha=[0.01, 0.1, 1.0],
)

In [36]:
# 5-fold cross-validated search over every combination in parameters_NB
grid_search_NB = GridSearchCV(
    estimator=pipeline_NB,
    param_grid=parameters_NB,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_NB.fit(text_train, target_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [37]:
# Best model from grid search: keep the Naive Bayes pipeline that GridSearchCV selected.
# This is the model used by the sample classifications further down.
best_model_NB = grid_search_NB.best_estimator_

In [38]:
# Predictions on both splits, so training and test accuracy can be compared
predictions_train_NB = best_model_NB.predict(text_train)
predictions_test_NB = best_model_NB.predict(text_test)

In [39]:
# Accuracy on the training split (the data the Naive Bayes pipeline was fitted on)
accuracy_train_NB = accuracy_score(target_train, predictions_train_NB)
print("Accuracy:", accuracy_train_NB)

Accuracy: 1.0


In [40]:
# Accuracy on the held-out test split for the Naive Bayes pipeline
accuracy_test_NB = accuracy_score(target_test, predictions_test_NB)
print("Accuracy:", accuracy_test_NB)

Accuracy: 0.99


In [41]:
# Add the Naive Bayes scores to the comparison table and display it
Final_result(
    model_name="Multinomial Naive Bayes",
    acc_score_training=accuracy_train_NB,
    acc_score_testing=accuracy_test_NB,
)

Unnamed: 0,Model Name,accurary score for training,accuracy score for testing
0,XGBoost,100.0,97.0
1,Random Forest,100.0,98.5
2,Suport Vector machine,100.0,99.0
3,Multinomial Naive Bayes,100.0,99.0


# User-defined function for classifying new text

In [42]:
# Function to classify new text
def classify_text(new_text, model):
    """Predict the encoded class of one raw text.

    The text is cleaned with ``preprocess_text`` and passed to
    ``model.predict`` as a one-element batch; the single prediction is
    returned as produced by the model (an encoded label, not a class name).
    """
    cleaned_batch = [preprocess_text(new_text)]
    return model.predict(cleaned_batch)[0]

In [43]:
# Lookup from encoded integer label back to the original class name
label_map = dict(enumerate(le.classes_))

In [45]:
# Classify new texts with the tuned Naive Bayes pipeline; label_map turns the
# predicted integer back into its class name for printing.
new_text1 = "Our agency excels in delivering real business value through data science"
prediction1 = classify_text(new_text1, best_model_NB)
print("Prediction for new_text1:", label_map[prediction1])

Prediction for new_text1: Agencies


In [47]:
# Sample 2: a question asking for conference/meetup recommendations
new_text2 = "Any recommendations for top data science conferences or meetups?"
prediction2 = classify_text(new_text2, best_model_NB)
print("Prediction for new_text2:", label_map[prediction2])

Prediction for new_text2: Students


In [49]:
# Sample 3: a first-person project update
new_text3 = "Just started working on a project involving data science for predictive maintenance. The insights are promising!"
prediction3 = classify_text(new_text3, best_model_NB)
print("Prediction for new_text3:", label_map[prediction3])

Prediction for new_text3: Freelancers


In [51]:
# Sample 4: a request for course recommendations
new_text4 = "Looking for advice on the best online data science courses that offer hands-on projects. Any recommendations?"
prediction4 = classify_text(new_text4, best_model_NB)
print("Prediction for new_text4:", label_map[prediction4])

Prediction for new_text4: Students


In [53]:
# Sample 5: promotional copy written from a firm's point of view
new_text5 = "Our data science firm specializes in providing actionable insights to drive your business forward. Contact us today!"
prediction5 = classify_text(new_text5, best_model_NB)
print("Prediction for new_text5:", label_map[prediction5])

Prediction for new_text5: Agencies


In [55]:
# Sample 6: feedback on a certification programme
new_text6 = "Taking a data science certification and the real-world case studies included are incredibly insightful."
prediction6 = classify_text(new_text6, best_model_NB)
print("Prediction for new_text6:", label_map[prediction6])

Prediction for new_text6: Courses


In [66]:
# Sample 7: a sentence that never mentions data science
# NOTE(review): the trailing fragment "they are best in the business" was left as a comment —
# presumably an alternative wording for this sample; confirm or remove.
new_text7 = "our highly trained staff, they are the so great they can do anything"  # they are best in the business
prediction7 = classify_text(new_text7, best_model_NB)
print("Prediction for new_text7:", label_map[prediction7])

Prediction for new_text7: Courses


In [67]:
# Sample 8: feedback on a course's hands-on approach
new_text8 = "The data science course's emphasis on hands-on learning has been crucial in developing my skills."
prediction8 = classify_text(new_text8, best_model_NB)
print("Prediction for new_text8:", label_map[prediction8])

Prediction for new_text8: Courses


In [68]:
# Sample 9: a report on a finished modelling task
new_text9 = "Developed a model for customer churn prediction. The results have been very insightful!"
prediction9 = classify_text(new_text9, best_model_NB)
print("Prediction for new_text9:", label_map[prediction9])

Prediction for new_text9: Freelancers


In [69]:
# Sample 10: an informal, lower-case question about where to learn data science
new_text10 = "i want to gain knowledge about data science, what is the best place to go"
prediction10 = classify_text(new_text10, best_model_NB)
print("Prediction for new_text10:", label_map[prediction10])

Prediction for new_text10: Students


In [70]:
# Sample 11: an informal, lower-case statement about having built a text classifier
new_text11 = "i have developed a model, that classify text using ml algorithms"
prediction11 = classify_text(new_text11, best_model_NB)
print("Prediction for new_text11:", label_map[prediction11])

Prediction for new_text11: Freelancers
