In [None]:
!pip install sentence-transformers scikit-learn


Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/156.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.5.1


In [None]:
!pip install nltk scikit-learn pandas




In [None]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used by the tokenizer, stop-word filter and
# lemmatizer in the preprocessing function below.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Read the course learning outcomes (CLOs) and the question bank.
clos_df = pd.read_csv('NLP_Course_CLOs.csv')
questions_df = pd.read_csv('NLP_100_Questions.csv')

# Text preprocessing.
# The stop-word set and the lemmatizer are built once and reused by every call.
# Previously the stop-word list was re-read and scanned linearly for each token.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Clean, tokenize, normalize and lemmatize a piece of text.

    Steps, in order: remove URLs, remove punctuation, remove digits, tokenize
    with NLTK, lowercase, drop English stop words, lemmatize with WordNet.

    Args:
        text: The raw text. Any value that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV cell) is treated as empty
            instead of raising inside ``re.sub``.

    Returns:
        The remaining lemmas joined by single spaces, or ``''`` when the input
        is not a string or nothing survives the filtering.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # URLs
    text = re.sub(r'[^\w\s]', '', text)  # punctuation
    text = re.sub(r'\d+', '', text)  # digits

    lowered = (token.lower() for token in nltk.word_tokenize(text))
    lemmas = (_LEMMATIZER.lemmatize(token) for token in lowered if token not in _STOP_WORDS)
    return ' '.join(lemmas)

# Add a cleaned-text column to each frame by passing every row through preprocess_text.
clos_df['Processed_Description'] = [preprocess_text(raw) for raw in clos_df['Description']]
questions_df['Processed_Question'] = [preprocess_text(raw) for raw in questions_df['Question']]

# Fit TF-IDF on the processed CLO descriptions; the fitted vectorizer is reused later.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(clos_df['Processed_Description'])

# Preview both frames and report the size of the CLO TF-IDF matrix.
clos_preview = clos_df.head()
questions_preview = questions_df.head()
print("CLOs DataFrame with Processed Descriptions:\n", clos_preview)
print("\nQuestions DataFrame with Processed Questions:\n", questions_preview)
print("\nTF-IDF Matrix Shape:", tfidf_matrix.shape)


CLOs DataFrame with Processed Descriptions:
                                                  CLO  \
0  CLO1: Understand the Foundations of Natural La...   
1              CLO2: Text Preprocessing and Cleaning   
2        CLO3: Language Modeling and Text Generation   
3     CLO4: Word Embeddings and Text Representations   
4     CLO5: Machine Translation and Multilingual NLP   

                                         Description  \
0  Grasp the basic concepts, history, and develop...   
1  Learn techniques for preprocessing text data, ...   
2  Understand the principles of language modeling...   
3  Gain knowledge on representing text as vectors...   
4  Learn about the algorithms and models used for...   

                               Processed_Description  
0  grasp basic concept history development nlp un...  
1  learn technique preprocessing text data includ...  
2  understand principle language modeling learn b...  
3  gain knowledge representing text vector using ...  
4  lea

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Project the questions into the TF-IDF space fitted on the CLO descriptions.
# `transform` (not `fit_transform`) keeps the vocabulary identical to `tfidf_matrix`.
questions_tfidf = vectorizer.transform(questions_df['Processed_Question'])

# similarity_scores[i, j] is the cosine similarity between question i and CLO j.
similarity_scores = cosine_similarity(questions_tfidf, tfidf_matrix)

# Best-matching CLO per question (row position in clos_df) and how strong that match is.
best_clo_positions = similarity_scores.argmax(axis=1)
questions_df['CLO_Similarity'] = similarity_scores.max(axis=1)

# Positional lookup (.iloc) keeps the mapping correct even if clos_df does not carry
# a default 0..n-1 index; .to_numpy() avoids index alignment on assignment.
questions_df['Most_Relevant_CLO'] = clos_df['CLO'].iloc[best_clo_positions].to_numpy()

# A question that shares no vocabulary with any CLO has an all-zero similarity row,
# and argmax then falls back to position 0 (the first CLO). Report these cases
# instead of letting them pass as genuine matches.
unmatched_count = int((questions_df['CLO_Similarity'] == 0).sum())
if unmatched_count:
    print(f"Warning: {unmatched_count} question(s) share no terms with any CLO; "
          "their 'Most_Relevant_CLO' defaults to the first CLO.")

print("Questions DataFrame with Matched CLOs:\n", questions_df[['Question', 'Most_Relevant_CLO']])


Questions DataFrame with Matched CLOs:
                                              Question  \
0          What is Natural Language Processing (NLP)?   
1        Describe the process of tokenization in NLP.   
2       What are stop words and why are they removed?   
3   Explain the concept of stemming and lemmatizat...   
4                           What is a language model?   
..                                                ...   
95     What are the components of a dialogue system??   
96        How do chatbots understand human language??   
97  What ethical considerations are important in N...   
98     How can bias in language models be mitigated??   
99    Define the role of privacy in NLP applications?   

                                    Most_Relevant_CLO  
0   CLO1: Understand the Foundations of Natural La...  
1               CLO2: Text Preprocessing and Cleaning  
2   CLO1: Understand the Foundations of Natural La...  
3               CLO2: Text Preprocessing and Cleani

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Baseline: logistic regression on the TF-IDF question vectors.

# Derive the labels from the TF-IDF similarity matrix. The positions are kept in a
# local variable so the existing 'Most_Relevant_CLO' column (CLO names) is not
# overwritten with integers, which made its dtype depend on which cells had run.
best_clo_positions = similarity_scores.argmax(axis=1)
questions_df['Most_Relevant_CLO_Name'] = clos_df['CLO'].iloc[best_clo_positions].to_numpy()

# Encode the CLO names as integer class labels for model training.
label_encoder = LabelEncoder()
questions_df['Most_Relevant_CLO_Index'] = label_encoder.fit_transform(questions_df['Most_Relevant_CLO_Name'])

# Hold out 20% of the questions for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    questions_tfidf,
    questions_df['Most_Relevant_CLO_Index'],
    test_size=0.2,
    random_state=42,
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# NOTE: the labels are the argmax of a similarity computed from these same TF-IDF
# vectors, so this score measures how well the classifier reproduces that heuristic,
# not how well questions are matched to CLOs against human-verified labels.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")


Model Accuracy: 1.0


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Sentence-embedding model used to turn text into dense vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the preprocessed CLO descriptions and the preprocessed questions.
clo_texts = clos_df['Processed_Description'].tolist()
question_texts = questions_df['Processed_Question'].tolist()
clos_embeddings = model.encode(clo_texts)
questions_embeddings = model.encode(question_texts)

# Encode the 'Most_Relevant_CLO' column as integer class labels.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(questions_df['Most_Relevant_CLO'])
questions_df['Most_Relevant_CLO_Index'] = encoded_labels

# 80/20 train/test split over the question embeddings.
X_train, X_test, y_train, y_test = train_test_split(
    questions_embeddings,
    questions_df['Most_Relevant_CLO_Index'],
    test_size=0.2,
    random_state=42,
)

# Fit a linear-kernel SVM and score it on the held-out questions.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Model Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SVM Model Accuracy: 0.95
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         4
           4       0.50      1.00      0.67         1
           5       1.00      1.00      1.00         4
           7       1.00      1.00      1.00         2

    accuracy                           0.95        20
   macro avg       0.92      0.98      0.93        20
weighted avg       0.97      0.95      0.96        20



In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate on every question: embeddings as features, encoded CLO labels as targets.
X = questions_embeddings
y = questions_df['Most_Relevant_CLO_Index']

# Fresh linear SVM evaluated with 5-fold cross-validation.
svm_model = SVC(kernel='linear')
cv_scores = cross_val_score(estimator=svm_model, X=X, y=y, cv=5)

print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Cross-Validation Accuracy Scores: [1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the SVM. `gamma` only affects the 'rbf' and 'poly'
# kernels, so the linear kernel gets its own sub-grid; a single flat grid would
# fit every linear candidate once per gamma value for identical results.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf', 'poly'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']},
]

# verbose=1 prints a single summary line instead of one log line per fit.
grid_search = GridSearchCV(SVC(), param_grid, cv=5, verbose=1)

# Search over the full dataset (X, y come from the cross-validation cell).
grid_search.fit(X, y)

print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ....................C=0.1, gamma=scale

In [None]:
# Experiment with an alternative sentence encoder. It is bound to its own name on
# purpose: rebinding `model` here would make later `model.encode(...)` calls produce
# embeddings from a different encoder than the one the classifiers were trained on.
paraphrase_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate new embeddings for the questions with the alternative encoder
new_embeddings = paraphrase_model.encode(questions_df['Processed_Question'].tolist())

# Continue with model training and evaluation using these new embeddings


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, fitted on the current training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the forest on the held-out split.
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Model Accuracy: {rf_accuracy}")


Random Forest Model Accuracy: 0.95


In [None]:
from sklearn.svm import SVC

# Fit a linear SVM on the training split (`X_train`, `y_train`).
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# `test_questions_embeddings` is only created in the final cell, so predicting on it
# here raised NameError on a fresh "Restart & Run All". Sanity-check the fitted model
# on the held-out split instead; the external test set is scored in the final cell.
holdout_predictions = svm_model.predict(X_test)


In [None]:
from joblib import dump, load

# Save the fitted model
dump(svm_model, 'svm_model.joblib')

# Later or in a different session, load the model instead of retraining.
# NOTE: joblib files are pickle-based; only load files you created or trust.
svm_model = load('svm_model.joblib')

# Round-trip check on the held-out split. `test_questions_embeddings` does not exist
# until the final cell, so it cannot be used here on a fresh run.
reloaded_predictions = svm_model.predict(X_test)


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder

# Requires from earlier cells: preprocess_text, svm_model (trained on
# 'all-MiniLM-L6-v2' question embeddings), clos_df and questions_df.

# Load and preprocess the test questions exactly like the training questions.
test_df = pd.read_csv('advanced_nlp_questions.csv')
test_df['Processed_Question'] = test_df['Question'].apply(preprocess_text)

# Encode with the same sentence-transformer the SVM was trained on. The global
# `model` is not used here because an earlier experiment cell may have rebound it to
# a different encoder, which would silently produce incompatible embeddings.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
test_questions_embeddings = embedding_model.encode(test_df['Processed_Question'].tolist())

# Predict encoded class labels for the test questions.
test_predictions = svm_model.predict(test_questions_embeddings)
test_df['Predicted_CLO_Index'] = test_predictions

# Decode with an encoder fitted on the same column the SVM's labels were encoded
# from. Fitting on clos_df['CLO'] instead assigns indices by sorted CLO name over all
# CLOs, which need not agree with the training encoding.
training_labels = questions_df['Most_Relevant_CLO']
label_encoder = LabelEncoder().fit(training_labels)
decoded_labels = label_encoder.inverse_transform(test_predictions)
if pd.api.types.is_integer_dtype(training_labels):
    # The training labels were row positions in clos_df: map them to CLO names.
    test_df['Predicted_CLO'] = clos_df['CLO'].iloc[decoded_labels].to_numpy()
else:
    # The training labels were already CLO names.
    test_df['Predicted_CLO'] = decoded_labels

# Attach the description of each predicted CLO (the 'Description' column; the
# previous lookup returned the 'CLO' name itself). The first row wins on duplicates.
clo_descriptions = clos_df.drop_duplicates('CLO').set_index('CLO')['Description']
test_df['Matching_CLO_Description'] = test_df['Predicted_CLO'].map(clo_descriptions)

# Save the predictions and matching CLO descriptions to a new CSV file.
test_df.to_csv('output.csv', index=False)
