In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Load employee dataset
employee_data = pd.read_csv('/content/UpdatedResumeDataSet.csv')

# Load KSA dataset
ksa_data = pd.read_csv('/content/ksaNICE.tsv', delimiter='\t')

# Preprocess employee data
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize and remove stop words
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_tokens)

employee_data['processed_resume'] = employee_data['Resume'].apply(preprocess_text)

# Preprocess KSA data
ksa_data['processed_ksa'] = ksa_data['Knowledge'].apply(preprocess_text)

# Group KSA data by Role ID and aggregate KSA descriptions
grouped_ksa_data = ksa_data.groupby('Role')['processed_ksa'].agg(lambda x: ' '.join(x)).reset_index()

# Vectorize employee resumes and consolidated KSA descriptions
vectorizer = TfidfVectorizer()
employee_matrix = vectorizer.fit_transform(employee_data['processed_resume'])
ksa_matrix = vectorizer.transform(grouped_ksa_data['processed_ksa'])

# Calculate cosine similarity between employee resumes and consolidated KSA descriptions
cosine_similarities = cosine_similarity(employee_matrix, ksa_matrix)

# Find the most suited role for each employee
employee_data['suited_role_id'] = cosine_similarities.argmax(axis=1)
employee_data['suited_role_description'] = grouped_ksa_data.loc[employee_data['suited_role_id'], 'processed_ksa'].values

# Display results
print(employee_data[['Category', 'suited_role_id', 'suited_role_description']])

employee_data[['Category', 'suited_role_id', 'suited_role_description']].to_csv('/content/suited_roles_results.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


         Category  suited_role_id  \
0    Data Science               1   
1    Data Science               1   
2    Data Science               1   
3    Data Science               1   
4    Data Science               1   
..            ...             ...   
957       Testing              13   
958       Testing              13   
959       Testing              13   
960       Testing              12   
961       Testing              13   

                               suited_role_description  
0    knowledge computer networking concepts protoco...  
1    knowledge computer networking concepts protoco...  
2    knowledge computer networking concepts protoco...  
3    knowledge computer networking concepts protoco...  
4    knowledge computer networking concepts protoco...  
..                                                 ...  
957  knowledge risk management processes methods as...  
958  knowledge risk management processes methods as...  
959  knowledge risk management processes m

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Load datasets
employee_data = pd.read_csv('/content/UpdatedResumeDataSet.csv')
ksa_data = pd.read_csv('/content/ksaNICE.tsv', delimiter='\t')

# Preprocess employee data
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize and remove stop words
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_tokens)

employee_data['processed_resume'] = employee_data['Resume'].apply(preprocess_text)

# Preprocess KSA data
ksa_data['processed_ksa'] = ksa_data['Knowledge'].apply(preprocess_text)

# Group KSA data by Role ID and aggregate KSA descriptions
grouped_ksa_data = ksa_data.groupby('Role')['processed_ksa'].agg(lambda x: ' '.join(x)).reset_index()

# Vectorize employee resumes and consolidated KSA descriptions
vectorizer = TfidfVectorizer()
employee_matrix = vectorizer.fit_transform(employee_data['processed_resume'])
ksa_matrix = vectorizer.transform(grouped_ksa_data['processed_ksa'])

# Calculate cosine similarity between employee resumes and consolidated KSA descriptions
cosine_similarities = cosine_similarity(employee_matrix, ksa_matrix)

# Find the most suited role for each employee
employee_data['suited_role_id'] = cosine_similarities.argmax(axis=1)
employee_data['suited_role_description'] = grouped_ksa_data.loc[employee_data['suited_role_id'], 'processed_ksa'].values
employee_data['suited_role_id'] = grouped_ksa_data.loc[employee_data['suited_role_id'], 'Role'].values

# Save results to a CSV file
output_file_path = '/content/suited_roles_results.csv'
employee_data[['Category', 'suited_role_id', 'suited_role_description']].to_csv(output_file_path, index=False)

# Display confirmation message
print(f"Results saved to: {output_file_path}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Results saved to: /content/suited_roles_results.csv


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Load datasets
employee_data = pd.read_csv('/content/UpdatedResumeDataSet.csv')
ksa_data = pd.read_csv('/content/ksaNICE.tsv', delimiter='\t')

# Preprocess employee data
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize and remove stop words
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_tokens)

# Fill NaN values with an empty string in 'Resume' column
employee_data['Resume'] = employee_data['Resume'].fillna('')

# Preprocess 'Resume' column
employee_data['processed_resume'] = employee_data['Resume'].apply(preprocess_text)

# Fill NaN values with an empty string in 'Knowledge' column
ksa_data['Knowledge'] = ksa_data['Knowledge'].fillna('')

# Preprocess 'Knowledge' column
ksa_data['processed_ksa'] = ksa_data['Knowledge'].apply(preprocess_text)

# Prepare features and labels
X = employee_data['processed_resume'] + ' ' + ksa_data['processed_ksa']
y = employee_data['Category']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the textual data
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Support Vector Machine classifier
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train_vectorized, y_train)

# Make predictions on the test set
predictions = clf.predict(X_test_vectorized)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report(y_test, predictions))

# Save results to a CSV file
output_file_path = '/content/classification_results.csv'
results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
results.to_csv(output_file_path, index=False)

# Display confirmation message
print(f"Results saved to: {output_file_path}")
