In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK corpora required below: the stop-word lists and WordNet
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)


# Shared WordNet lemmatizer and English stop-word set used by preprocess_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is lowercased, punctuation is replaced by spaces, the result is
    split on whitespace, English stop words are dropped, and every remaining
    token is passed through the module-level WordNet lemmatizer.

    Args:
        text: The raw string to clean. ``None`` is treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when no
        token survives).
    """
    if text is None:
        return ''

    # Lowercasing
    text = text.lower()

    # Replace punctuation with a space rather than deleting it, so that words
    # separated only by punctuation ("python,django", "machine-learning",
    # "c/c++") remain separate tokens instead of fusing into a single one.
    text = re.sub(r'[^\w\s]', ' ', text)

    # Whitespace tokenization; split() also collapses the extra spaces
    # introduced by the substitution above.
    tokens = text.split()

    # Removing stop words and lemmatization.
    # NOTE(review): the lemmatizer also shortens some short tokens (observed
    # output shows "js" becoming "j") -- confirm this is acceptable.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Joining tokens back into a single string
    return ' '.join(tokens)


# Read the resume spreadsheet and replace missing cells with empty strings
resumes_df = pd.read_excel('/content/Resume Ranking Data set.xlsx')
resumes_df = resumes_df.fillna('')

# Fields that are merged into one free-text document per resume
columns_to_combine = [
    'name',
    'degree',
    'skills',
    'work_segment',
    'projects_segment',
    'objectives_segment',
    'misc_segment',
    'accomplishments_segment',
]

# Cast every field to str in one step so the row-wise join cannot hit a
# non-string value
resumes_df[columns_to_combine] = resumes_df[columns_to_combine].astype(str)

# Join the fields of each row with single spaces, then clean the merged text
resumes_df['combined_text'] = (
    resumes_df[columns_to_combine]
    .agg(' '.join, axis=1)
    .map(preprocess_text)
)


# Example posting that the resumes are scored against
job_description = """
Looking for a full stack developer proficient in Python, JavaScript, and Django.
Must have experience with machine learning and deep learning projects.
A degree in Computer Science or related field is preferred.
"""

# Clean the posting with the same pipeline that was applied to the resumes
job_description = preprocess_text(job_description)

# Learn the TF-IDF vocabulary and weights from the resumes only, then project
# the job description into that same vector space
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(resumes_df['combined_text'])
job_desc_tfidf = vectorizer.transform([job_description])

# One cosine score per resume; the 1 x n result is flattened to length n
cosine_similarities = cosine_similarity(job_desc_tfidf, tfidf_matrix).ravel()

# Attach the scores and order the resumes from best to worst match
resumes_df['similarity_score'] = cosine_similarities
ranked_resumes = resumes_df.sort_values('similarity_score', ascending=False)

# Show the ten best-matching resumes with their scores and cleaned text
top_10_resumes = ranked_resumes.iloc[:10]
print(top_10_resumes.loc[:, ['name', 'similarity_score', 'combined_text']])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                 name  similarity_score  \
101          react js          0.196599   
160     Deep Learning          0.191542   
63        Anand Singh          0.186904   
330     Deep Learning          0.185995   
486         Python          0.184338   
9    Apache Cassandra          0.183969   
62       Google Cloud          0.180753   
604         Python          0.174372   
622  Python Developer          0.174237   
189      Ekata Kumari          0.174127   

                                         combined_text  
101  react j btech b c programming assessment jquer...  
160  deep learning m btech bsc b physic deep learni...  
63   anand singh b msc m b m algorithm deep learnin...  
330  deep learning m btech vehicle maintenance cust...  
486  python btech assessment resume python ion tech...  
9    apache cassandra m btech b api training natura...  
62   google cloud b msc m b m algorithm deep learni...  
604  python m btech coursera completion fit assessm...  
622  python d