In [1]:
import pandas as pd

# load dataset
df = pd.read_csv("../data/data.csv")

# basic checks
df.head()


Unnamed: 0,Job Title,Description
0,Data Analyst,job overview were seeking a data analyst to tu...
1,Data Reporting Analyst,about wspc wspc is a cooperative of outstandin...
2,Data Analyst (Power BI/Python),data analyst power bipython employment type fu...
3,Data & Reporting Analyst,our company pharmerica overview the data repor...
4,Data Quality Analyst (Remote Opportunity),vetsez is seeking a data quality analyst teste...


In [4]:
print("Columns:", df.columns.tolist())
print("Number of rows:", len(df))


Columns: ['Job Title', 'Description']
Number of rows: 521


In [5]:
df['Description'].iloc[0]

'job overview were seeking a data analyst to turn reports from dispatch, safety, accounting, and maintenance into actionable insights and visual dashboards. youll help drive smarter decisions across our trucking operations by translating data into strategies that improve efficiency and performance. key responsibilities analyze data across departments to uncover trends and recommend improvements build and maintain dashboards to track kpis cost per mile, fuel efficiency, driver performance, etc. collaborate with team leads to identify reporting needs and deliver insights conduct etl processes to clean and prepare data present findings clearly via reports and visualizations requirements proficient in excel and power bi, tableau or similar tools strong data analysis and critical thinking skills experience with sql and etl processes preferred bonus familiarity with python or r excellent communication skillsable to simplify complex data for stakeholders understanding of logistics or trucking

In [6]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PRISHA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()                      # 1. lowercase
    text = re.sub(r'[^a-z\s]', '', text)     # 2. remove punctuation & numbers
    words = text.split()                     # 3. tokenize
    words = [w for w in words if w not in stop_words]  # 4. remove stopwords
    return " ".join(words)


In [8]:
df.head()

Unnamed: 0,Job Title,Description
0,Data Analyst,job overview were seeking a data analyst to tu...
1,Data Reporting Analyst,about wspc wspc is a cooperative of outstandin...
2,Data Analyst (Power BI/Python),data analyst power bipython employment type fu...
3,Data & Reporting Analyst,our company pharmerica overview the data repor...
4,Data Quality Analyst (Remote Opportunity),vetsez is seeking a data quality analyst teste...


In [10]:
df['clean_description'] = df['Description'].apply(clean_text)


In [12]:
#tf-idf vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1, 2)
)

tfidf_matrix = vectorizer.fit_transform(df['clean_description'])

tfidf_matrix.shape


(521, 1000)

In [13]:
#sample resume output
sample_resume = """
I am a computer science student with skills in Python, SQL, Excel,
data analysis, and basic data visualization.
"""

clean_resume = clean_text(sample_resume)
clean_resume

'computer science student skills python sql excel data analysis basic data visualization'

In [14]:
resume_vector = vectorizer.transform([clean_resume])

In [16]:
#compute similarity and rank checks
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = cosine_similarity(resume_vector, tfidf_matrix)

top_indices = similarity_scores[0].argsort()[-5:][::-1]

df.loc[top_indices, ['Job Title', 'Description']]

#finds top 5 most perfect jobs acc to the resume uploaded based on similarity in vector space
#student success data analyst is the most appropriate job choice

Unnamed: 0,Job Title,Description
296,Student Success Data Analyst,job description summary reporting to the assoc...
472,Data Management Analyst,about us metas solutions is a professional ser...
36,Entry Level Business Analyst,"responsibilities collect, clean, and analyze b..."
396,Data Analyst,at nuview analytics we help companies accelera...
239,Data Analyst,about the company our client is a leading priv...


In [17]:
#skill list
skill_list = [
    "python", "sql", "excel", "tableau", "power bi",
    "data analysis", "data visualization",
    "machine learning", "statistics", "communication",
    "reporting", "dashboard", "business analysis"
]


In [18]:
#skill extraction 
def extract_skills(text, skills):
    found_skills = set()
    for skill in skills:
        if skill in text:
            found_skills.add(skill)
    return found_skills


In [19]:
#extract skills from previously defined resume
resume_skills = extract_skills(clean_resume, skill_list)
resume_skills


{'data analysis', 'data visualization', 'excel', 'python', 'sql'}

In [20]:
#compare current skill list with most suitable job
best_job_index = top_indices[0]
best_job_text = df.loc[best_job_index, 'clean_description']
best_job_title = df.loc[best_job_index, 'Job Title']

best_job_title


'Student Success Data Analyst'

In [21]:
#extract skills from job description
job_skills = extract_skills(best_job_text, skill_list)
job_skills


{'communication',
 'dashboard',
 'data analysis',
 'data visualization',
 'power bi',
 'reporting'}

In [22]:
#skill gap analysis
matched_skills = resume_skills.intersection(job_skills)
missing_skills = job_skills.difference(resume_skills)

matched_skills, missing_skills


({'data analysis', 'data visualization'},
 {'communication', 'dashboard', 'power bi', 'reporting'})

In [24]:
print("Recommended Job Role:")
print(best_job_title)

print("\nSkills you already have:")
for skill in matched_skills:
    print("-", skill)

print("\nSkills you should work on:")
for skill in missing_skills:
    print("-", skill)
if not missing_skills:
    print("\nYou already meet all the key skill requirements for this role!")

Recommended Job Role:
Student Success Data Analyst

Skills you already have:
- data visualization
- data analysis

Skills you should work on:
- dashboard
- power bi
- reporting
- communication


In [2]:
courses_df = pd.read_csv("../data/Coursera.csv")
courses_df.head()

Unnamed: 0,partner,course,skills,rating,reviewcount,level,certificatetype,duration,crediteligibility
0,Google,Google Cybersecurity,"{"" Network Security"","" Python Programming"","" L...",4.8,16.4k,Beginner,Professional Certificate,3 - 6 Months,False
1,Google,Google Data Analytics,"{"" Data Analysis"","" R Programming"","" SQL"","" Bu...",4.8,133.4k,Beginner,Professional Certificate,3 - 6 Months,True
2,Google,Google Project Management:,"{"" Project Management"","" Strategy and Operatio...",4.8,97.3k,Beginner,Professional Certificate,3 - 6 Months,True
3,Google,Google Digital Marketing & E-commerce,"{"" Digital Marketing"","" Marketing"","" Marketing...",4.8,21.4k,Beginner,Professional Certificate,3 - 6 Months,False
4,Google,Google IT Support,"{"" Computer Networking"","" Network Architecture...",4.8,181.4k,Beginner,Professional Certificate,3 - 6 Months,True


In [3]:
courses_df.columns

Index(['partner', 'course', 'skills', 'rating', 'reviewcount', 'level',
       'certificatetype', 'duration', 'crediteligibility'],
      dtype='str')

In [4]:
courses_df['skills'].iloc[0]

'{" Network Security"," Python Programming"," Linux"," Cloud Computing"," Algorithms"," Audit"," Computer Programming"," Computer Security Incident Management"," Cryptography"," Databases"," Leadership and Management"," Network Architecture"," Risk Management"," SQL"}'

In [5]:
courses_df['skills_clean'] = courses_df['skills'].str.lower()