## Building a Course Recommendation System using TF-IDF Vectorization and Cosine Similarity

### Import the necessary libraries and read the dataset

In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display

In [2]:
df = pd.read_excel(r'Dataset.xlsx')

In [3]:
df.head()

Unnamed: 0,What is your name?,What is your gender?,What was your course in UG?,"What is your UG specialization? Major Subject (Eg, Mathematics)",What are your interests?,What are your skills ? (Select multiple if necessary),What was the average CGPA or Percentage obtained in under graduation?,Did you do any certification courses additionally?,"If yes, please specify your certificate course title.",Are you working?,"If yes, then what is/was your first Job title in your current field of work? If Not applicable, write NA.","Have you done masters after undergraduation? If yes, mention your field of masters.(Eg, Masters in Mathematics)"
0,A.Uha Priya,Female,B.Sc,Computer Applications,Cloud computing,"Python,SQL,Java",85.0,Yes,"Linux,Git",Yes,Software,MCA
1,Aadil,Male,B.E,Computer Science Engineering,TechNology,"Critical Thinking, Analytic Thinking, SQL, Pro...",66.5,Yes,Microsoft certification,Yes,Computer Software Engineer,No
2,Aakriti,Female,BA,Psychology,Understand human behaviour,"People management,Communication skills",64.6,Yes,Resilience psychology,No,No,No
3,Aanchal sharma,Female,MBA,Commerce,"Sales/Marketing,Trading,Understand human behav...","Accounting Skills,Critical Thinking",75.525,No,No,Yes,Relationships manager,No
4,Aangkeeta Sarkar,Female,B.Tech,Instrumentation Engineering,TechNology,"\nPLC Allen Bradley,PLC Ladder Logic,LabVIEW,B...",70.68,Yes,Extreme Productivity (Blinkist Summary),Yes,Plant Instrumentation Engineer,No


### Preprocessing the data and EDA

In [4]:
df.columns

Index(['What is your name?', 'What is your gender?',
       'What was your course in UG?',
       'What is your UG specialization? Major Subject (Eg, Mathematics)',
       'What are your interests?',
       'What are your skills ? (Select multiple if necessary)',
       'What was the average CGPA or Percentage obtained in under graduation?',
       'Did you do any certification courses additionally?',
       'If yes, please specify your certificate course title.',
       'Are you working?',
       'If yes, then what is/was your first Job title in your current field of work? If Not applicable, write NA.               ',
       'Have you done masters after undergraduation? If yes, mention your field of masters.(Eg, Masters in Mathematics)'],
      dtype='object')

In [5]:
df.drop('What is your name?', axis=1, inplace=True)
df.drop('What is your gender?', axis=1, inplace=True)
df.drop('What was your course in UG?', axis=1, inplace=True)
df.drop('What was the average CGPA or Percentage obtained in under graduation?', axis=1, inplace=True)

In [6]:
df.rename(columns = {df.columns[0] :'UG_Course', df.columns[1] :'Interests',df.columns[2] :'Skills', df.columns[3] :'Certifications', df.columns[4] :'Certification_Name', df.columns[5] :'Working', df.columns[6]:'Job_Title', df.columns[7]:'Masters'},inplace = True)

In [7]:
df.isnull().sum()

UG_Course             0
Interests             0
Skills                0
Certifications        0
Certification_Name    0
Working               0
Job_Title             0
Masters               0
dtype: int64

In [8]:
df["Skills"] = df["Skills"].astype(str)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1193 entries, 0 to 1192
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   UG_Course           1193 non-null   object
 1   Interests           1193 non-null   object
 2   Skills              1193 non-null   object
 3   Certifications      1193 non-null   object
 4   Certification_Name  1193 non-null   object
 5   Working             1193 non-null   object
 6   Job_Title           1193 non-null   object
 7   Masters             1193 non-null   object
dtypes: object(8)
memory usage: 74.7+ KB


In [10]:
df.describe()

Unnamed: 0,UG_Course,Interests,Skills,Certifications,Certification_Name,Working,Job_Title,Masters
count,1193,1193,1193,1193,1193,1193,1193,1193
unique,290,612,1005,2,523,2,483,237
top,Computer Science Engineering,TechNology,No,Yes,No,Yes,No,No
freq,174,132,45,611,579,712,295,724


In [11]:
df['UG_Course'].unique()

array(['Computer Applications', 'Computer Science Engineering',
       'Psychology', 'Commerce', 'Instrumentation Engineering',
       'Physics ', 'Accountancy', 'Automobile Engineering', 'Marketing ',
       'Automation', 'Science,Maths, Engineering subject',
       'Mechanical Engineering', 'Electrical and Electronics Engineering',
       'Structural Engineeeing ', 'Mining', 'Accountancy ', 'C language',
       'C,c++, java', 'Civil Engineering', 'Management',
       'Electronics and Communication Engineering', 'Sales and marketing',
       'EcoNomics', 'Computer science and business system', 'Pharmacy',
       'Interior design', 'Information TechNology', 'Chemistry',
       'Commerce ', 'Physics', 'Statistics ', 'Information TechNAlogy',
       'Industrial Engineering ', 'High pressure die casting ',
       'Accounting&Finance', 'Computer Science Engineering ',
       'Computer Science ', 'Mathematics',
       'Accounting(business and management) ', 'Sales and Marketing',
       'Co

In [12]:
df.columns

Index(['UG_Course', 'Interests', 'Skills', 'Certifications',
       'Certification_Name', 'Working', 'Job_Title', 'Masters'],
      dtype='object')

### Building Recommendation System

In [13]:
def recommend_course(d):
    global df
    d = pd.DataFrame([d])
    df = pd.concat([df, d], axis=0, ignore_index=True) # Adds the new data row to the dataframe
    # display(df.columns) # ['UG_Course', 'Interests', 'Skills', 'Certifications', 'Certification_Name', 'Working', 'Job_Title', 'Masters', 'Factors']
    
    df['Interests'] = df['Interests'].astype(str)
    df['Skills'] = df['Skills'].astype(str)
    df['Factors'] = df[['Interests','Skills']].apply("-".join, axis=1) # Combines relevant columns into a single column  
    
    for i in range (len(df)):
        has_certification = str(df.Certifications[i])
        certification_name = str(df.Certification_Name[i])
        if has_certification.startswith('Y') == True:
            df.loc[i, 'Factors'] = df.loc[i, 'Factors'] + '-' + certification_name
        i = i+1
        
    for i in range (len(df)):
        is_working = str(df.Working[i])
        job_title = str(df.Job_Title[i])
        if is_working.startswith('Y') == True:
            df.loc[i, 'Factors'] = df.loc[i, 'Factors'] + '-' + job_title      
        i = i + 1 
    
    for i in range(len(df)):
        pursuing_masters = str(df.Masters[i])
        if pursuing_masters.startswith('N') == True:
            continue
        else:
            df.loc[i, 'Factors'] = df.loc[i, 'Factors'] + '-' + pursuing_masters
        i = i + 1
        
    df_f = df.Factors # Extracts the Factors column from the dataframe
    df_f.replace("[^a-zA-Z]"," ",regex=True, inplace=True) # Remove numerical values and symbols from the combined column
    df_f = df_f.to_frame()  # Converts the series to a dataframe
    df_f.columns = ['Factors'] # Renames the column to Factors
    
    # print(df_f['Factors'].isnull().sum())  # Checks for null values in the Factors column
    
    # https://mungingdata.files.wordpress.com/2017/11/tfidf.png?w=640
    tf_idf = TfidfVectorizer(stop_words = "english") # Removes common words from the English language
    tfidf_matrix = tf_idf.fit_transform(df_f["Factors"])  # Transforms the Factors column into a matrix
    # print(tf_idf_matrix.shape)  # Prints the shape of the matrix - (1194,1306)
    tf_idf_matrix_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix)
    # Converts the matrix to a dataframe
    # display(tf_idf_matrix_df)
    df_final = tf_idf_matrix_df
    
    y = df_final.iloc[:-1,:]  # Extracts all rows except the last one
    
    # print(y.shape)    
    
    # Cosine Similarity - https://www.youtube.com/watch?v=e9U0QAFbfLI
    similarity_matrix = cosine_similarity(df_final.iloc[[-1],:],y) # Computes the cosine similarity between the last row and all other rows
    similarity_matrix_df = pd.DataFrame(similarity_matrix)  # Converts the matrix to a dataframe
    # display(similarity_matrix_df)  # 1 row × 1194 columns
    
    # print(similarity_matrix[0]) # 1st row of the similarity matrix
    sim_scores = list(enumerate(similarity_matrix[0]))
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse = True)  # Sorts the scores in descending order
    # print(sim_scores)
    
    similarity_idx, similarity_scores  =  [i[0] for i in sim_scores], [i[1] for i in sim_scores]
    # Extracts the index and score from the similarity scores
    
    # Create an empty dataframe with UG_Course and Score columns    
    recommended_courses = pd.DataFrame(columns=["UG_Course", "Score"])
    recommended_courses["UG_Course"] = df.loc[similarity_idx, "UG_Course"]
    #  Assigns the UG_Course values from the original DataFrame df based on the indices (sim_idx) that were sorted by similarity scores.
    # display(recommended_courses)
    
    recommended_courses["Score"] = similarity_scores  # Assigns the similarity scores to the Score column
    
    recommended_courses = recommended_courses.loc[(recommended_courses.UG_Course !='')] # Filters out empty values
    recommended_courses = recommended_courses.drop_duplicates(subset='UG_Course', keep="first") # Drops duplicate values
    # display(recommended_courses)
    
    # First 5 recommended courses    
    course_recommendations = recommended_courses.iloc[0:5,:]
    course_recommendations.reset_index(inplace = True)
    course_recommendation = course_recommendations['UG_Course'].values.tolist()
    
    return course_recommendation

In [14]:
d = {}

# Gather user details
print("Enter your details :")
name = input("Enter your name :  ")
gender = input("Enter your gender :  ")
d['UG_Course'] = ''
d['Interest'] = input("Enter your interests : ")
d['Skills'] = input("Enter your skills : ")

d['Certification'] = input("Did you do any certification courses additionally? : ")
if d['Certification'].lower() == 'yes':
    d['Certificate_name'] = input("Specify your certificate course title: ")

d['Working'] = input("Are you currently working or have you been employed before? :  ")
if d['Working'].lower() == 'yes':
    d['JobTitle'] = input("Enter your job title: ")
    
d['Masters'] = input("Are you currently pursuing masters or have you persued higher education before : ")

Enter your details :
Enter your name :  Shreya
Enter your gender :  Female
Enter your interests : Coding, Maths
Enter your skills : Linux, Java
Did you do any certification courses additionally? : No
Are you currently working or have you been employed before? :  No
Are you currently pursuing masters or have you persued higher education before : No


In [15]:
print(recommend_course(d))

['Computer Science Engineering', 'AutoCAD', 'Information TechNology', 'Computer Science', 'Electrical and Electronics Engineering']


In [16]:
d = {}

# Gather user details
print("Enter your details :")
name = input("Enter your name :  ")
gender = input("Enter your gender :  ")
d['UG_Course'] = ''
d['Interest'] = input("Enter your interests : ")
d['Skills'] = input("Enter your skills : ")

d['Certification'] = input("Did you do any certification courses additionally? : ")
if d['Certification'].lower() == 'yes':
    d['Certificate_name'] = input("Specify your certificate course title: ")

d['Working'] = input("Are you currently working or have you been employed before? :  ")
if d['Working'].lower() == 'yes':
    d['JobTitle'] = input("Enter your job title: ")
    
d['Masters'] = input("Are you currently pursuing masters or have you persued higher education before : ")

Enter your details :
Enter your name :  Shreya
Enter your gender :  Female
Enter your interests : Biology
Enter your skills : Biochemistry
Did you do any certification courses additionally? : No
Are you currently working or have you been employed before? :  No
Are you currently pursuing masters or have you persued higher education before : No


In [17]:
print(recommend_course(d))

['Genetic Engineering', 'BiotechNAlogy', 'Pharmacy', 'Microbiology', 'Computer Applications']
