In [12]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity



In [13]:
# Load the course catalogue from a CSV file expected in the notebook's working
# directory, then preview the first rows to confirm the columns parsed as expected.
data = pd.read_csv("coursea_data.csv")
data.head()



Unnamed: 0.1,Unnamed: 0,course_title,course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,134,(ISC)² Systems Security Certified Practitioner...,(ISC)²,SPECIALIZATION,4.7,Beginner,5.3k
1,743,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,COURSE,4.7,Intermediate,17k
2,874,A Crash Course in Data Science,Johns Hopkins University,COURSE,4.5,Mixed,130k
3,413,A Law Student's Toolkit,Yale University,COURSE,4.7,Mixed,91k
4,635,A Life of Happiness and Fulfillment,Indian School of Business,COURSE,4.8,Mixed,320k


In [14]:
# (rows, columns) of the loaded frame.
data.shape

(891, 7)

In [15]:
# Summarize each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                891 non-null    int64  
 1   course_title              891 non-null    object 
 2   course_organization       891 non-null    object 
 3   course_Certificate_type   891 non-null    object 
 4   course_rating             891 non-null    float64
 5   course_difficulty         891 non-null    object 
 6   course_students_enrolled  891 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 48.9+ KB


In [16]:
# List the distinct course titles; the heading and the values are emitted on
# separate lines by a single print call.
print("\nUnique values in 'course_title' column:", data['course_title'].unique(), sep="\n")


Unique values in 'course_title' column:
['(ISC)² Systems Security Certified Practitioner (SSCP)'
 'A Crash Course in Causality:  Inferring Causal Effects from Observational Data'
 'A Crash Course in Data Science' "A Law Student's Toolkit"
 'A Life of Happiness and Fulfillment'
 'ADHD: Everyday Strategies for Elementary Students' 'AI For Everyone'
 'AI For Medical Treatment' 'AI Foundations for Everyone'
 'AI for Medical Diagnosis' 'AI for Medical Prognosis' 'AI for Medicine'
 'AWS Fundamentals' 'AWS Fundamentals: Addressing Security Risk'
 'AWS Fundamentals: Building Serverless Applications'
 'AWS Fundamentals: Going Cloud-Native'
 'AWS Fundamentals: Migrating to the Cloud'
 'Aboriginal Worldviews and Education' 'Academic English: Writing'
 'Accelerated Computer Science Fundamentals' 'Access Controls'
 'Accounting Analytics' 'Accounting for Decision Making'
 'Achieving Personal and Professional Success'
 'Actualización en el manejo del paciente con diabetes mellitus tipo 2'
 'Addictio

In [17]:
# List the distinct enrolment values to see which formats need parsing.
print("\nUnique values in 'course_students_enrolled' column:", data['course_students_enrolled'].unique(), sep="\n")


Unique values in 'course_students_enrolled' column:
['5.3k' '17k' '130k' '91k' '320k' '39k' '350k' '2.4k' '61k' '12k' '4k'
 '13k' '11k' '27k' '110k' '6.6k' '540k' '22k' '2.9k' '80k' '9.9k' '23k'
 '9.2k' '78k' '190k' '35k' '29k' '14k' '21k' '94k' '69k' '40k' '220k'
 '150k' '18k' '270k' '7.9k' '30k' '36k' '20k' '8.1k' '120k' '71k' '63k'
 '42k' '480k' '97k' '200k' '180k' '4.2k' '310k' '3.9k' '79k' '31k' '15k'
 '10k' '66k' '33k' '56k' '7.3k' '9.7k' '210k' '28k' '6.5k' '55k' '2.3k'
 '8.8k' '88k' '1.9k' '68k' '1.6k' '9.5k' '57k' '26k' '84k' '95k' '5.8k'
 '24k' '67k' '280k' '38k' '77k' '510k' '89k' '48k' '160k' '32k' '340k'
 '82k' '790k' '19k' '51k' '4.8k' '37k' '43k' '6.4k' '49k' '240k' '46k'
 '7.8k' '75k' '81k' '140k' '5.5k' '99k' '100k' '3k' '830k' '740k' '60k'
 '96k' '690k' '44k' '4.5k' '8.2k' '16k' '300k' '8k' '41k' '54k' '9k'
 '380k' '58k' '5.6k' '7.1k' '83k' '3.4k' '1.5k' '230k' '760k' '86k' '45k'
 '7.2k' '1.8k' '4.1k' '76k' '490k' '170k' '260k' '65k' '470k' '400k'
 '4.4k' '62k' '64k'

In [18]:
# Convert 'course_students_enrolled' to numeric, handling 'k' and 'm' suffixes.
# The suffix is rewritten as a scientific-notation exponent ('5.3k' -> '5.3e3')
# so that a plain float conversion yields the absolute count.
# The dtype guard makes the cell safe to re-run: once the column is numeric the
# `.str` accessor would raise, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(data['course_students_enrolled']):
    data['course_students_enrolled'] = (
        data['course_students_enrolled']
        .str.strip()                          # tolerate stray whitespace
        .str.lower()                          # accept 'K' / 'M' as well
        .str.replace('k', 'e3', regex=False)  # literal replacement, independent of the pandas default
        .str.replace('m', 'e6', regex=False)
        .astype(float)
    )


In [19]:
# Re-check the distinct enrolment values after the numeric conversion.
print("\nUnique values in 'course_students_enrolled' column:", data['course_students_enrolled'].unique(), sep="\n")


Unique values in 'course_students_enrolled' column:
[5.3e+03 1.7e+04 1.3e+05 9.1e+04 3.2e+05 3.9e+04 3.5e+05 2.4e+03 6.1e+04
 1.2e+04 4.0e+03 1.3e+04 1.1e+04 2.7e+04 1.1e+05 6.6e+03 5.4e+05 2.2e+04
 2.9e+03 8.0e+04 9.9e+03 2.3e+04 9.2e+03 7.8e+04 1.9e+05 3.5e+04 2.9e+04
 1.4e+04 2.1e+04 9.4e+04 6.9e+04 4.0e+04 2.2e+05 1.5e+05 1.8e+04 2.7e+05
 7.9e+03 3.0e+04 3.6e+04 2.0e+04 8.1e+03 1.2e+05 7.1e+04 6.3e+04 4.2e+04
 4.8e+05 9.7e+04 2.0e+05 1.8e+05 4.2e+03 3.1e+05 3.9e+03 7.9e+04 3.1e+04
 1.5e+04 1.0e+04 6.6e+04 3.3e+04 5.6e+04 7.3e+03 9.7e+03 2.1e+05 2.8e+04
 6.5e+03 5.5e+04 2.3e+03 8.8e+03 8.8e+04 1.9e+03 6.8e+04 1.6e+03 9.5e+03
 5.7e+04 2.6e+04 8.4e+04 9.5e+04 5.8e+03 2.4e+04 6.7e+04 2.8e+05 3.8e+04
 7.7e+04 5.1e+05 8.9e+04 4.8e+04 1.6e+05 3.2e+04 3.4e+05 8.2e+04 7.9e+05
 1.9e+04 5.1e+04 4.8e+03 3.7e+04 4.3e+04 6.4e+03 4.9e+04 2.4e+05 4.6e+04
 7.8e+03 7.5e+04 8.1e+04 1.4e+05 5.5e+03 9.9e+04 1.0e+05 3.0e+03 8.3e+05
 7.4e+05 6.0e+04 9.6e+04 6.9e+05 4.4e+04 4.5e+03 8.2e+03 1.6e+04 3.0e+0

In [20]:
# Inspect the levels of the two categorical columns before they are label-encoded.
print("\nUnique values in 'course_difficulty' column:", data['course_difficulty'].unique(), sep="\n")
print("\nUnique values in 'course_Certificate_type' column:", data['course_Certificate_type'].unique(), sep="\n")


Unique values in 'course_difficulty' column:
['Beginner' 'Intermediate' 'Mixed' 'Advanced']

Unique values in 'course_Certificate_type' column:
['SPECIALIZATION' 'COURSE' 'PROFESSIONAL CERTIFICATE']


In [21]:
# Label-encode the categorical columns in place. The fitted encoders are kept in
# `label_encoders` so that text can be mapped to codes (and codes back to text) later.
# NOTE: this overwrites the text columns in `data`; re-running this cell without
# reloading the data would re-fit the encoders on the integer codes instead of the labels.
label_encoders = {}
for column in ['course_difficulty', 'course_Certificate_type']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Feature Engineering
X = data[['course_difficulty', 'course_rating', 'course_students_enrolled']]
y = data['course_title']

# Scaling numerical variables; after this step X is the scaled feature matrix
# returned by the scaler, row-aligned with `data`.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train the model
# A fixed seed makes the fitted forest reproducible across kernel restarts.
# NOTE(review): the target is the course title itself, so (nearly) every row is
# its own class — confirm this classifier is actually needed.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)



In [22]:
# Recommendation Function
def recommend(subject, rating, difficulty, num_recommendations=5):
    """Print the courses whose title contains ``subject`` that best match the request.

    Candidate courses are ranked by cosine similarity between their rows of the
    scaled feature matrix ``X`` and the scaled query vector built from
    ``[encoded difficulty, rating, 0 students enrolled]``.

    Args:
        subject: Text to search for in ``course_title``. Matched case-insensitively
            and literally (regex metacharacters such as ``+`` have no special meaning).
        rating: Desired course rating.
        difficulty: A difficulty label known to the fitted encoder; compared
            case-insensitively with surrounding whitespace ignored.
        num_recommendations: Maximum number of courses to print.

    Returns:
        None. Results (or a "no courses found" message) are printed.

    Raises:
        ValueError: If ``difficulty`` is not one of the labels the encoder was fitted on.
    """
    # Literal matching keeps inputs like "C++" from being parsed as a regular
    # expression; na=False treats missing titles as non-matching.
    is_relevant = data['course_title'].str.contains(subject, case=False, regex=False, na=False)
    relevant_courses = data[is_relevant]

    # cosine_similarity rejects an empty matrix, so report the empty result instead.
    if relevant_courses.empty:
        print(f"No courses found matching '{subject}'.")
        return

    # Filter the feature matrix X with a positional boolean mask rather than the
    # frame's index labels, so the rows stay aligned even if `data` is re-indexed.
    X_relevant = X[is_relevant.to_numpy()]

    # Encode difficulty level, tolerating differences in case and whitespace
    difficulty_encoder = label_encoders['course_difficulty']
    known_levels = {str(level).strip().lower(): level for level in difficulty_encoder.classes_}
    requested_level = str(difficulty).strip().lower()
    if requested_level not in known_levels:
        raise ValueError(
            f"Unknown difficulty {difficulty!r}; expected one of {list(difficulty_encoder.classes_)}"
        )
    difficulty_encoded = difficulty_encoder.transform([known_levels[requested_level]])[0]

    # Scale numerical variables
    # NOTE(review): students enrolled is fixed at 0 before scaling, which likely
    # pulls the query toward low-enrolment courses — confirm this is intended.
    input_data = scaler.transform([[difficulty_encoded, rating, 0]])  # Set students enrolled to 0

    # Find similar courses: one similarity score per relevant course, best first
    similarity = cosine_similarity(X_relevant, input_data)
    top_indices = similarity.flatten().argsort()[::-1][:num_recommendations]

    # Print recommended courses
    for index in top_indices:
        # `index` is a position within the relevant subset, hence iloc.
        course = relevant_courses.iloc[index]
        print(f"Course Title: {course['course_title']}")
        print(f"Organization: {course['course_organization']}")
        print(f"Certificate Type: {label_encoders['course_Certificate_type'].inverse_transform([course['course_Certificate_type']])[0]}")
        print(f"Rating: {course['course_rating']}")
        print(f"Students Enrolled: {course['course_students_enrolled']}")
        print(f"Similarity: {similarity[index][0]}")
        print()


# Example usage
# The query is read interactively, so this cell blocks until the user answers;
# float() raises ValueError if the rating entered is not a number.
subject = input("Enter your interest (subject): ")
rating = float(input("Enter desired rating (0-5): "))
difficulty = input("Enter desired difficulty level (Beginner/Intermediate/Advanced/Mixed): ")

recommend(subject, rating, difficulty)

Course Title: Introduction to Discrete Mathematics for Computer Science
Organization: National Research University Higher School of Economics
Certificate Type: SPECIALIZATION
Rating: 4.4
Students Enrolled: 75000.0
Similarity: 0.9704957875887253

Course Title: Mathematics for Machine Learning: PCA
Organization: Imperial College London
Certificate Type: COURSE
Rating: 4.0
Students Enrolled: 33000.0
Similarity: 0.9593825210088589

Course Title: Mathematics for Data Science
Organization: National Research University Higher School of Economics
Certificate Type: SPECIALIZATION
Rating: 4.5
Students Enrolled: 12000.0
Similarity: 0.8979547353679762

Course Title: Mathematics for Machine Learning
Organization: Imperial College London
Certificate Type: SPECIALIZATION
Rating: 4.6
Students Enrolled: 150000.0
Similarity: 0.5921308021320744

Course Title: Mathematics for Machine Learning: Multivariate Calculus
Organization: Imperial College London
Certificate Type: COURSE
Rating: 4.7
Students Enrolle