# In [6]:
import pandas as pd
import ast
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process
from nltk.stem.porter import PorterStemmer

# Load and preprocess datasets
_DEFAULT_COURSERA_CSV = r'C:\Users\Asus\OneDrive\Desktop\AI-Tutor-System\data\coursera_courses.csv'
_DEFAULT_UDEMY_CSV = r'C:\Users\Asus\OneDrive\Desktop\AI-Tutor-System\data\udemy_data.csv'


def _parse_list_cell(cell):
    """Convert a CSV cell holding a Python list literal into a list of strings.

    Non-string cells (e.g. NaN) give an empty list. A string that is not a
    valid Python literal is kept as a single raw-text item instead of raising,
    so one bad row cannot abort the whole load. A literal that is not a
    collection (e.g. a bare number) is wrapped into a one-item list.
    """
    if not isinstance(cell, str):
        return []
    try:
        value = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return [cell]
    if isinstance(value, (list, tuple, set)):
        # Items are stringified so the later " ".join never sees a non-str.
        return [str(item) for item in value]
    return [str(value)]


def load_and_preprocess_data(coursera_path=_DEFAULT_COURSERA_CSV,
                             udemy_path=_DEFAULT_UDEMY_CSV,
                             random_state=None):
    """Load the Coursera and Udemy CSVs and merge them into one shuffled frame.

    Args:
        coursera_path: Path of the Coursera CSV. Defaults to the original
            hard-coded location.
        udemy_path: Path of the Udemy CSV. Defaults to the original
            hard-coded location.
        random_state: Forwarded to ``DataFrame.sample`` for the final shuffle;
            ``None`` (the default) keeps the shuffle unseeded.

    Returns:
        A DataFrame with the Udemy rows and Coursera rows concatenated,
        shuffled, and re-indexed from 0. Both sources carry a lower-cased
        ``tags`` column, and ``course_difficulty`` has 'Mixed' mapped to
        'All Levels' and 'Expert' mapped to 'Advanced'.

    Raises:
        KeyError: If an expected column is missing from either CSV.
    """
    coursera_df = pd.read_csv(coursera_path)
    udemy_df = pd.read_csv(udemy_path)

    coursera_df = coursera_df.drop(
        ['course_students_enrolled', 'course_time', 'course_rating', 'course_reviews_num'],
        axis=1,
    )
    for column in ('course_skills', 'course_summary'):
        coursera_df[column] = coursera_df[column].apply(_parse_list_cell)

    def build_tags(row):
        # Tags = skills + summary items + the words of the description.
        description = row['course_description']
        words = description.split() if isinstance(description, str) else []
        return " ".join(row['course_skills'] + row['course_summary'] + words).lower()

    coursera_df['tags'] = coursera_df.apply(build_tags, axis=1)

    udemy_df = udemy_df.rename(columns={
        'course_name': 'course_title',
        'instructor': 'course_organization',
        'course url': 'course_url',
        'course description': 'tags',
        'level': 'course_difficulty',
    })
    # Lower-case per element instead of testing ``dtype == 'object'``: that
    # test is False for pandas' dedicated string dtypes and would blank every
    # tag. Missing descriptions become "".
    udemy_df['tags'] = udemy_df['tags'].apply(
        lambda text: text.lower() if isinstance(text, str) else ""
    )

    combined_df = pd.concat([udemy_df, coursera_df], ignore_index=True)
    combined_df['course_difficulty'] = combined_df['course_difficulty'].replace(
        {'Mixed': 'All Levels', 'Expert': 'Advanced'}
    )
    return combined_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

def preprocess_text(text, stemmer):
    """Stem every whitespace-separated word of ``text`` with ``stemmer.stem``.

    Returns the stemmed words joined by single spaces, or an empty string
    when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    return " ".join(stemmer.stem(token) for token in text.split())

# Build the corpus: load the merged course catalogue and stem every tag string.
stemmer = PorterStemmer()
df = load_and_preprocess_data()
df['tags'] = df['tags'].apply(preprocess_text, args=(stemmer,))

# TF-IDF over the stemmed tags (English stop words removed, vocabulary capped
# at 10000 terms), then pairwise cosine similarity between all courses.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
vectorized_tags = vectorizer.fit_transform(df['tags']).toarray()
similarity_matrix = cosine_similarity(vectorized_tags)

# Persist the frame, the fitted vectorizer and the similarity matrix together
# so recommend_courses() can read them back from disk.
with open('course_recommendation.pkl', 'wb') as f:
    pickle.dump((df, vectorizer, similarity_matrix), f)

# Abbreviation mapping: short query terms -> the fuller phrase to search for.
# Keys may span several words (e.g. 'Web Dev'); lookups are case-sensitive.
abbreviation_mapping = {
    'C': 'C Programming',
    'C++': 'C Plus Plus',
    'C#': 'C Sharp Programming',
    'Java': 'Java Programming',
    'JS': 'JavaScript Programming',
    'Python': 'Python Programming',
    'SQL': 'SQL Programming',
    'HTML': 'HTML',
    'CSS': 'CSS',
    'React': 'React Framework',
    'Data Science': 'Data Science',
    'AI': 'Artificial Intelligence',
    'Web Dev': 'Web Development',
}


def preprocess_course_name(name):
    """Expand known abbreviations in ``name`` using ``abbreviation_mapping``.

    The name is split on whitespace and scanned left to right. At each
    position the longest run of words that is a key of
    ``abbreviation_mapping`` is replaced by its expansion, so multi-word keys
    such as 'Web Dev' are honoured (a word-by-word lookup can never match
    them). Words that start no key are kept unchanged.

    Args:
        name: The raw course name or query string.

    Returns:
        The expanded name with words separated by single spaces.
    """
    words = name.split()
    # Longest key measured in words; bounds how far ahead a phrase can reach.
    longest_key = max((len(key.split()) for key in abbreviation_mapping), default=1)

    expanded = []
    position = 0
    while position < len(words):
        # Try the longest candidate phrase first so 'Web Dev' beats 'Web'.
        for span in range(min(longest_key, len(words) - position), 0, -1):
            phrase = ' '.join(words[position:position + span])
            if phrase in abbreviation_mapping:
                expanded.append(abbreviation_mapping[phrase])
                position += span
                break
        else:
            expanded.append(words[position])
            position += 1
    return ' '.join(expanded)

def recommend_courses(course_name, difficulty=None, top_n=10, min_score=89):
    """Recommend courses whose title or tags fuzzy-match ``course_name``.

    Args:
        course_name: Query text; abbreviations are expanded with
            ``preprocess_course_name`` before matching.
        difficulty: Optional 'beginner' / 'intermediate' / 'advanced'
            (case-insensitive, surrounding whitespace ignored). When given,
            exact-difficulty courses come first, 'All Levels' courses are used
            only if there is no exact match, and courses below the requested
            level are dropped.
        top_n: Maximum number of courses returned (default 10).
        min_score: Fuzzy-match scores must be strictly greater than this
            value for a course to be considered (default 89).

    Returns:
        ``{"recommended_courses": [...]}`` with one dict per course (title,
        organization, difficulty, url), or ``{"message": ...}`` when nothing
        matches the query.
    """
    # Load processed data from pickle.
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load a pickle that this project wrote itself.
    with open('course_recommendation.pkl', 'rb') as f:
        df, _vectorizer, similarity_matrix = pickle.load(f)

    difficulty_map = {'beginner': 'Beginner', 'intermediate': 'Intermediate', 'advanced': 'Advanced'}

    course_name = preprocess_course_name(course_name)
    title_matches = process.extract(course_name, df['course_title'], limit=top_n * 2)
    tag_matches = process.extract(course_name, df['tags'], limit=top_n * 2)

    # match[1] is the fuzzy score and match[2] the key of the matched entry;
    # a set removes courses matched by both title and tags.
    matched_indices = {match[2] for match in title_matches + tag_matches if match[1] > min_score}
    if not matched_indices:
        return {"message": f"No courses found matching the name '{course_name}'."}

    prioritized, all_levels, others = [], [], []
    target_difficulty = difficulty_map.get(difficulty.strip().lower()) if difficulty else None

    for idx in matched_indices:
        course_diff = df.iloc[idx]['course_difficulty']
        if not difficulty:
            prioritized.append(idx)
        elif course_diff == target_difficulty:
            prioritized.append(idx)
        elif course_diff == 'All Levels':
            all_levels.append(idx)
        elif target_difficulty == 'Advanced' and course_diff in ['Beginner', 'Intermediate']:
            continue  # below the requested level
        elif target_difficulty == 'Intermediate' and course_diff == 'Beginner':
            continue  # below the requested level
        else:
            others.append(idx)

    # 'All Levels' courses are a fallback only when nothing matched exactly.
    if not prioritized:
        prioritized.extend(all_levels)
    # Clamp at zero: a negative slice stop would mean "all but the last k"
    # and pull in other-difficulty courses when none should be added.
    remaining_slots = max(0, top_n - len(prioritized))
    prioritized.extend(others[:remaining_slots])

    # NOTE(review): if similarity_matrix is the full pairwise cosine matrix,
    # each row's max includes the course's similarity to itself, so these
    # scores may all tie -- confirm this is the intended ranking signal.
    scores = [(idx, similarity_matrix[idx].max()) for idx in prioritized]
    sorted_courses = sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]

    recommended = []
    for idx, _ in sorted_courses:
        row = df.iloc[idx]
        recommended.append({
            "course_title": row['course_title'],
            "organization": row['course_organization'],
            "difficulty": row['course_difficulty'],
            "url": row['course_url'],
        })
    return {"recommended_courses": recommended}

# Example usage: request beginner-level Python courses and print the result.
course_name_input, difficulty_input = "Python", "Beginner"
recommendations = recommend_courses(course_name_input, difficulty_input)
print(recommendations)

{'recommended_courses': [{'course_title': 'Python for Beginners with Examples', 'organization': 'Ardit Sulce', 'difficulty': 'Beginner', 'url': 'https://www.udemy.com/course/ardit-sulce-python-for-beginners/'}, {'course_title': 'Python Programming Beginners Tutorial : Python 3 Programming', 'organization': 'Ajay Tech', 'difficulty': 'Beginner', 'url': 'https://www.udemy.com/course/a-laymans-guide-to-python/'}, {'course_title': 'Python Programming Complete Beginners Course Bootcamp 2022', 'organization': 'Dmitry Yedunov', 'difficulty': 'Beginner', 'url': 'https://www.udemy.com/course/python-programming-complete-beginner-course-bootcamp/'}, {'course_title': 'Python Programming for Begineers', 'organization': 'Arunachala Damodar', 'difficulty': 'Beginner', 'url': 'https://www.udemy.com/course/python-for-machine-learning-t/'}, {'course_title': 'JavaScript And PHP And Python Programming Complete Course', 'organization': 'PROPER DOT INSTITUTE', 'difficulty': 'Beginner', 'url': 'https://www.u