In [1]:
import pymysql
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize NLTK tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Database connection details
HOST = "localhost"
USER = "root"
PASSWORD = "root"
DATABASE = "bookrecommendation"

# Function to check if a table exists
def table_exists(cursor, table_name):
    cursor.execute("SHOW TABLES LIKE %s", (table_name,))
    return cursor.fetchone() is not None

# Function to create Ratings table if it doesn't exist
def create_ratings_table(cursor):
    create_table_query = """
    CREATE TABLE IF NOT EXISTS Ratings (
        User_ID VARCHAR(50),
        ISBN VARCHAR(20),
        Book_Rating INT,
        PRIMARY KEY (User_ID, ISBN)
    )"""
    cursor.execute(create_table_query)

try:
    # Connect to the MySQL database
    connection = pymysql.connect(
        host=HOST,
        user=USER,
        password=PASSWORD,
        database=DATABASE,
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )
    print("Connected to MySQL Database")
    cursor = connection.cursor()

    # Create the Ratings table if it doesn't exist
    if not table_exists(cursor, "ratings"):  # Ensure lowercase table names
        create_ratings_table(cursor)
        print("Ratings table created successfully")

    # Query to extract data from each table
    queries = {
        "users": "SELECT * FROM users",
        "books": "SELECT * FROM books",
        "ratings": "SELECT * FROM ratings"
    }

    # Extract data into DataFrames
    data_frames = {}
    for table_name, query in queries.items():
        cursor.execute(query)
        result = cursor.fetchall()  # Fetch results from query
        print(f"\nData from table {table_name}: {result[:5]}")  # Print the first 5 rows for debugging
        data_frames[table_name] = pd.DataFrame(result, columns=[desc[0] for desc in cursor.description])

except pymysql.Error as e:
    print(f"Error: {e}")

finally:
    # Close the connection
    if connection:
        cursor.close()
        connection.close()
        print("MySQL connection is closed")

# Preprocess Users Table
if "users" in data_frames:
    users_df = data_frames["users"]
    print("\nUsers Data:")
    print(users_df.head())  # Check Users DataFrame structure
    users_df.fillna({"Location": "unknown"}, inplace=True)  # Handle missing locations

# Preprocess Books Table
if "books" in data_frames:
    books_df = data_frames["books"]
    print("\nBooks Data:")
    print(books_df.head())  # Check Books DataFrame structure

# Preprocess Ratings Table
if "ratings" in data_frames:
    ratings_df = data_frames["ratings"]
    print("\nRatings Data:")
    print(ratings_df.head())  # Check Ratings DataFrame structure
    ratings_df.fillna({"Book_Rating": 0}, inplace=True)
    ratings_df['Book_Rating'] = pd.to_numeric(ratings_df['Book_Rating'], errors='coerce')
    ratings_df.dropna(subset=['Book_Rating'], inplace=True)

# Save preprocessed data back to CSV (optional)
users_df.to_csv("Preprocessed_Users.csv", index=False)
books_df.to_csv("Preprocessed_Books.csv", index=False)
ratings_df.to_csv("Preprocessed_Ratings.csv", index=False)

# Display preprocessed data
print("\nPreprocessed Users Data:")
print(users_df.head())

print("\nPreprocessed Books Data:")
print(books_df.head())

print("\nPreprocessed Ratings Data:")
print(ratings_df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Connected to MySQL Database

Data from table users: [{'User_ID': 1, 'Location': 'nyc, new york, usa', 'Age': 0}, {'User_ID': 2, 'Location': 'stockton, california, usa', 'Age': 18}, {'User_ID': 3, 'Location': 'moscow, yukon territory, russia', 'Age': 0}, {'User_ID': 4, 'Location': 'porto, v.n.gaia, portugal', 'Age': 17}, {'User_ID': 5, 'Location': 'farnborough, hants, united kingdom', 'Age': 0}]

Data from table books: [{'ISBN': '0000913154', 'Book_Title': 'The Way Things Work: An Illustrated Encyclopedia of Technology', 'Book_Author': 'C. van Amerongen (translator)', 'Year_Of_Publication': 1967, 'Publisher': 'Simon &amp; Schuster', 'Image_URL_S': 'http://images.amazon.com/images/P/0000913154.01.THUMBZZZ.jpg', 'Image_URL_M': 'http://images.amazon.com/images/P/0000913154.01.MZZZZZZZ.jpg', 'Image_URL_L': 'http://images.amazon.com/images/P/0000913154.01.LZZZZZZZ.jpg'}, {'ISBN': '0001010565', 'Book_Title': "Mog's Christmas", 'Book_Author': 'Judith Kerr', 'Year_Of_Publication': 1992, 'Publis

### content based filtering


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load preprocessed data
users_df = pd.read_csv("Preprocessed_Users.csv")
books_df = pd.read_csv("Preprocessed_Books.csv")
ratings_df = pd.read_csv("Preprocessed_Ratings.csv")

# Standardize column names
users_df.rename(columns={'User-ID': 'User_ID'}, inplace=True)
books_df.rename(columns={'Book-Title': 'Book_Title', 'Book-Author': 'Book_Author', 'Year-Of-Publication': 'Year_Of_Publication'}, inplace=True)
ratings_df.rename(columns={'User-ID': 'User_ID'}, inplace=True)

# Merge ratings with books to get book details
ratings_books_df = ratings_df.merge(books_df, on='ISBN', how='inner')

# Aggregate books read by each user
user_book_profiles = ratings_books_df.groupby('User_ID')['Book_Title'].apply(lambda x: " ".join(x)).reset_index()
user_book_profiles.rename(columns={'Book_Title': 'user_profile'}, inplace=True)

# Merge with users to keep only users with reading history
users_df = users_df.merge(user_book_profiles, on='User_ID', how='inner')

# Create book profile using title, author, and publisher
books_df['book_profile'] = books_df.apply(
    lambda x: f"{x['Book_Title']} {x['Book_Author']} {x['Publisher']}" if pd.notnull(x['Book_Title']) else "", axis=1
)

# Remove rows with empty profiles
users_df = users_df[users_df['user_profile'] != ""]
books_df = books_df[books_df['book_profile'] != ""]

# Step 1: Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit features to reduce memory
tfidf_books = tfidf_vectorizer.fit_transform(books_df['book_profile'])
tfidf_users = tfidf_vectorizer.transform(users_df['user_profile'])

# Step 2: Use Nearest Neighbors to find the most similar books
nn_model = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
nn_model.fit(tfidf_books)

# Step 3: Get top-5 book recommendations per user
recommendations = {}

for user_idx in range(tfidf_users.shape[0]):
    user_id = users_df.iloc[user_idx]['User_ID']
    distances, indices = nn_model.kneighbors(tfidf_users[user_idx], n_neighbors=5)
    
    top_books = books_df.iloc[indices[0]][['Book_Title', 'Book_Author', 'Publisher']]
    recommendations[user_id] = top_books

# Step 4: Display recommendations
for user_id, rec_books in recommendations.items():
    print(f"\nRecommendations for User {user_id}:")
    print(rec_books.to_string(index=False))


Recommendations for User 246507:
                                                                                                         Book_Title   Book_Author Publisher
To The Blight : Part Two of 'The Eye of the World', The Beginnng of 'The Wheel of Time' (Wheel of Time (Starscape)) Robert Jordan Starscape
                                             The Eye of the World : Book One of 'The Wheel of Time' (Wheel of Time) Robert Jordan Tor Books
                                             The Eye of the World : Book One of 'The Wheel of Time' (Wheel of Time) Robert Jordan Tor Books
                                             The Eye of the World : Book One of 'The Wheel of Time' (Wheel of Time) Robert Jordan Tor Books
                                                                                       The Book of Saladin: A Novel     Tariq Ali     Verso

Recommendations for User 246508:
                                                     Book_Title           Book_Author       