In [1]:
import pandas as pd
df = pd.read_excel("Online Retail.xlsx")
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

start_date = pd.to_datetime("2011-09-01")
end_date = pd.to_datetime("2011-11-30")
date_mask = (df['InvoiceDate'] >= start_date) & (df['InvoiceDate'] <= end_date)
subset_data = df[date_mask]

subset_data.to_csv("12012036_12012114.csv", index=False)
subset_data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
320705,565080,20677,PINK POLKADOT BOWL,8,2011-09-01 08:25:00,1.25,13509.0,United Kingdom
320706,565080,22128,PARTY CONES CANDY ASSORTED,24,2011-09-01 08:25:00,1.25,13509.0,United Kingdom
320707,565081,21067,,1,2011-09-01 09:03:00,0.0,,United Kingdom
320708,565082,22423,REGENCY CAKESTAND 3 TIER,2,2011-09-01 09:15:00,12.75,13305.0,United Kingdom
320709,565082,15060B,FAIRY CAKE DESIGN UMBRELLA,8,2011-09-01 09:15:00,3.75,13305.0,United Kingdom


In [3]:
new_df = pd.read_csv("12012036_12012114.csv")

invoice_data = new_df.groupby('InvoiceNo')['StockCode'].apply(list).reset_index()

invoice_data.to_csv("invoice_dataset.csv", index=False)

invoice_data = pd.read_csv("invoice_dataset.csv")
invoice_data.head()

Unnamed: 0,InvoiceNo,StockCode
0,565080,"['20677', '22128']"
1,565081,['21067']
2,565082,"['22423', '15060B', '23245']"
3,565083,"['22609', '22741', '23146', '72351A', '72351B'..."
4,565084,"['23309', '22970', '22988', '22902', '22659', ..."


In [6]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

transactions = invoice_data['StockCode'].apply(lambda x: x.split(','))
transactions[0:5]

0                                [['20677',  '22128']]
1                                          [['21067']]
2                     [['22423',  '15060B',  '23245']]
3    [['22609',  '22741',  '23146',  '72351A',  '72...
4    [['23309',  '22970',  '22988',  '22902',  '226...
Name: StockCode, dtype: object

In [7]:
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

filtered_rules = rules[(rules['support'] >= 0.01) & (rules['lift'] > 1.0)]
filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,( '21931'),( '20712'),0.036432,0.026628,0.011498,0.315615,11.852764,0.010528,1.422257,0.950251
1,( '20712'),( '21931'),0.026628,0.036432,0.011498,0.431818,11.852764,0.010528,1.695880,0.940680
2,( '22385'),( '20712'),0.024812,0.026628,0.010167,0.409756,15.388204,0.009506,1.649101,0.958805
3,( '20712'),( '22385'),0.026628,0.024812,0.010167,0.381818,15.388204,0.009506,1.577509,0.960594
4,( '22386'),( '20712'),0.037884,0.026628,0.012104,0.319489,11.998257,0.011095,1.430354,0.952749
...,...,...,...,...,...,...,...,...,...,...
2051,"( '23263', '23264')","( '23265', '23266')",0.018034,0.020697,0.012467,0.691275,33.399505,0.012093,3.172090,0.987875
2052,( '23265'),"( '23263', '23266', '23264')",0.028201,0.014161,0.012467,0.442060,31.216243,0.012067,1.766926,0.996056
2053,( '23266'),"( '23263', '23265', '23264')",0.029654,0.014403,0.012467,0.420408,29.188338,0.012040,1.700501,0.995253
2054,( '23264'),"( '23265', '23266', '23263')",0.028080,0.016824,0.012467,0.443966,26.388799,0.011994,1.768192,0.989902


In [8]:
top_10_rules = filtered_rules.sort_values(by='confidence', ascending=False).head(10)
top_10_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1820,"( '23265', '22577')",( '22578'),0.011256,0.04902,0.010409,0.924731,18.864516,0.009857,12.634454,0.957771
1808,"( '22577', '22579')",( '22578'),0.02566,0.04902,0.023602,0.919811,18.764151,0.022344,11.859285,0.971639
389,( 'DOT'),( '21935'),0.01283,0.020576,0.01174,0.915094,44.473585,0.011477,11.535437,0.990219
1940,"( '23171', '23172')",( '23170'),0.011135,0.020092,0.010167,0.913043,45.443164,0.009943,11.268942,0.989007
1474,"( '22356', '20723')",( '20724'),0.013314,0.040547,0.011983,0.9,22.196418,0.011443,9.594529,0.967833
1881,"( '22697', '23170')",( '23171'),0.012104,0.017308,0.010772,0.89,51.420839,0.010563,8.933562,0.992566
1942,"( '23170', '23172')",( '23171'),0.011498,0.017308,0.010167,0.884211,51.086345,0.009968,8.486884,0.99183
2045,"( '23263', '23266', '23264')",( '23265'),0.014161,0.028201,0.012467,0.880342,31.216243,0.012067,8.12146,0.98187
1683,"( '22086', '22578')",( '22577'),0.01404,0.047446,0.012346,0.87931,18.532811,0.01168,7.892589,0.959513
1509,"( '23205', '22356')",( '20724'),0.011862,0.040547,0.010409,0.877551,21.642766,0.009928,7.835532,0.965244


In [9]:
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

# Load the ratings and movies datasets (replace with your data)
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")

# Merge the datasets to get movie titles in the ratings dataset
ratings = pd.merge(ratings, movies, on='movieId')

# Filter ratings records for movies in the "Action" genre
action_movies = ratings[ratings['genres'].str.contains('Action')]

# Create a user-movie pivot table for action movies
action_pivot = action_movies.pivot_table(index='userId', columns='title', values='rating')

# Calculate item similarity using Pearson correlation for question 6
item_similarity_pearson = 1 - pairwise_distances(action_pivot.fillna(0), metric='correlation')

# Recommend top 5 similar movies to "Heat" and "Eraser"
def recommend_movies(movie_title, similarity_matrix, n=5):
    if movie_title in action_pivot.columns:
        sim_scores = list(enumerate(similarity_matrix[action_pivot.columns.get_loc(movie_title)]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:n + 1]  # Exclude the movie itself
        similar_movies = [action_pivot.columns[i[0]] for i in sim_scores]
        return similar_movies
    else:
        return []

heat_recommendations = recommend_movies("Heat", item_similarity_pearson)
eraser_recommendations = recommend_movies("Eraser", item_similarity_pearson)

print("Recommendations for Heat:", heat_recommendations)
print("Recommendations for Eraser:", eraser_recommendations)

# For question 7, filter ratings for "Animation" or "Children" genre and calculate item similarity using cosine similarity
animation_children_movies = ratings[ratings['genres'].str.contains('Animation|Children')]

# Create a user-movie pivot table for animation and children movies
animation_children_pivot = animation_children_movies.pivot_table(index='userId', columns='title', values='rating')

# Calculate item similarity using cosine similarity for question 7
item_similarity_cosine = cosine_similarity(animation_children_pivot.fillna(0))

# Recommend top 5 similar movies to "Lion King" and "The Incredibles"
def recommend_movies_cosine(movie_title, similarity_matrix, n=5):
    if movie_title in animation_children_pivot.columns:
        idx = animation_children_pivot.columns.get_loc(movie_title)
        sim_scores = list(enumerate(similarity_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:n + 1]  # Exclude the movie itself
        similar_movies = [animation_children_pivot.columns[i[0]] for i in sim_scores]
        return similar_movies
    else:
        return []

lion_king_recommendations = recommend_movies_cosine("Lion King", item_similarity_cosine)
incredibles_recommendations = recommend_movies_cosine("The Incredibles", item_similarity_cosine)

print("Recommendations for Lion King:", lion_king_recommendations)
print("Recommendations for The Incredibles:", incredibles_recommendations)


Recommendations for Heat: []
Recommendations for Eraser: []
Recommendations for Lion King: []
Recommendations for The Incredibles: []


In [10]:
import pandas as pd
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import GridSearchCV

# Load and preprocess the dataset
ratings_df = pd.read_csv("ratings.csv")
movies_df = pd.read_csv("movies.csv")

# Question 6: Filter out "Action" genre movies and calculate item similarity with Pearson correlation coefficient
action_movies = movies_df[movies_df['genres'].str.contains('Action')]

# Create a new DataFrame with ratings for "Action" genre movies
action_ratings = ratings_df[ratings_df['movieId'].isin(action_movies['movieId'])]

# Create a Surprise dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(action_ratings[['userId', 'movieId', 'rating']], reader)

# Build a similarity matrix using KNNBasic with Pearson correlation
sim_options = {
    'name': 'pearson',
    'user_based': False
}

knn_model = KNNBasic(sim_options=sim_options)
knn_model.fit(data)

# Recommend top 5 similar movies to "Heat" and "Eraser"
movie_names = ['Heat', 'Eraser']
for movie_name in movie_names:
    movie_id = movies_df[movies_df['title'] == movie_name]['movieId'].values[0]
    similar_movies = knn_model.get_neighbors(movie_id, k=5)
    similar_movie_titles = [movies_df[movies_df['movieId'] == movie]['title'].values[0] for movie in similar_movies]
    print(f"Top 5 similar movies to '{movie_name}': {similar_movie_titles}")


ModuleNotFoundError: No module named 'surprise'

In [12]:
# Question 7: Filter out "Animation" or "Children" genre movies and calculate item similarity with cosine similarity
animation_children_movies = movies_df[movies_df['genres'].str.contains('Animation|Children')]

# Create a new DataFrame with ratings for "Animation" or "Children" genre movies
animation_children_ratings = ratings_df[ratings_df['movieId'].isin(animation_children_movies['movieId'])]

# Create a Surprise dataset
data = Dataset.load_from_df(animation_children_ratings[['userId', 'movieId', 'rating']], reader)

# Build a similarity matrix using KNNBasic with cosine similarity
sim_options = {
    'name': 'cosine',
    'user_based': False
}

knn_model = KNNBasic(sim_options=sim_options)
knn_model.fit(data)

# Recommend top 5 similar movies to "Lion King" and "The Incredibles"
movie_names = ['Lion King', 'The Incredibles']
for movie_name in movie_names:
    movie_id = movies_df[movies_df['title'] == movie_name]['movieId'].values[0]
    similar_movies = knn_model.get_neighbors(movie_id, k=5)
    similar_movie_titles = [movies_df[movies_df['movieId'] == movie]['title'].values[0] for movie in similar_movies]
    print(f"Top 5 similar movies to '{movie_name}': {similar_movie_titles}")

NameError: name 'movies_df' is not defined

In [11]:
# Question 8: Use GridSearchCV to find the best model for recommendations in "Action" genre
param_grid = {
    'k': [5, 10, 20],
    'sim_options': {
        'name': ['cosine', 'pearson', 'msd'],
        'user_based': [False]
    }
}

grid_search = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
grid_search.fit(data)

# Get the best parameters and model
best_params = grid_search.best_params
best_model = grid_search.best_estimator

print(f"Best parameters: {best_params}")
print(f"Best model: {best_model}")

NameError: name 'GridSearchCV' is not defined