## Data Preprocessing + Transforming the data into transactions table

In [None]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/CS412_project/Dataset/filtered_data.txt')
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,rating,movie_id
0,0,1488844,3.0,1
1,1,822109,5.0,1
2,2,885013,4.0,1
3,3,30878,4.0,1
4,4,823519,3.0,1


In [None]:
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,user_id,rating,movie_id
0,1488844,3.0,1
1,822109,5.0,1
2,885013,4.0,1
3,30878,4.0,1
4,823519,3.0,1


In [None]:
df = df.rename(columns={"user_id": "Cust_Id", "rating": "Rating", "movie_id": "Movie_Id"})

### Run this part for cleaning up the raw dataset 

In [None]:
df = pd.read_csv('/content/drive/MyDrive/CS412_project/Dataset/combined_data_1.txt', header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1])

df['Rating'] = df['Rating'].astype(float)

print('Dataset 1 shape: {}'.format(df.shape))
print('-Dataset examples-')
print(df.iloc[::5000000, :])

Dataset 1 shape: (24058263, 2)
-Dataset examples-
          Cust_Id  Rating
0              1:     NaN
5000000   2560324     4.0
10000000  2271935     2.0
15000000  1921803     2.0
20000000  1933327     3.0


In [None]:
df.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [None]:
df_nan = pd.DataFrame(pd.isnull(df.Rating))
df_nan = df_nan[df_nan['Rating'] == True]
df_nan = df_nan.reset_index()

movie_np = []
movie_id = 1

for i,j in zip(df_nan['index'][1:],df_nan['index'][:-1]):
    # numpy approach
    temp = np.full((1,i-j-1), movie_id)
    movie_np = np.append(movie_np, temp)
    movie_id += 1

# Account for last record and corresponding length
# numpy approach
last_record = np.full((1,len(df) - df_nan.iloc[-1, 0] - 1),movie_id)
movie_np = np.append(movie_np, last_record)

print('Movie numpy: {}'.format(movie_np))
print('Length: {}'.format(len(movie_np)))

Movie numpy: [1.000e+00 1.000e+00 1.000e+00 ... 4.499e+03 4.499e+03 4.499e+03]
Length: 24053764


In [None]:
df = df[pd.notnull(df['Rating'])]

df['Movie_Id'] = movie_np.astype(int)
df['Cust_Id'] = df['Cust_Id'].astype(int)
print('-Dataset examples-')
print(df.iloc[::5000000, :])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


-Dataset examples-
          Cust_Id  Rating  Movie_Id
1         1488844     3.0         1
5000996    501954     2.0       996
10001962   404654     5.0      1962
15002876   886608     2.0      2876
20003825  1193835     2.0      3825


In [None]:
f = ['count','mean']

df_movie_summary = df.groupby('Movie_Id')['Rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

print('Movie minimum times of review: {}'.format(movie_benchmark))

df_cust_summary = df.groupby('Cust_Id')['Rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.7),0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

print('Customer minimum times of review: {}'.format(cust_benchmark))

Movie minimum times of review: 1636.0
Customer minimum times of review: 184.0


In [None]:
print('Original Shape: {}'.format(df.shape))
df = df[~df['Movie_Id'].isin(drop_movie_list)]
df = df[~df['Cust_Id'].isin(drop_cust_list)]
print('After Trim Shape: {}'.format(df.shape))
print('-Data Examples-')
print(df.iloc[::5000000, :])

Original Shape: (54169215, 3)
After Trim Shape: (32942914, 3)
-Data Examples-
          Cust_Id  Rating  Movie_Id
2670      1447783     4.0         8
8214035   1014069     5.0      2874
16468210   524295     4.0      5587
24617627   684098     5.0      8204
32742396  2203094     5.0     11152
41025502  2294728     4.0     13784
49248240  1725181     4.0     16265


In [None]:
df.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id
2670,1447783,4.0,8
2671,588844,5.0,8
2674,2330282,4.0,8
2675,2098867,4.0,8
2676,56069,5.0,8


In [None]:
df = df[df["Rating"]>3]

In [None]:
from pathlib import Path  
filepath = Path('/content/drive/MyDrive/CS412_project/PatternMining/gt3rating.csv')  
df.to_csv(filepath)

NameError: ignored

In [None]:
df = pd.read_csv('/content/drive/MyDrive/CS412_project/PatternMining/gt3rating.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Cust_Id,Rating,Movie_Id
0,1,822109,5.0,1
1,2,885013,4.0,1
2,3,30878,4.0,1
3,6,124105,4.0,1
4,8,1842128,4.0,1


In [None]:
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id
0,822109,5.0,1
1,885013,4.0,1
2,30878,4.0,1
3,124105,4.0,1
4,1842128,4.0,1


In [None]:
df["Movie_Id"].nunique()

13077

In [None]:
df["Cust_Id"].nunique()

324450

In [None]:
import random

In [None]:
unique_movies = df["Movie_Id"].unique()
sample_movies = random.sample(list(unique_movies), 5000)

In [14]:
df.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id
0,822109,5.0,1
1,885013,4.0,1
2,30878,4.0,1
3,124105,4.0,1
4,1842128,4.0,1


In [16]:
len(df)

54169215

In [15]:
df_p = pd.pivot_table(df,values='Rating',index='Cust_Id',columns='Movie_Id')

print(df_p.shape)

ValueError: ignored

In [None]:
df_p.head()

Movie_Id,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
Cust_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,,,
7,,5.0,,,,,4.0,5.0,,,...,,,,5.0,,,,,,
79,,,,,,,,,,,...,4.0,,,,,,4.0,,,
97,,,,,,,,,,,...,,,,,,,,,,
134,,,,,,,5.0,,,,...,,,,,,,,,,


In [None]:
from pathlib import Path  
filepath = Path('/content/drive/MyDrive/CS412_project/PatternMining/out.csv')  
df_p.to_csv(filepath) 

In [None]:
trans = pd.read_csv('/content/drive/MyDrive/CS412_project/PatternMining/out.csv')
trans.head()

Unnamed: 0,Cust_Id,3,8,16,17,18,26,28,30,32,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
0,6,,,,,,,,,,...,,,,,,,,,,
1,7,,5.0,,,,,4.0,5.0,,...,,,,5.0,,,,,,
2,79,,,,,,,,,,...,4.0,,,,,,4.0,,,
3,97,,,,,,,,,,...,,,,,,,,,,
4,134,,,,,,,5.0,,,...,,,,,,,,,,


In [None]:
trans=trans.drop(columns=['Cust_Id'])

In [None]:
trans = trans.replace(np.nan, False)
trans = trans.replace(4.0, True)
trans = trans.replace(5.0, True)
trans.head()

Unnamed: 0,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,True,True,False,False,...,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
from pathlib import Path  
filepath = Path('/content/drive/MyDrive/CS412_project/PatternMining/transactions.csv')  
trans.to_csv(filepath, index=False) 

## Reading the transactions dataset and doing the pattern mining

In [None]:
df = pd.read_csv('/content/drive/MyDrive/CS412_project/PatternMining/transactions.csv')
df.head()

Unnamed: 0,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,True,True,False,False,...,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
!pip install mlxtend --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mlxtend
  Downloading mlxtend-0.21.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 4.1 MB/s 
Installing collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.14.0
    Uninstalling mlxtend-0.14.0:
      Successfully uninstalled mlxtend-0.14.0
Successfully installed mlxtend-0.21.0


In [None]:
from mlxtend.frequent_patterns import fpgrowth

patterns = fpgrowth(df, min_support=0.2)
patterns

Unnamed: 0,support,itemsets
0,0.643122,(587)
1,0.618833,(746)
2,0.615640,(1291)
3,0.573323,(1199)
4,0.570137,(857)
...,...,...
429,0.201546,"(1291, 534)"
430,0.216967,"(650, 587)"
431,0.208016,"(650, 1291)"
432,0.200514,"(587, 532)"


In [None]:
for index, row in patterns.iterrows():
    patterns['itemsets_list'][index] = list(patterns['itemsets'][index])
patterns.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,support,itemsets,itemsets_list
0,0.643122,(587),[587]
1,0.618833,(746),[746]
2,0.61564,(1291),[1291]
3,0.573323,(1199),[1199]
4,0.570137,(857),[857]


In [None]:
patterns[patterns.itemsets.map(len)==4]

Unnamed: 0,support,itemsets
90,0.249929,"(587, 746, 1291, 1199)"
98,0.241458,"(1291, 857, 746, 587)"
102,0.227703,"(1291, 857, 587, 1199)"
103,0.225466,"(857, 746, 1291, 1199)"
104,0.220028,"(857, 746, 587, 1199)"
113,0.238028,"(1291, 746, 587, 837)"
117,0.231524,"(857, 746, 1291, 837)"
118,0.22343,"(1291, 857, 587, 837)"
119,0.218522,"(857, 746, 587, 837)"
124,0.227431,"(746, 587, 837, 1199)"


In [None]:
filepath = Path('/content/drive/MyDrive/CS412_project/PatternMining/patterns_sup02.csv')  
patterns.to_csv(filepath, index=False) 

In [None]:
movies = pd.read_csv('/content/drive/MyDrive/CS412_project/Dataset/movie_titles.csv', encoding='latin-1')
movies.head()

ParserError: ignored

In [None]:
patterns[patterns.itemsets_list.map(len)>3]

Unnamed: 0,support,itemsets,itemsets_list
0,0.643122,frozenset({587}),[587]
1,0.618833,frozenset({746}),[746]
2,0.615640,frozenset({1291}),[1291]
3,0.573323,frozenset({1199}),[1199]
4,0.570137,frozenset({857}),[857]
...,...,...,...
429,0.201546,"frozenset({1291, 534})","[1291, 534]"
430,0.216967,"frozenset({650, 587})","[650, 587]"
431,0.208016,"frozenset({650, 1291})","[650, 1291]"
432,0.200514,"frozenset({587, 532})","[587, 532]"
