# CSC495 - DataMining - HW#2 - Rinty Chowdhury

In [1486]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

Importing dataframe

In [1487]:
# Read the tab-separated movie catalogue; row 0 of the file supplies the column names.
movies_data = pd.read_csv(
    './ARM/Movies.tsv',
    header=0,
    sep='\t',
)
# Preview the first few rows.
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [1488]:
# Read the tab-separated user ratings; row 0 of the file supplies the column names.
ratings_data = pd.read_csv(
    './ARM/Ratings.tsv',
    header=0,
    sep='\t',
)
# Preview the first few rows.
ratings_data.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


Merging the ratings and movies dataframes

In [1489]:
# Attach title/genres to every rating row.
# The join key is named explicitly: without `on`, pandas joins on *every*
# column name the two frames share, so an extra shared column (for example a
# saved index column) would silently change which rows match.
merged_data = pd.merge(ratings_data, movies_data, on='movieId', how='inner')
merged_data.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,Jumanji (1995),Adventure|Children|Fantasy


Filter the dataframe based on movies rated greater than or equal to 4.

In [1490]:
# Keep only the rows whose rating is 4 or higher ("highly rated").
highly_rated_data = merged_data.loc[merged_data['rating'].ge(4)]

Sort the dataframe based on user id.

In [1491]:
# Order the highly rated rows by user id (ascending is the sort_values default).
sorted_data = highly_rated_data.sort_values('userId')
sorted_data.head()

Unnamed: 0,userId,movieId,rating,title,genres
1316,1,4306,4.0,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
1246,1,3489,4.0,Hook (1991),Adventure|Comedy|Fantasy
1243,1,3479,4.0,Ladyhawke (1985),Adventure|Fantasy|Romance
1230,1,3153,4.0,"7th Voyage of Sinbad, The (1958)",Action|Adventure|Fantasy
1224,1,3081,4.0,Sleepy Hollow (1999),Fantasy|Horror|Mystery|Romance


Creating the transaction list from the given dataframe.

In [1492]:
# One transaction per user: the list of titles that user rated highly.
transaction_data = (
    sorted_data
    .groupby('userId')['title']
    .apply(list)
    .tolist()
)

Getting the frequent itemsets from the transaction list, using a min-support of 0.1.

In [1493]:
# Encode the per-user title lists into a table with one column per distinct title.
te = TransactionEncoder()
te.fit(transaction_data)
te_ary = te.transform(transaction_data)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Mine itemsets with support of at least 0.1 and list the most frequent first.
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True).sort_values(
    by='support', ascending=False
)
frequent_itemsets.head()

Unnamed: 0,support,itemsets
75,0.417582,"(Shawshank Redemption, The (1994))"
30,0.395604,(Forrest Gump (1994))
67,0.362637,(Pulp Fiction (1994))
78,0.340659,"(Silence of the Lambs, The (1991))"
15,0.318681,(Braveheart (1995))


Creating the confidence metric from the frequent itemsets using min-threshold of 0.8

In [1494]:
# Generate rules whose confidence is at least 0.8.
confident_metric = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)

# Keep only rules with fewer than two items on each side, then rank by confidence.
# A boolean mask is used so no helper length columns are added to the frame.
confident_metric = confident_metric[
    (confident_metric["antecedents"].map(len) < 2)
    & (confident_metric["consequents"].map(len) < 2)
].sort_values(by='confidence', ascending=False)
confident_metric.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
15,(Speed (1994)),(Terminator 2: Judgment Day (1991)),0.153846,0.296703,0.153846,1.0,3.37037,0.108199,inf
25,(Reservoir Dogs (1992)),"(Silence of the Lambs, The (1991))",0.142857,0.340659,0.142857,1.0,2.935484,0.094192,inf
37,"(Godfather: Part II, The (1974))","(Godfather, The (1972))",0.153846,0.230769,0.142857,0.928571,4.02381,0.107354,10.769231
94,(Reservoir Dogs (1992)),"(Usual Suspects, The (1995))",0.142857,0.252747,0.131868,0.923077,3.652174,0.095761,9.714286
88,(Reservoir Dogs (1992)),(Pulp Fiction (1994)),0.142857,0.362637,0.131868,0.923077,2.545455,0.080063,8.285714


Replacing the title of each movie with its first listed genre.

In [1495]:
# Replace each movie title with the first genre listed for that movie.
# Element 0 of the split is taken directly, which works for any number of
# genres per movie. Unpacking the split into a fixed list of seven column names
# raises a ValueError whenever the widest genre list is not exactly seven
# entries, and cleaning up with dropna(axis='columns') can also discard
# unrelated columns that happen to contain a missing value.
first_genre = sorted_data['genres'].str.split('|').str[0]

# Column '1' is kept next to 'title' so the resulting frame has the same
# columns (userId, movieId, rating, title, genres, 1) as before.
sorted_data = sorted_data.assign(**{'1': first_genre, 'title': first_genre})
sorted_data = sorted_data.sort_values(by='userId', ascending=True)
sorted_data.head()

Unnamed: 0,userId,movieId,rating,title,genres,1
1316,1,4306,4.0,Adventure,Adventure|Animation|Children|Comedy|Fantasy|Ro...,Adventure
1380,1,5026,4.0,Action,Action|Mystery|Thriller,Action
1362,1,4993,5.0,Adventure,Adventure|Fantasy,Adventure
1358,1,4911,4.0,Adventure,Adventure|Comedy|Fantasy,Adventure
1350,1,4896,4.0,Adventure,Adventure|Children|Fantasy,Adventure


Creating the transaction list from the genre dataframe.

In [1496]:
# One transaction per user: the list of values now stored in 'title'
# (the genre labels) for that user's highly rated movies.
genre_transaction_data = (
    sorted_data
    .groupby('userId')['title']
    .apply(list)
    .tolist()
)

Getting the frequent itemsets from the genre transaction list using min-support of 0.4.

In [1497]:
# Encode the per-user genre lists into a table with one column per distinct genre.
genre_te = TransactionEncoder()
genre_te_ary = genre_te.fit(genre_transaction_data).transform(genre_transaction_data)
# Column labels must come from the encoder fitted in this cell (`genre_te`).
# The name `te2` is not defined anywhere in the notebook, so using it raises a
# NameError on a fresh kernel.
genre_df = pd.DataFrame(genre_te_ary, columns=genre_te.columns_)

# Mine itemsets with support of at least 0.4 and list the most frequent first.
genre_frequent_itemsets = apriori(genre_df, min_support=0.4, use_colnames=True)
genre_frequent_itemsets = genre_frequent_itemsets.sort_values(by='support', ascending=False)
genre_frequent_itemsets.head()

Unnamed: 0,support,itemsets
4,0.978022,(Comedy)
0,0.956044,(Action)
6,0.956044,(Drama)
26,0.945055,"(Comedy, Drama)"
11,0.934066,"(Comedy, Action)"


Creating the genre confidence metric from the frequent itemsets using min-threshold of 0.9

In [1498]:
# Generate genre rules whose confidence is at least 0.9.
genre_confident_metric = association_rules(genre_frequent_itemsets, metric="confidence", min_threshold=0.9)

# Keep only rules with fewer than two items on each side, then rank by confidence.
# A boolean mask is used so no helper length columns are added to the frame.
genre_confident_metric = genre_confident_metric[
    (genre_confident_metric["antecedents"].map(len) < 2)
    & (genre_confident_metric["consequents"].map(len) < 2)
].sort_values(by='confidence', ascending=False)
genre_confident_metric.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
233,(Mystery),(Drama),0.417582,0.956044,0.417582,1.0,1.045977,0.018355,inf
232,(Mystery),(Comedy),0.417582,0.978022,0.417582,1.0,1.022472,0.009178,inf
225,(Mystery),(Adventure),0.417582,0.901099,0.417582,1.0,1.109756,0.041299,inf
189,(Mystery),(Action),0.417582,0.956044,0.417582,1.0,1.045977,0.018355,inf
13,(Adventure),(Comedy),0.901099,0.978022,0.901099,1.0,1.022472,0.019804,inf


### Description of  choice of algorithm, interestingness measures, and thresholds: 
I used the Apriori algorithm for data mining because, for my data set, it was the better choice. It reduced the number of items in my data. Even though the FP-tree algorithm is better in the long run, Apriori is efficient enough for my data mining. It also removed unnecessary transactions and comparisons. That is why I used the Apriori algorithm. For interestingness measures, I used a min-support of 0.1 and the confidence metric with a min-threshold of 0.8 for movie title rule mining. I used a min-support of 0.1 for title rule mining because it gave me more frequent itemsets, which I needed for my rule mining. Then I used a min-threshold of 0.8 to build my confidence metric; with anything higher than 0.8, I did not get enough rules to look at. I used a min-support of 0.4 and the confidence metric with a min-threshold of 0.9 for movie genre rule mining. A min-support of 0.4 gave enough frequent itemsets for my rule mining. I also used a confidence of 0.9 because higher confidence is better for rule mining.

### Description of top 20 relationships:
Based on the title rule mining, Speed and Terminator 2: Judgment Day have 15% support. This means the two movies appear together in 15% of the transactions in the dataset. They have 100% confidence, which means that if a user rates Speed highly, then that user also rates Terminator 2: Judgment Day highly 100% of the time. This relationship holds for every pair of antecedents and consequents. Another interesting fact about this rule mining is that every pair of antecedents and consequents has a movie release-year gap of at most 4-5 years. This means that if a user rates a movie from a certain year highly, then there is a minimum 83% confidence that the user will also rate highly movies released in the next 4-5 years. Based on the genre rule mining, Mystery and Drama have 42% support. This means the two genres appear together in 42% of the transactions in the dataset. They have 100% confidence, which means that if a user rates Mystery highly, then that user also rates Drama highly 100% of the time. This relationship is true for every pair. There are some other interesting relations between genres. For example, if a user rates the mystery genre highly, then there is a minimum 95% confidence that the user will also rate the drama, comedy, adventure, and action genres highly. Also, if a user rates the adventure genre highly, then there is a minimum 95% confidence that the user will also rate the comedy, drama, and action genres highly. If a user rates the animation genre highly, then there is a minimum 95% confidence that the user will also rate the drama, comedy, and adventure genres highly. If a user rates the crime genre highly, then there is a minimum 95% confidence that the user will also rate the drama and comedy genres highly. If a user rates the drama genre highly, then there is a minimum 95% confidence that the user will also rate the comedy genre highly. If a user rates the children genre highly, then there is a minimum 95% confidence that the user will also rate the drama, comedy, and action genres highly.
If a user rates the action genre highly, then there is a minimum 95% confidence that the user will also rate the drama and comedy genres highly. If a user rates the comedy genre highly, then there is a minimum 95% confidence that the user will also rate the drama and action genres highly.