# Big Data Analytics
## Recommendation system using ALS and KNN algorithms



In [14]:
# importing packages
import nltk
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

import time
import math
import json
import os

import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

  import pandas.util.testing as tm


In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [4]:
#Importing the data as a pandas dataframe and printing its shape
import pandas as pd
anime_list = pd.read_csv("/content/AnimeMovies.csv", error_bad_lines=False) 
anime_list.shape

(135201, 2)

In [0]:
anime_list.head(10)

Unnamed: 0,workId,workName
0,22429,Itadaki_Seieki♥
1,35366,Eiga_Ensetsu_Seiji_no_Rinrika_Gotou_Shinpei_1926
2,11827,Kuroinu__Kedakaki_Seijo_wa_Hakudaku_ni_Somaru
3,3050,Panchira_Teacher
4,3220,Kanashimi_no_Belladonna
5,38409,Cike_Wu_Liuqi
6,18693,Tenioha__Onnanoko_datte_Honto_wa_Ecchi_da_yo
7,3560,Karen
8,1639,Boku_no_Pico
9,4310,Joshidai__Ecchi_Soudanshitsu


In [5]:
#Importing the data as a pandas dataframe and printing its shape
import pandas as pd
anime_reviews = pd.read_csv("/content/AnimeRatings.csv", error_bad_lines=False) 
anime_reviews.shape

(135201, 3)

In [0]:
anime_reviews.dtypes

UserId           int64
workId           int64
overallRating    int64
dtype: object

In [0]:
anime_reviews.head(10)

Unnamed: 0,UserId,workId,overallRating
0,101,22429,4
1,102,35366,4
2,103,11827,3
3,104,3050,2
4,105,3220,7
5,106,38409,9
6,107,18693,9
7,107,3560,5
8,108,1639,1
9,109,4310,1


In [0]:
len(anime_reviews.overallRating.unique())

10

# ALS matrix Factorization using pyspark dataframe

In [0]:
# Converting Pandas Dataframe "anime_reviews" into Spark Dataframe "anime_sdf" and printing the first 10 rows

anime_reviews = anime_reviews.astype(str) # Converting pandas df to string first
anime_sdf = spark.createDataFrame(anime_reviews)
anime_sdf.show(10, False) # False allows us to show entire content of the columns

+------+------+-------------+
|UserId|workId|overallRating|
+------+------+-------------+
|101   |22429 |4            |
|102   |35366 |4            |
|103   |11827 |3            |
|104   |3050  |2            |
|105   |3220  |7            |
|106   |38409 |9            |
|107   |18693 |9            |
|107   |3560  |5            |
|108   |1639  |1            |
|109   |4310  |1            |
+------+------+-------------+
only showing top 10 rows



In [0]:
# Checking the datatype of our Spark dataframe file anime_sdf
print(type(anime_sdf))

<class 'pyspark.sql.dataframe.DataFrame'>


In [0]:
# Checking the datatypes of each variable in anime_sdf
anime_sdf.printSchema()

root
 |-- workId: string (nullable = true)
 |-- overallRating: string (nullable = true)
 |-- UserId: string (nullable = true)



In [0]:
# Converting data type of UserID, workId and oversallRating into Integer Type
from pyspark.sql.types import *

anime_sdf = anime_sdf.withColumn("User", anime_sdf["UserId"].cast(IntegerType())).drop("UserId").withColumnRenamed("User", "UserId")
anime_sdf = anime_sdf.withColumn("AnimeId", anime_sdf["workId"].cast(IntegerType())).drop("workId").withColumnRenamed("AnimeId", "workId")
anime_sdf = anime_sdf.withColumn("Rating", anime_sdf["overallRating"].cast(IntegerType())).drop("overallRating").withColumnRenamed("Rating", "overallRating")


In [0]:
# import libraries
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.sql import Row


In [0]:
# Training and test split
training_RDD, test_RDD = anime_sdf.randomSplit([.8, .2], seed=10)

In [0]:
training_RDD.show(10)

+------+------+-------------+
|UserId|workId|overallRating|
+------+------+-------------+
|   101|  2476|            5|
|   101| 22429|            4|
|   102|   499|            5|
|   102|   578|            6|
|   102|   619|            4|
|   102|   885|            8|
|   102|  1076|            5|
|   102|  1207|            4|
|   102|  1208|            7|
|   102|  1324|            4|
+------+------+-------------+
only showing top 10 rows



In [0]:
movieRecs.show(10)

+------+--------------------+
|workId|     recommendations|
+------+--------------------+
|  1580|[[11572, 21.28303...|
|  5300|[[41966, 25.82989...|
|   471|[[41000, 15.52625...|
|  1591|[[12987, 13.53316...|
| 11141|[[12041, 13.31108...|
|  1342|[[30850, 16.30801...|
|  2142|[[8883, 19.51063]...|
| 35982|[[11960, 19.59297...|
| 38422|[[12041, 15.77091...|
|   463|[[32865, 17.54358...|
+------+--------------------+
only showing top 10 rows



## Cross Validation for Parameter Tuning of ALS model



In [0]:
# Let's initialize our ALS learner
alstune = ALS()

# Now we set the parameters for the method
alstune.setMaxIter(25)\
.setItemCol('workId')\
.setRatingCol('overallRating')\
.setUserCol('UserId')

ALS_d5c41aa98e75

In [0]:
# Now let's compute an evaluation metric for our test dataset
from pyspark.ml.evaluation import RegressionEvaluator

# Create an RMSE evaluator using the label and predicted columns
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="overallRating", metricName="rmse")

tolerance = 0.03
ranks = [x for x in range(10,100, 10)]
regularizer = [0.01,0.05, 0.1,0.2,0.5]
errors = [0 for x in ranks]
models = [0 for x in ranks]

innerpar = [0 for x in regularizer]
err = 0
min_error = float('inf')
best_rank = -1
best_par = -10
for rank in ranks:
  # Set the rank here:
  alstune.setRank(rank)

  regs = 0
  for regPar in regularizer:
    alstune.setRegParam(regPar)
    # Create the model with these parameters.
    model = alstune.fit(training_RDD)
    # Run the model to create a prediction. Predict against the validation_df.
    predict_df = model.transform(test_RDD)
    # Remove NaN values from prediction (due to SPARK-14489)
    predicted_ratings_df = predict_df.filter(predict_df.prediction != float('nan'))
    # Run the previously created RMSE evaluator, reg_eval, on the predicted_ratings_df DataFrame
    error = reg_eval.evaluate(predicted_ratings_df)
    innerpar[regs] = error
    
    print ('For rank %s and regPar %s the RMSE is %s' % (rank, regPar, error))
    if error < min_error:
      min_error = error
      best_rank = err
      best_par = regs
    
    regs+=1

  errors[err] = innerpar
  err += 1
  
      
alstune.setRank(ranks[best_rank]).setRegParam(regularizer[best_par])
print ('The best model was trained with rank %s and regularizing parameter %s' % (ranks[best_rank], regularizer[best_par]) )


For rank 10 and regPar 0.01 the RMSE is 4.443476328509145
For rank 10 and regPar 0.05 the RMSE is 3.0623677426752742
For rank 10 and regPar 0.1 the RMSE is 2.74313223798718
For rank 10 and regPar 0.2 the RMSE is 2.5122759252839413
For rank 10 and regPar 0.5 the RMSE is 2.3415304943892
For rank 20 and regPar 0.01 the RMSE is 3.843406575801375
For rank 20 and regPar 0.05 the RMSE is 2.740487923325375
For rank 20 and regPar 0.1 the RMSE is 2.5946525968964536
For rank 20 and regPar 0.2 the RMSE is 2.462899913817558
For rank 20 and regPar 0.5 the RMSE is 2.325755445153372
For rank 30 and regPar 0.01 the RMSE is 3.442050492926527
For rank 30 and regPar 0.05 the RMSE is 2.651383240069091
For rank 30 and regPar 0.1 the RMSE is 2.5603799067720776
For rank 30 and regPar 0.2 the RMSE is 2.4503689846194083
For rank 30 and regPar 0.5 the RMSE is 2.3190955645034186
For rank 40 and regPar 0.01 the RMSE is 3.165018474058281
For rank 40 and regPar 0.05 the RMSE is 2.613736713859944
For rank 40 and regP

In [0]:
# Training the model
als = ALS(maxIter=25,regParam=0.5,rank=80,itemCol="workId",userCol="UserId",ratingCol="overallRating",coldStartStrategy="drop",nonnegative=True)
model = als.fit(training_RDD)

In [0]:
predictions = model.transform(test_RDD)
predictions.show(40)

+------+------+-------------+----------+
|UserId|workId|overallRating|prediction|
+------+------+-------------+----------+
|  1056|   148|            9| 6.6317906|
|   416|   463|            6|  4.578002|
| 21964|   463|            6| 4.2286305|
|  8055|   496|            8| 6.4534717|
| 23047|  1088|            9|   8.96543|
|  4180|  1088|            9|  8.631317|
|  2350|  1088|            2|  6.150444|
| 15436|  1088|            8|   7.64032|
|  4028|  1238|            9|  9.033456|
|  3750|  1342|            8|   4.71697|
| 15782|  1342|            1| 2.0294123|
|   147|  1645|            5| 3.4304492|
| 29002|  1829|            8| 6.5161796|
| 19486|  1829|            6| 6.3939776|
|  1302|  1829|            8|  5.816959|
|  4247|  1829|           10| 7.0332904|
|  5064|  1829|            4| 5.8163476|
|  3916|  2142|            7|  4.567028|
|   107|  3918|            8|  7.134804|
|   372|  5300|            8|  8.997341|
| 20634|  5300|            9| 9.1995325|
|   767|  5300| 

In [0]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test_RDD)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overallRating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

Root-mean-square error = 2.233732092751394
+------+--------------------+
|UserId|     recommendations|
+------+--------------------+
|   463|[[1238, 10.633916...|
|   471|[[1238, 8.032571]...|
|   496|[[34591, 10.53226...|
|   833|[[681, 4.5473504]...|
|  1088|[[1081, 10.270393...|
|  1238|[[1081, 9.687137]...|
|  1342|[[2070, 2.0770152...|
|  1580|[[413, 9.081915],...|
|  1591|[[1238, 9.322596]...|
|  1645|[[1238, 9.382253]...|
|  1829|[[1238, 9.756073]...|
|  1959|[[1238, 5.742684]...|
|  2122|[[1238, 10.105022...|
|  2142|[[1238, 11.369951...|
|  2866|[[1238, 10.662142...|
|  3175|[[2070, 6.7605], ...|
|  3749|[[1238, 9.451535]...|
|  3918|[[12917, 2.905894...|
|  3997|[[1238, 9.413797]...|
|  4101|[[1238, 10.890756...|
+------+--------------------+
only showing top 20 rows



In [0]:
# Converting in the following format: userID<\tab>itemID1,itemID2,itemID3 ...,itemID10
my_userRecs1 = userRecs.withColumn("Movie_1", userRecs["recommendations"].getItem(0))\
.withColumn("Movie_2", userRecs["recommendations"].getItem(1))\
.withColumn("Movie_3", userRecs["recommendations"].getItem(2))\
.withColumn("Movie_4", userRecs["recommendations"].getItem(3))\
.withColumn("Movie_5", userRecs["recommendations"].getItem(4))\
.withColumn("Movie_6", userRecs["recommendations"].getItem(5))\
.withColumn("Movie_7", userRecs["recommendations"].getItem(6))\
.withColumn("Movie_8", userRecs["recommendations"].getItem(7))\
.withColumn("Movie_9", userRecs["recommendations"].getItem(8))\
.withColumn("Movie_10", userRecs["recommendations"].getItem(9))
my_userRecs1.show()

+------+--------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|UserId|     recommendations|          Movie_1|           Movie_2|           Movie_3|           Movie_4|           Movie_5|           Movie_6|           Movie_7|           Movie_8|           Movie_9|          Movie_10|
+------+--------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|   463|[[1238, 10.633916...|[1238, 10.633916]| [1081, 10.584258]|[2070, 10.1859045]| [28957, 9.942186]| [34591, 9.939552]|   [631, 9.902186]| [34500, 9.902186]| [35247, 9.840441]|   [605, 9.771796]| [37596, 9.768624]|
|   471|[[1238, 8.032571]...| [1238, 8.032571]|  [631, 7.9807615]|[34500, 7.9807615]|  [2070, 7.806849]| [1081, 7.6028767]| 

In [0]:
# Converting in the following format: userID<\tab>itemID1,itemID2,itemID3 ...,itemID10
from pyspark.sql import Row
my_userRecs2 = my_userRecs1.select(my_userRecs1.UserId, my_userRecs1.Movie_1.getField("workId").alias("Movie_1")\
                             ,my_userRecs1.Movie_2.getField("workId").alias("Movie_2")\
                             ,my_userRecs1.Movie_3.getField("workId").alias("Movie_3")\
                             ,my_userRecs1.Movie_4.getField("workId").alias("Movie_4")\
                             ,my_userRecs1.Movie_5.getField("workId").alias("Movie_5")\
                             ,my_userRecs1.Movie_6.getField("workId").alias("Movie_6")\
                             ,my_userRecs1.Movie_7.getField("workId").alias("Movie_7")\
                             ,my_userRecs1.Movie_8.getField("workId").alias("Movie_8")\
                             ,my_userRecs1.Movie_9.getField("workId").alias("Movie_9")\
                             ,my_userRecs1.Movie_10.getField("workId").alias("Movie_10"))
my_userRecs2.show(20)

+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+
|UserId|Movie_1|Movie_2|Movie_3|Movie_4|Movie_5|Movie_6|Movie_7|Movie_8|Movie_9|Movie_10|
+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+
|   463|   1238|   1081|   2070|  28957|  34591|    631|  34500|  35247|    605|   37596|
|   471|   1238|    631|  34500|   2070|   1081|   1014|  10132|    605|   9996|   35180|
|   496|  34591|   1081|   1238|  39415|   1014|  33095|  35247|  28957|  28735|   25303|
|   833|    681|  28105|  26023|   1081|   9435|  35191|   3593|   2070|   2848|   34102|
|  1088|   1081|   1238|   2070|    631|  34500|    605|   3593|  28957|  34591|    1014|
|  1238|   1081|   1238|   3593|   2070|    631|  34500|  10132|   2733|    605|    4017|
|  1342|   2070|  37021|   1081|   1238|    631|  34500|  39415|  10132|  21469|    3100|
|  1580|    413|  34514|   4282|  33993|  34500|    631|   2313|  21831|   1263|    5973|
|  1591|  

In [0]:
import pandas as pd
userRecsCsv = userRecs.toPandas()
userRecsCsv.head()


Unnamed: 0,UserId,recommendations
0,463,"[(15633, 24.748703002929688), (16778, 22.63933..."
1,471,"[(33911, 32.56745910644531), (34652, 29.411079..."
2,496,"[(37283, 20.106273651123047), (39065, 19.82189..."
3,833,"[(3039, 7.203929901123047), (18137, 7.12053775..."
4,1088,"[(8713, 19.652875900268555), (31797, 18.812114..."


In [0]:
#Exporting the Dataframe to CSV file
userRecsCsv.to_csv(r"userRecsCsv.csv", index = False)

# Content Based Filtering using KNN


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
import pandas.util.testing as tm
%matplotlib inline

  import pandas.util.testing as tm


In [0]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [0]:
# importing anime movies dataset
anime_movies = pd.read_csv("/content/AnimeMovies.csv")

In [0]:
anime_movies.head(5)

Unnamed: 0,workId,workName
0,22429,Itadaki_Seieki♥
1,35366,Eiga_Ensetsu_Seiji_no_Rinrika_Gotou_Shinpei_1926
2,11827,Kuroinu__Kedakaki_Seijo_wa_Hakudaku_ni_Somaru
3,3050,Panchira_Teacher
4,3220,Kanashimi_no_Belladonna


In [0]:
# importing anime ratings dataset
anime_ratings = pd.read_csv("AnimeRatings.csv")

In [0]:
anime_ratings.head(5)

Unnamed: 0,UserId,workId,overallRating
0,101,22429,4
1,102,35366,4
2,103,11827,3
3,104,3050,2
4,105,3220,7


In [0]:
# importing anime genre dataset
anime_genre = pd.read_csv("animeListGenres.csv")

In [9]:
# selecting 4 columns from the dataset
anime_genre= anime_genre[['workId', 'engName', 'episodes', 'genres']]
anime_genre.head(5)

Unnamed: 0,workId,engName,episodes,genres
0,5114,Fullmetal Alchemist: Brotherhood,64,"Action, Adventure, Comedy, Drama, Fantasy, Mag..."
1,32281,Your Name.,1,"Romance, Supernatural, School, Drama"
2,9253,Steins;Gate,24,"Thriller, Sci-Fi"
3,28977,Gintama Season 4,51,"Action, Comedy, Historical, Parody, Samurai, S..."
4,11061,Hunter x Hunter,148,"Action, Adventure, Fantasy, Shounen, Super Power"


In [10]:
# importing complete anime dataset
anime_dataset = pd.read_excel("/content/AnimeDatasetCopy.xlsx")
anime_dataset.head(10)

Unnamed: 0,id,workId,reviewId,workName,postTime,episodesSeen,author,UserId,peopleFoundUseful,overallRating,storyRating,animationRating,soundRating,characterRating,enjoymentRating,review
0,1017626,22429,299562,Itadaki_Seiekiâ™¥,2019-01-20 07:47:00,1,Sekaii-San,101,1,4,2,7,5,4,7,Sorry for my bad English ^.^ ( 80% Google tran...
1,1017400,35366,299647,Eiga_Ensetsu_Seiji_no_Rinrika_Gotou_Shinpei_1926,2019-01-20 20:25:00,1,matthigh,102,1,4,0,0,0,0,0,"As far as the 'animation' goes, this 30-minute..."
2,1017555,11827,299698,Kuroinu__Kedakaki_Seijo_wa_Hakudaku_ni_Somaru,2019-01-21 11:34:00,6,Balddog,103,1,3,3,8,8,6,3,"While I have to admit, the art was really good..."
3,1018890,3050,299893,Panchira_Teacher,2019-01-23 21:29:00,2,DarkWolf6211,104,1,2,1,8,7,2,1,"~MAY CONTAIN SOME SPOILERS~Look, I've seen man..."
4,1017910,3220,300241,Kanashimi_no_Belladonna,2019-01-28 09:33:00,1,ZephSilver,105,27,7,7,9,6,9,9,"Spellbound in a whirlwind of love, sex, desire..."
5,1017397,38409,300351,Cike_Wu_Liuqi,2019-01-29 18:31:00,10,Random_Fodder,106,1,9,8,8,10,9,9,Killer seven is an amazing short Chinese anime...
6,1017935,18693,300398,Tenioha__Onnanoko_datte_Honto_wa_Ecchi_da_yo,2019-01-30 08:36:00,2,Animegrin245,107,1,9,5,10,9,9,10,Welcome to my review of \Tenioha!: Onnanoko da...
7,1019584,3560,300403,Karen,2019-01-30 08:50:00,1,Animegrin245,107,1,5,7,4,5,9,5,"Welcome to my review of \Karen\"" ! I will try ..."
8,1020098,1639,300463,Boku_no_Pico,2019-01-31 04:09:00,1,Twiverse,108,1,1,2,4,1,1,1,That was painful to watch. Not gonna lie.Boku ...
9,1020166,4310,300563,Joshidai__Ecchi_Soudanshitsu,2019-01-31 22:03:00,1,thetickdickler,109,1,1,1,1,1,1,1,"This might be the single worst hentai, let alo..."


In [11]:
#Merging anime dataset with genre dataset
new_anime_df = pd.merge(anime_dataset,anime_genre,on='workId')
new_anime_df

Unnamed: 0,id,workId,reviewId,workName,postTime,episodesSeen,author,UserId,peopleFoundUseful,overallRating,storyRating,animationRating,soundRating,characterRating,enjoymentRating,review,engName,episodes,genres
0,1017626,22429,299562,Itadaki_Seiekiâ™¥,2019-01-20 07:47:00,1,Sekaii-San,101,1,4,2,7,5,4,7,Sorry for my bad English ^.^ ( 80% Google tran...,\N,1,"Hentai, Supernatural"
1,1108536,22429,302981,Itadaki_Seiekiâ™¥,2019-03-05 22:21:00,1,DoomRanger,383,1,3,3,7,7,4,5,Where does Neil even begin with this shit? The...,\N,1,"Hentai, Supernatural"
2,125366,22429,139555,Itadaki_Seiekiâ™¥,2014-04-11 09:42:00,1,MrD0nPa0,6011,39,7,5,10,10,7,7,"The first time i saw this hentai, i was very e...",\N,1,"Hentai, Supernatural"
3,125365,22429,166031,Itadaki_Seiekiâ™¥,2014-10-24 16:38:00,1,crxinfinite,7653,315,10,10,10,10,10,10,"I have watched this over 500 times, and each t...",\N,1,"Hentai, Supernatural"
4,125367,22429,168147,Itadaki_Seiekiâ™¥,2014-11-10 19:02:00,1,lemoncat,7707,18,9,7,10,7,10,10,I watched this anime at a whim and instantly f...,\N,1,"Hentai, Supernatural"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135196,6491,34096,245369,Gintama,2017-04-13 00:28:00,12,Theliel,25121,3,7,10,10,10,10,6,They made this season way too short. 12 episod...,Gintama Season 5,12,"Action, Comedy, Historical, Parody, Samurai, S..."
135197,6488,34096,247625,Gintama,2017-05-13 21:50:00,12,agendator,38828,12,4,4,4,3,6,3,"The fall of Gintama, by a fan of the series.Ba...",Gintama Season 5,12,"Action, Comedy, Historical, Parody, Samurai, S..."
135198,6493,34096,249891,Gintama,2017-06-14 08:31:00,12,claudinou,46256,2,8,7,8,9,8,0,"This was a deception for me, Gintama is my fav...",Gintama Season 5,12,"Action, Comedy, Historical, Parody, Samurai, S..."
135199,6494,34096,255938,Gintama,2017-08-21 04:28:00,12,DesolatePsyche,551,1,8,8,8,10,9,8,"First things first. My \reviews\"" system is ex...",Gintama Season 5,12,"Action, Comedy, Historical, Parody, Samurai, S..."


In [0]:
# converting the datatypes to float
new_anime_df["overallRating"] = new_anime_df["overallRating"].astype(float)
new_anime_df["storyRating"] = new_anime_df["storyRating"].astype(float)
new_anime_df["animationRating"] = new_anime_df["animationRating"].astype(float)
new_anime_df["soundRating"] = new_anime_df["soundRating"].astype(float)
new_anime_df["characterRating"] = new_anime_df["characterRating"].astype(float)
new_anime_df["enjoymentRating"] = new_anime_df["enjoymentRating"].astype(float)

new_anime_df["UserId"] = new_anime_df["UserId"].astype(float)

In [0]:

# Creating dummy variables for genres

anime_features = pd.concat([new_anime_df[["UserId"]],new_anime_df["workId"],new_anime_df["overallRating"],
                            new_anime_df["storyRating"], new_anime_df["animationRating"],new_anime_df["soundRating"],
                            new_anime_df["characterRating"], new_anime_df["enjoymentRating"],
                            new_anime_df["genres"].str.get_dummies(sep=",")],axis=1)
#new_anime_df["workName"] = new_anime_df["workName"].map(lambda name:re.sub('[^A-Za-z0-9]+', " ", name))
anime_features.head()



Unnamed: 0,UserId,workId,overallRating,storyRating,animationRating,soundRating,characterRating,enjoymentRating,"Ace no Saigo""",Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,...,Adventure.1,Cars.1,Comedy.1,Dementia.1,Demons.1,Drama.1,Ecchi.1,Fantasy.1,Game.1,Harem.1,Hentai.1,Historical.1,Horror.1,Josei.1,Kids.1,Magic.1,Martial Arts.1,Mecha.1,Military.1,Music.1,Mystery.1,No genres have been added yet.,Parody.1,Police.1,Psychological.1,Romance.1,Samurai.1,School.1,Sci-Fi.1,Seinen.1,Shoujo.1,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,101.0,22429,4.0,2.0,7.0,5.0,4.0,7.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,383.0,22429,3.0,3.0,7.0,7.0,4.0,5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,6011.0,22429,7.0,5.0,10.0,10.0,7.0,7.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,7653.0,22429,10.0,10.0,10.0,10.0,10.0,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,7707.0,22429,9.0,7.0,10.0,7.0,10.0,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:

anime_features.columns

Index(['UserId', 'workId', 'overallRating', 'storyRating', 'animationRating',
       'soundRating', 'characterRating', 'enjoymentRating', ' Ace no Saigo"',
       ' Adventure', ' Cars', ' Comedy', ' Dementia', ' Demons', ' Drama',
       ' Ecchi', ' Fantasy', ' Game', ' Harem', ' Hentai', ' Historical',
       ' Horror', ' Josei', ' Kids', ' Magic', ' Martial Arts', ' Mecha',
       ' Military', ' Music', ' Mystery', ' Parody', ' Police',
       ' Psychological', ' Romance', ' Samurai', ' School', ' Sci-Fi',
       ' Seinen', ' Shoujo', ' Shoujo Ai', ' Shounen', ' Shounen Ai',
       ' Slice of Life', ' Space', ' Sports', ' Super Power', ' Supernatural',
       ' Thriller', ' Vampire', ' Yaoi', ' Yuri', '1', '12', 'Action',
       'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi',
       'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei',
       'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music',
       'Mystery', 'No genres have been a

In [0]:
# Scaling the variables using minmaxscaler
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
anime_features[['overallRating','storyRating', 'animationRating', 'soundRating', 'characterRating','enjoymentRating' ]] = min_max_scaler.fit_transform(anime_features[['overallRating','storyRating', 'animationRating', 'soundRating', 'characterRating','enjoymentRating' ]])

np.round(anime_features,2)

Unnamed: 0,UserId,workId,overallRating,storyRating,animationRating,soundRating,characterRating,enjoymentRating,"Ace no Saigo""",Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,...,Adventure.1,Cars.1,Comedy.1,Dementia.1,Demons.1,Drama.1,Ecchi.1,Fantasy.1,Game.1,Harem.1,Hentai.1,Historical.1,Horror.1,Josei.1,Kids.1,Magic.1,Martial Arts.1,Mecha.1,Military.1,Music.1,Mystery.1,No genres have been added yet.,Parody.1,Police.1,Psychological.1,Romance.1,Samurai.1,School.1,Sci-Fi.1,Seinen.1,Shoujo.1,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,101.0,22429,0.36,0.18,0.64,0.45,0.36,0.64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,383.0,22429,0.27,0.27,0.64,0.64,0.36,0.45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,6011.0,22429,0.64,0.45,0.91,0.91,0.64,0.64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,7653.0,22429,0.91,0.91,0.91,0.91,0.91,0.91,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,7707.0,22429,0.82,0.64,0.91,0.64,0.91,0.91,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135196,25121.0,34096,0.64,0.91,0.91,0.91,0.91,0.55,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
135197,38828.0,34096,0.36,0.36,0.36,0.27,0.55,0.27,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
135198,46256.0,34096,0.73,0.64,0.73,0.82,0.73,0.00,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
135199,551.0,34096,0.73,0.73,0.73,0.91,0.82,0.73,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# Building nearest neighbor model with brute algorithm
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute').fit(anime_features)

In [0]:
distances2, indices2 = model_knn.kneighbors(anime_features)

In [0]:
distances2

array([[0.00000000e+00, 2.97423852e-09, 3.29165140e-09, 3.69692765e-09,
        4.01089828e-09],
       [2.22044605e-16, 2.77987955e-09, 3.40999773e-09, 3.57078589e-09,
        3.83072063e-09],
       [4.44089210e-16, 4.07563416e-09, 4.78852313e-09, 5.52806156e-09,
        5.80768478e-09],
       ...,
       [2.22044605e-16, 2.37334552e-09, 2.55740074e-09, 2.86031443e-09,
        3.00831049e-09],
       [6.66133815e-16, 3.73870723e-09, 3.94730559e-09, 4.41293813e-09,
        4.43119885e-09],
       [2.22044605e-16, 3.31949834e-09, 3.63557551e-09, 3.84722698e-09,
        4.27959190e-09]])

In [0]:
# Building nearest neighbor model with ball_tree algorithm

nbrs1 = NearestNeighbors(n_neighbors=50, algorithm='ball_tree').fit(anime_features)
distances1, indices1 = nbrs1.kneighbors(anime_features)

In [0]:
distances1, indices1 = nbrs1.kneighbors(anime_features)
distances1

array([[0.        , 0.09090934, 0.09110883, ..., 1.0193712 , 1.01948477,
        1.01953762],
       [0.        , 0.09110883, 0.18191117, ..., 1.02103715, 1.02129364,
        1.02230115],
       [0.        , 0.0855695 , 0.10301433, ..., 1.00479832, 1.0048213 ,
        1.00500556],
       ...,
       [0.        , 0.01864483, 0.08546548, ..., 0.35050797, 0.38450937,
        0.39514424],
       [0.        , 0.04421777, 0.06890308, ..., 0.63497584, 0.63502113,
        0.64008583],
       [0.        , 0.01881588, 0.04723307, ..., 0.56034769, 0.5613624 ,
        0.56543398]])

In [0]:
indices1

array([[     0,     12,      1,      2, 108075,     11],
       [     1,      0,     12, 108076, 108075,      2],
       [     2,     11,      5,     10,      6,      7],
       ...,
       [135198, 135191,  51109, 135190,  51124,  51126],
       [135199, 120791,  51130,  51121,  51106, 135194],
       [135200, 135192, 120800, 135193, 120797, 120798]])

In [0]:
def get_index_from_name(workName):
    return new_anime_df[new_anime_df["workName"]==workName].index.tolist()[0]

In [0]:
all_anime_names = list(new_anime_df.workName.values)

In [0]:
def get_id_from_partial_name(partial):
    for name in all_anime_names:
        if partial in workName:
            print(workName,all_anime_names.index(workName))

In [0]:
""" print_similar_query can search for similar animes both by id and by name. """

def print_similar_animes(query=None,id=None):
    if id:
        for id in indices2[id][1:]:
            print(new_anime_df.iloc[id]["workName"])
    if query:
        found_id = get_index_from_name(query)
        for id in indices2[found_id][1:]:
            print(new_anime_df.iloc[id]["workName"])

In [0]:
def print_similar_animes(query=None,id=None):
    if id:
        for id in indices2[id][1:]:
            print(new_anime_df.iloc[id]["workId"])
    if query:
        found_id = get_index_from_name(query)
        for id in indices2[found_id][1:]:
            print(new_anime_df.iloc[id]["workId"])

## Example Results of inputting a movie Id or movie name to get recommendations

In [0]:
print_similar_animes(id=22450)

Fullmetal_Alchemist__Brotherhood
Kuroshitsuji_II
Darker_than_Black__Ryuusei_no_Gemini
Angel_Beats


In [0]:
print_similar_animes(query="Calicula_Machine")

Hina_no_Uta
Boer_no_Mori_e
Aikagi_The_Animation
Kyou_mo_Chappy_End


In [0]:
print_similar_animes(query="Kimi_no_Na_wa")

Zutto_Mae_kara_Suki_deshita__Kokuhaku_Jikkou_Iinkai
Persona_3_the_Movie_4__Winter_of_Rebirth
Anne_Happyâ™ª
Kuusen_Madoushi_Kouhosei_no_Kyoukan__Lecty_no_Ikimonogatari


In [0]:
print_similar_animes(query="Seiren")

3D_Kanojo__Real_Girl
Release_the_Spyce
Tsurune__Kazemai_Koukou_Kyuudoubu
Kimi_no_Suizou_wo_Tabetai


In [0]:
print_similar_animes(id=1557)

Fukigen_na_Mononokean
Sakamoto_Desu_ga
Kizumonogatari_III__Reiketsu-hen
Hibike_Euphonium_2


# Collaborative Filtering using KNN

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
import pandas.util.testing as tm
%matplotlib inline

  import pandas.util.testing as tm


In [0]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [0]:
# Total review count for each anime movie

anime_ratingCount = (anime_dataset.
     groupby(by = ['workName'])['overallRating'].
     count().
     reset_index().
     rename(columns = {'overallRating': 'totalRatingCount'})
     [['workName', 'totalRatingCount']]
    )

anime_ratingCount.head(20)

Unnamed: 0,workName,totalRatingCount
0,0,3
1,1,8
2,100,1
3,1989,1
4,2010,1
5,663114,3
6,009-1,8
7,009_Re_Cyborg,16
8,00_08,23
9,07-Ghost,75


In [0]:
rating_with_totalRatingCount = anime_dataset.merge(anime_ratingCount, left_on = 'workName', right_on = 'workName', how = 'left')
rating_with_totalRatingCount.head()

Unnamed: 0,id,workId,reviewId,workName,postTime,episodesSeen,author,UserId,peopleFoundUseful,overallRating,storyRating,animationRating,soundRating,characterRating,enjoymentRating,review,totalRatingCount
0,1017626,22429,299562,Itadaki_Seiekiâ™¥,2019-01-20 07:47:00,1,Sekaii-San,101,1,4,2,7,5,4,7,Sorry for my bad English ^.^ ( 80% Google tran...,13
1,1017400,35366,299647,Eiga_Ensetsu_Seiji_no_Rinrika_Gotou_Shinpei_1926,2019-01-20 20:25:00,1,matthigh,102,1,4,0,0,0,0,0,"As far as the 'animation' goes, this 30-minute...",1
2,1017555,11827,299698,Kuroinu__Kedakaki_Seijo_wa_Hakudaku_ni_Somaru,2019-01-21 11:34:00,6,Balddog,103,1,3,3,8,8,6,3,"While I have to admit, the art was really good...",21
3,1018890,3050,299893,Panchira_Teacher,2019-01-23 21:29:00,2,DarkWolf6211,104,1,2,1,8,7,2,1,"~MAY CONTAIN SOME SPOILERS~Look, I've seen man...",2
4,1017910,3220,300241,Kanashimi_no_Belladonna,2019-01-28 09:33:00,1,ZephSilver,105,27,7,7,9,6,9,9,"Spellbound in a whirlwind of love, sex, desire...",20


In [0]:
# Let’s look at the statistics of total rating count
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(anime_ratingCount['totalRatingCount'].describe())

count   7868.000
mean      17.184
std       50.194
min        1.000
25%        1.000
50%        3.000
75%       11.000
max     1285.000
Name: totalRatingCount, dtype: float64


In [0]:
# Let’s look at the top of the distribution
print(anime_ratingCount['totalRatingCount'].quantile(np.arange(.9, 1, .01)))

0.900    40.000
0.910    43.000
0.920    49.000
0.930    56.000
0.940    64.000
0.950    75.000
0.960    90.000
0.970   114.000
0.980   149.000
0.990   218.660
Name: totalRatingCount, dtype: float64


In [0]:
# setting a popularity threshold of 30
popularity_threshold = 30
rating_popular_anime = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_anime.head()

Unnamed: 0,id,workId,reviewId,workName,postTime,episodesSeen,author,UserId,peopleFoundUseful,overallRating,storyRating,animationRating,soundRating,characterRating,enjoymentRating,review,totalRatingCount
8,1020098,1639,300463,Boku_no_Pico,2019-01-31 04:09:00,1,Twiverse,108,1,1,2,4,1,1,1,That was painful to watch. Not gonna lie.Boku ...,132
10,1020097,1639,300759,Boku_no_Pico,2019-02-02 21:12:00,1,bokunopussy,110,1,10,7,10,9,10,10,"This... This anime... Or is it a hentai? No, t...",132
16,1002895,32379,300951,Berserk,2019-02-05 07:26:00,12,Jordanious77,115,2,7,7,7,6,7,8,"WARNING: This is a review of THE ACTUAL ANIME,...",102
19,1001158,35203,301022,Isekai_wa_Smartphone_to_Tomo_ni,2019-02-06 10:29:00,12,Nitoni,118,2,10,10,9,9,10,10,So I know why people think the anime is bad/cl...,202
23,1000622,19429,301057,Akuma_no_Riddle,2019-02-06 22:07:00,12,Lonlily,122,0,3,3,4,4,4,3,Akuma no Riddle has a dark and alluring plot t...,81


In [0]:
rating_popular_anime.shape

(99125, 17)

In [0]:
# changing UserID datatype to integer
rating_popular_anime["UserId"] = rating_popular_anime["UserId"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [0]:
rating_popular_anime.dtypes

id                            int64
workId                        int64
reviewId                      int64
workName                     object
postTime             datetime64[ns]
episodesSeen                 object
author                       object
UserId                        int64
peopleFoundUseful             int64
overallRating                 int64
storyRating                   int64
animationRating               int64
soundRating                   int64
characterRating               int64
enjoymentRating               int64
review                       object
totalRatingCount              int64
dtype: object

In [0]:
# Forming a matrix between UserId and workName
from scipy.sparse import csr_matrix
anime_rating_pivot = rating_popular_anime.pivot_table(values='overallRating', index='workName', columns='UserId').fillna(0)
anime_rating_matrix = csr_matrix(anime_rating_pivot.values)


In [0]:
anime_rating_pivot

UserId,101,102,103,105,107,108,110,111,113,115,116,117,118,119,122,123,124,127,128,130,132,133,135,136,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,...,46831,46832,46833,46834,46835,46836,46837,46838,46839,46840,46841,46842,46843,46844,46845,46846,46847,46848,46849,46850,46851,46852,46853,46854,46855,46856,46857,46858,46859,46860,46861,46862,46863,46864,46865,46866,46867,46868,46869,46870
workName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11eyes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18if,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-gatsu_no_Lion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-gatsu_no_Lion_2nd_Season,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zombieland_Saga,0.0,0.0,0.0,0.0,6.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ef__A_Tale_of_Melodies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ef__A_Tale_of_Memories,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hack__Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Nearest Neighbor using brute algorithm and cosine metric

In [0]:
# The algorithm we use to compute the nearest neighbors is “brute”, 
# and we specify “metric=cosine” so that the algorithm will calculate 
# the cosine similarity between rating vectors. Finally, we fit the model.

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = "brute", radius=0.5, n_neighbors=15, leaf_size=30, p=2.5)
model_knn.fit(anime_rating_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=15, p=2.5,
                 radius=0.5)

In [0]:
query_index = np.random.choice(anime_rating_pivot.shape[0])
distances, indices = model_knn.kneighbors(anime_rating_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(anime_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, anime_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Mahouka_Koukou_no_Rettousei:

1: Trinity_Seven, with distance of 0.9023246790207741:
2: Madan_no_Ou_to_Vanadis, with distance of 0.924606215982846:
3: Nejimaki_Seirei_Senki__Tenkyou_no_Alderamin, with distance of 0.9307694234888306:
4: Jitsu_wa_Watashi_wa, with distance of 0.9429259487845205:
5: Black_Bullet, with distance of 0.9441964040938917:


## Nearest Neighbor using kd_tree algorithm and euclidean metric

In [0]:
# Fitting the model using kd_tree algorithm and euclidean similarity

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'euclidean', algorithm = "kd_tree", radius=0.5, n_neighbors=15, leaf_size=30, p=2.5)
model_knn.fit(anime_rating_matrix)



NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=None, n_neighbors=15, p=2.5,
                 radius=0.5)

In [0]:
query_index = np.random.choice(anime_rating_pivot.shape[0])
distances, indices = model_knn.kneighbors(anime_rating_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(anime_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, anime_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Girls_Bravo__First_Season:

1: Kenja_no_Mago, with distance of 41.211194424739524:
2: Makura_no_Danshi, with distance of 42.09513035969837:
3: Utsu_Musume_Sayuri, with distance of 42.81354925721529:
4: Divine_Gate, with distance of 44.74371464239419:
5: Rewrite, with distance of 46.357307945997036:


# Query based recommendation Module using TFIDF features of the reviews

In [12]:
# Select columns 'workId','workName', 'UserId', 'overallRating', 'genres', 'review', 'postTime' from the original merged dataset
my_anime_df = new_anime_df[['workId','workName', 'UserId', 'overallRating', 'genres', 'review', 'postTime']]

# changing the datatype of review to string
my_anime_df['review'] = my_anime_df['review'].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [15]:
# Function for cleaning the reviews by removing stopwords and punctuations
def clean_review(mess):
  return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", mess).split())

# Applying the clean_review function to column review
my_anime_df['review'] = my_anime_df['review'].apply(clean_review)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
# Applying the clean_review function to column genres
my_anime_df['genres'] = my_anime_df['genres'].apply(clean_review)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [0]:
subset_df = my_anime_df.sample(n=30000, replace=False, random_state=1, axis=0)

In [0]:
# Creating dataframes of reviews with User ID
userid_df = subset_df[['UserId','review']]

# Creating dataframes of reviews with Work ID
work_df = subset_df[['workId', 'review']]

In [0]:
# Combining all the reviews to form a single paragraph for each User ID
u_df = userid_df.groupby('UserId').agg({'review': ' '.join})

# Combining all the reviews to form a single paragraph for each Work ID
w_df = work_df.groupby('workId').agg({'review': ' '.join})

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Applying the TFIDF Vectorizer to extract the features from the text for each user review.
#userid vectorizer
userid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=501)
userid_vectors = userid_vectorizer.fit_transform(u_df['review'])

# Applying the TFIDF Vectorizer to extract the features from the text for each movie review.
#Business id vectorizer
workid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=501)
workid_vectors = workid_vectorizer.fit_transform(w_df['review'])

In [0]:
# Creating a matrix of users and movies with the ratings.
userid_rating_matrix = pd.pivot_table(subset_df, values='overallRating', index=['UserId'], columns=['workId'])

In [0]:
def matrix_factorization(R, P, Q, steps=25, gamma=0.001,lamda=0.02):
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e= e + pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        
    return P,Q


In [0]:
P1 = pd.DataFrame(userid_vectors.toarray(), index=u_df.index, columns=userid_vectorizer.get_feature_names())
Q1 = pd.DataFrame(workid_vectors.toarray(), index=w_df.index, columns=workid_vectorizer.get_feature_names())


In [0]:
P, Q = matrix_factorization(userid_rating_matrix, P, Q, steps=5, gamma=0.001,lamda=0.02)

In [0]:
## reading the dataframes from saved locations --this was to ensure that we dont lose the model outputs

P.to_csv("P.csv")
Q.to_csv("Q.csv")

P = pd.read_csv('P.csv')
Q = pd.read_csv('Q.csv')

In [0]:
Px = P
Qx = Q

In [0]:
Px.index = Px.iloc[:, 1]
Px = Px.drop(['Unnamed: 0', 'UserId'], axis=1)


In [45]:
Qx.head(10)

Unnamed: 0_level_0,Unnamed: 0,workId,1,10,2,3,4,5,6,7,8,9,a,able,about,absolutely,acting,action,actually,after,again,all,almost,along,already,also,although,always,am,amazing,amount,an,and,animation,anime,animes,annoying,another,any,anyone,...,want,wanted,wants,war,was,wasn,watch,watched,watching,way,we,well,went,were,what,when,where,which,while,who,whole,why,will,with,within,without,won,work,works,world,worth,would,writing,wrong,year,years,yes,yet,you,your
workId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0,1,0.028717,0.182317,0.033784,0.023873,0.0222,0.047296,0.030037,0.020204,0.043044,0.067626,0.702545,0.013909,0.15103,0.03199,0.032911,0.068121,0.039348,0.065535,0.040711,0.234002,0.045875,0.033499,0.030517,0.067408,0.024324,0.040949,0.03361,0.099483,0.009985,0.197237,0.886357,0.144614,0.368689,0.044557,0.023958,0.046196,0.078055,0.036986,...,0.024568,0.019133,0.01251,0.008251,0.248494,0.047824,0.110001,0.071347,0.074552,0.102289,0.077338,0.165911,0.011989,0.071796,0.145386,0.101711,0.058654,0.121774,0.064256,0.102857,0.04192,0.071311,0.090403,0.279626,0.013033,0.042074,0.023853,0.066398,0.029301,0.044152,0.012195,0.074105,0.029657,0.014815,0.018093,0.075613,0.015968,0.045375,0.352292,0.046628
5,1,5,0.012414,0.055749,0.012602,0.013261,0.020999,0.024396,0.002536,0.016081,0.031579,0.02691,0.362476,0.005514,0.04177,0.002462,0.033684,0.038576,0.013117,0.038429,0.007374,0.044968,0.023533,0.002669,0.016209,0.028141,0.017677,0.027733,0.00902,0.015076,0.007703,0.048348,0.32831,0.059377,0.061317,0.000274,0.00881,0.013955,0.018264,0.002184,...,0.010181,0.011789,0.000683,0.009525,0.099801,0.006536,0.031314,0.028963,0.045007,0.012131,0.031509,0.048581,0.001226,0.026644,0.067235,0.030143,0.011729,0.034195,0.015544,0.038084,0.01055,0.026427,0.027312,0.099636,0.0014,0.015879,0.012883,0.019388,0.001397,0.017982,0.002669,0.04708,0.008596,0.00619,0.008547,0.007507,0.007963,0.020449,0.09821,0.011552
6,2,6,0.017043,0.045024,0.020113,0.006092,0.001719,0.008028,0.003604,0.01184,0.008167,0.004074,0.473382,0.009963,0.062943,0.004591,0.023997,0.04178,0.031793,0.040447,0.016944,0.063632,0.010849,0.023731,0.007046,0.05426,0.012625,0.037432,0.014327,0.03924,0.020602,0.104168,0.572985,0.048989,0.165951,0.023361,0.015222,0.015272,0.035695,0.015487,...,0.0204,0.007145,0.009064,0.014135,0.172235,0.022284,0.064248,0.022829,0.03624,0.052905,0.04595,0.065276,0.018755,0.042336,0.051438,0.082522,0.038747,0.078581,0.053548,0.074175,0.015541,0.025775,0.0374,0.177207,0.014437,0.022151,0.017636,0.019314,0.012088,0.028246,0.033653,0.044206,0.01523,0.0069,0.006006,0.011468,0.023664,0.054064,0.15034,0.032864
7,3,7,0.016813,0.026989,0.026058,0.000492,0.000365,0.000575,0.000462,0.000473,0.016026,0.000355,0.218396,0.025847,0.041969,0.000213,0.000866,0.035695,0.011799,0.018939,0.011028,0.06637,0.012235,0.026251,0.012418,0.009914,0.000451,0.029715,0.012419,0.016829,0.000151,0.041404,0.304204,0.01434,0.111558,0.018841,0.013606,0.023943,0.011199,0.012588,...,0.020737,0.001084,0.000529,0.004176,0.064528,0.000309,0.045419,0.024725,0.009698,0.018935,0.064342,0.04298,0.003452,0.03948,0.041425,0.011488,0.03235,0.030019,0.0229,0.018621,0.03367,0.031716,0.043471,0.103811,0.000141,0.001063,0.000295,0.030938,0.000482,0.024892,0.015844,0.030795,0.000432,0.013253,0.000432,0.000792,0.001818,0.012228,0.156331,0.013261
15,4,15,3.8e-05,0.016777,0.026853,0.019823,0.01162,0.03183,0.030333,0.009631,0.03796,0.010075,0.236427,0.023236,0.095438,2.8e-05,0.033684,0.024083,0.025227,0.008967,3.8e-05,0.056084,0.009034,3.5e-05,2.7e-05,0.028571,0.001176,0.011173,5.4e-05,4.7e-05,0.011012,0.05191,0.26723,0.021699,0.275279,0.001269,0.01377,0.026482,0.014956,3e-05,...,0.016342,2.5e-05,1.3e-05,1.2e-05,0.092926,0.020077,0.05863,0.013285,0.035365,0.050611,0.017132,0.0705,1.5e-05,0.007245,0.027848,0.035491,0.008348,0.024615,0.01491,0.017939,0.043642,0.02621,0.020742,0.084704,1.7e-05,4.7e-05,0.020285,3.8e-05,1.6e-05,0.000983,0.016621,0.057562,0.011423,2.2e-05,4.5e-05,0.013016,0.035263,0.020329,0.094831,0.009723
16,5,16,0.010642,0.031909,0.017194,0.008032,0.009993,0.015814,0.004088,0.008067,0.016605,0.024869,0.325601,0.014757,0.086108,0.000264,0.004803,0.012563,0.005112,0.038781,0.020769,0.065596,0.032326,0.010642,0.014543,0.064984,0.004365,0.0439,0.019416,0.029759,0.005367,0.047449,0.448558,0.038022,0.151139,0.012629,0.005595,0.029537,0.024124,0.021652,...,0.02786,0.005072,0.007097,0.000729,0.065655,0.00118,0.064658,0.015338,0.01419,0.042626,0.055932,0.054085,0.005593,0.02363,0.062845,0.031265,0.027053,0.034892,0.040729,0.064059,0.015591,0.019576,0.094008,0.130298,6.8e-05,0.011371,0.012221,0.009126,0.011893,0.014454,0.001258,0.010966,0.018093,0.005174,0.00081,0.022117,0.000713,0.018647,0.181337,0.029523
17,6,17,0.009906,0.009464,0.008285,0.000358,0.009783,0.02599,2.9e-05,0.000179,0.000193,0.002471,0.263457,0.001,0.039587,0.023606,0.004247,0.015179,0.030627,0.028144,0.008925,0.061301,0.016662,0.002063,0.009674,0.048883,0.042369,0.016569,0.016171,0.009784,0.029363,0.041149,0.285227,0.057838,0.271972,0.0518,0.019478,0.008707,0.010212,0.000654,...,0.002301,5.1e-05,0.000722,5.5e-05,0.078052,0.008947,0.0653,0.031915,0.037192,0.026224,0.001393,0.041552,5.2e-05,0.014314,0.03227,0.048685,0.008433,0.028307,0.003387,0.009782,0.000698,0.00103,0.035928,0.108921,0.000159,0.002676,0.020629,0.008076,0.000251,0.024528,0.031585,0.014757,0.000358,0.018398,0.012756,0.001734,0.00046,0.001838,0.203546,0.04973
19,7,19,0.011601,0.118479,0.025212,0.016284,0.010999,0.020088,0.010016,0.012886,0.026466,0.047002,0.636384,0.044563,0.082172,0.017261,0.010583,0.023113,0.036212,0.056161,0.023882,0.134302,0.03975,0.023108,0.00827,0.072723,0.032865,0.031634,0.014245,0.057351,0.008788,0.144339,0.664233,0.059034,0.215796,0.032431,0.008456,0.030032,0.06279,0.022386,...,0.032231,0.014525,0.016072,0.005542,0.16888,0.018077,0.07455,0.065518,0.073148,0.053056,0.047955,0.101508,0.017188,0.053832,0.139658,0.060663,0.056174,0.082505,0.10945,0.12282,0.038551,0.026049,0.115343,0.183739,0.020742,0.029177,0.012955,0.042845,0.016591,0.046917,0.02309,0.073863,0.035796,0.020866,0.017411,0.029753,0.016433,0.040405,0.273989,0.055285
20,8,20,0.049683,0.160619,0.02456,0.048137,0.046126,0.024804,0.023994,0.033377,0.051322,0.036696,0.552664,0.014616,0.161549,0.02131,0.01905,0.062384,0.035418,0.054559,0.03263,0.170545,0.030664,0.020007,0.017778,0.113682,0.020462,0.059834,0.023347,0.092755,0.040612,0.101325,0.728442,0.072122,0.3535,0.055527,0.055012,0.016951,0.044951,0.027319,...,0.051795,0.00673,0.021099,0.003004,0.281917,0.023578,0.125985,0.062841,0.092633,0.069551,0.041955,0.118255,0.010338,0.093892,0.106535,0.110378,0.045364,0.077045,0.034615,0.076633,0.057776,0.060019,0.110881,0.195723,0.013668,0.024152,0.024011,0.02367,0.007383,0.068001,0.03822,0.055833,0.012085,0.022707,0.006829,0.031041,0.01739,0.018003,0.296616,0.043622
21,9,21,0.039353,0.149455,0.03905,0.08064,0.022256,0.032963,0.02844,0.025663,0.041231,0.064659,0.647779,0.018327,0.143072,0.035116,0.013085,0.092618,0.05378,0.101268,0.033855,0.179614,0.04242,0.04039,0.032516,0.077412,0.028928,0.061355,0.064801,0.108611,0.018206,0.12451,0.863786,0.081499,0.366342,0.057354,0.016358,0.038589,0.068568,0.034371,...,0.082555,0.018793,0.013142,0.012873,0.221203,0.013831,0.188815,0.058629,0.132184,0.08179,0.045707,0.128102,0.017268,0.052911,0.13729,0.148599,0.054873,0.107281,0.044548,0.125866,0.046778,0.079153,0.111012,0.24362,0.016774,0.035312,0.028708,0.021383,0.016094,0.120512,0.026611,0.101817,0.013905,0.011258,0.023346,0.053913,0.018513,0.024233,0.408107,0.084852


In [0]:
Qx.index = Qx.iloc[:, 1]
Qx = Qx.drop(['Unnamed: 0', 'workId'], axis=1)

In [56]:
Qx.T.shape

(501, 4684)

In [0]:
# give me recommendation for movies for kids!

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

words = "suggest movie for kids"
pred_df= pd.DataFrame([words], columns=['review'])
pred_df['review'] = pred_df['review'].apply(clean_review)
pred_vectors = userid_vectorizer.transform(pred_df['review'])
pred_v_df = pd.DataFrame(pred_vectors.toarray(), index=pred_df.index, columns=userid_vectorizer.get_feature_names())

In [73]:
predItemRating=pd.DataFrame(np.dot(pred_v_df.loc[0],Qx.T),index=Qx.index,columns=['Rating'])
topRecom=pd.DataFrame.sort_values(predItemRating,['Rating'],ascending=[0])[:4]

print("USER QUERY : ",words+"\n")
for i in topRecom.index:
  print(my_anime_df[my_anime_df['workId']==i]['workName'].iloc[0])
  print(my_anime_df[my_anime_df['workId']==i]['genres'].iloc[0])
  print("userRating: "+ str(my_anime_df[my_anime_df['workId']==i]['overallRating'].iloc[0])+"\n")


USER QUERY :  suggest movie for kids

Kimi_no_Na_wa
Romance Supernatural School Drama
userRating: 9

Koe_no_Katachi
Drama School Shounen
userRating: 5

Pokemon_Best_Wishes__Victini_to_Shiroki_Eiyuu_Reshiram
Adventure Comedy Drama Fantasy Kids
userRating: 5

Suzumiya_Haruhi_no_Shoushitsu
Comedy Mystery Romance School Sci Fi Supernatural
userRating: 10



In [71]:
# give me recommendation for movie with cars!

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

words = "suggest movie about cars"
pred_df= pd.DataFrame([words], columns=['review'])
pred_df['review'] = pred_df['review'].apply(clean_review)
pred_vectors = userid_vectorizer.transform(pred_df['review'])
pred_v_df = pd.DataFrame(pred_vectors.toarray(), index=pred_df.index, columns=userid_vectorizer.get_feature_names())

predItemRating=pd.DataFrame(np.dot(pred_v_df.loc[0],Qx.T),index=Qx.index,columns=['Rating'])
topRecom=pd.DataFrame.sort_values(predItemRating,['Rating'],ascending=[0])[2:5]

print("USER QUERY : ",words+"\n")
for i in topRecom.index:
  print(my_anime_df[my_anime_df['workId']==i]['workName'].iloc[0])
  print(my_anime_df[my_anime_df['workId']==i]['genres'].iloc[0])
  print("userRating: "+ str(my_anime_df[my_anime_df['workId']==i]['overallRating'].iloc[0])+"\n")

USER QUERY :  suggest movie about cars

Biohazard_4D-Executer
Action Adventure Military Horror
userRating: 4

Suzumiya_Haruhi_no_Shoushitsu
Comedy Mystery Romance School Sci Fi Supernatural
userRating: 10

Detective_Conan_Movie_07__Crossroad_in_the_Ancient_Capital
Adventure Mystery Comedy Police Shounen
userRating: 7



In [74]:
# give me recommendation for horror movie!

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

words = "suggest a horror movie"
pred_df= pd.DataFrame([words], columns=['review'])
pred_df['review'] = pred_df['review'].apply(clean_review)
pred_vectors = userid_vectorizer.transform(pred_df['review'])
pred_v_df = pd.DataFrame(pred_vectors.toarray(), index=pred_df.index, columns=userid_vectorizer.get_feature_names())

predItemRating=pd.DataFrame(np.dot(pred_v_df.loc[0],Qx.T),index=Qx.index,columns=['Rating'])
topRecom=pd.DataFrame.sort_values(predItemRating,['Rating'],ascending=[0])[2:6]

print("USER QUERY : ",words+"\n")
for i in topRecom.index:
  print(subset_df[subset_df['workId']==i]['workName'].iloc[0])
  print(subset_df[subset_df['workId']==i]['genres'].iloc[0])
  print("userRating: "+ str(subset_df[subset_df['workId']==i]['overallRating'].iloc[0])+"\n")

USER QUERY :  suggest a horror movie

Byousoku_5_Centimeter
Drama Romance Slice of Life
userRating: 10

Suzumiya_Haruhi_no_Shoushitsu
Comedy Mystery Romance School Sci Fi Supernatural
userRating: 9

Ookami_Kodomo_no_Ame_to_Yuki
Fantasy Slice of Life
userRating: 10

Detective_Conan_Movie_07__Crossroad_in_the_Ancient_Capital
Adventure Mystery Comedy Police Shounen
userRating: 7

