In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Motivation: data is large

Our data is around 2 GB. It would be cumbersome to analyse it with traditional tools for flat tabular data such as pandas. Therefore, let's try using Apache Spark (or rather its Python wrapper, PySpark) to get around this problem.

### First, let's install PySpark with pip and import all the necessary functions:

In [None]:
!pip install pyspark

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

from pyspark.sql.types import *
from pyspark.sql import Window
from pyspark.sql import functions as f


from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline


from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one
# registered under the application name 'anime-data'.
spark = (
    SparkSession.builder
    .appName('anime-data')
    .getOrCreate()
)

### It is preferable to explicitly set the structure of the data we'll be reading with PySpark. This is done by defining schemas. Basically, a schema is a list of the columns found in our table, together with the type of each column.

In [None]:
# Schema applied when reading anime_with_synopsis.csv: one nullable field per column.
anime_schema = (
    StructType()
    .add('MAL_ID', IntegerType())
    .add('name', StringType())
    .add('score', DoubleType())
    .add('genres', StringType())
    .add('synopsis', StringType())
)

# Schema applied when reading animelist.csv; every column is an integer.
animelist_schema = (
    StructType()
    .add('user_id', IntegerType())
    .add('anime_id', IntegerType())
    .add('rating', IntegerType())
    .add('watching_status', IntegerType())
    .add('watched_episodes', IntegerType())
)

### Next, let's read the CSV files, providing our pre-defined schemas.

In [None]:
# Both files are comma-separated and start with a header row.
csv_reader_options = dict(sep=',', header=True)

# Anime catalogue (id, name, score, genres, synopsis), read with the explicit schema.
anime = spark.read.csv(
    '/kaggle/input/anime-recommendation-database-2020/anime_with_synopsis.csv',
    schema=anime_schema,
    **csv_reader_options,
)

# Per-user anime list entries, read with the explicit schema.
animelist = spark.read.csv(
    '/kaggle/input/anime-recommendation-database-2020/animelist.csv',
    schema=animelist_schema,
    **csv_reader_options,
)

Hmmm... it might seem odd that 2 GB of data were read so quickly. Actually, so far Spark hasn't read anything. Spark implements so-called 'lazy evaluation', which basically means that it won't touch any actual data until we ask it to. In other words, we can define multiple 'transformations' that we want to apply to our data, but they will only be executed when we call an 'action' function.


For example, showing N rows of the resulting DataFrame is an 'action'. Calling .show() method will trigger execution of the chain of transformations which we defined previously, so that actual 'computation' takes place.

In [None]:
# Action: triggers the actual read and prints the first five rows.
anime.show(n=5)

### Subsetting data, filtering, grouping, aggregating the results of grouping, ordering, renaming columns etc. - all these typical operations are done by Spark in a 'lazy' manner. The actual computation of transformation will occur only when calling an action.

### For example, this way we can get the top-10 scored anime in our data:

In [None]:
# Scored anime only, keyed by `anime_id` (renamed from MAL_ID), sorted from the
# highest score down and cached for reuse in later cells.
best_anime = (
    anime
    .filter(f.col('score').isNotNull())
    .withColumnRenamed('MAL_ID', 'anime_id')
    .orderBy(f.col('score').desc())
    .cache()
)

best_anime.show(n=10)

### Conveniently, a Spark DataFrame can be exported to our favorite pandas DataFrame at any time:

In [None]:
# Collect the Spark DataFrame to the driver as a pandas DataFrame.
best_anime_pandas = best_anime.toPandas()
# Display the first ten rows.
best_anime_pandas.iloc[:10]

### Now, let's find out which anime genres are the most popular in our data. Let's also calculate the average score of each genre and its share of all the data.

In [None]:
# Per-genre statistics over all anime: number of titles, average score and the
# genre's share (in percent) of all (anime, genre) pairs.
#
# The sort is applied AFTER the window column is computed: the un-partitioned
# window moves every row into a single partition, and a sort performed before
# that shuffle is not guaranteed to survive it.
#
# NOTE(review): splitting on ',' keeps any whitespace that follows a comma, so the
# same genre may show up both with and without a leading space. The values are left
# as they are because later cells join on / strip this column — confirm against the data.
popular_genres = (
    anime
    # one row per (anime, genre)
    .withColumn('genres_list', f.split('genres', ','))
    .withColumn('genre', f.explode('genres_list'))
    .groupBy('genre')
    .agg(f.count(f.col('name')).alias('genre_count'),
         f.avg(f.col('score')).alias('avg_score'))
    # empty partitionBy() -> a single window spanning every genre row
    .withColumn('pct_of_total',
                f.col('genre_count') / f.sum('genre_count').over(Window.partitionBy()))
    .withColumn('pct_of_total', f.col('pct_of_total') * 100)
    .orderBy('genre_count', ascending=False)
    .cache()
)

popular_genres.show(10)

### Pct_of_total adds up to 100% which is a sign that we've calculated this column correctly:

In [None]:
# Sanity check: aggregate the whole frame into a single row holding the sum of all shares.
popular_genres.agg(f.sum(f.col('pct_of_total')).alias('pct_sum')).show()

### Let's do something more complicated: 

**1) Find the top 10% of anime by score;**

**2) Get the distribution of genres among top 10%;**

**3) Compare this distribution with the one of all anime. List the genres with the greatest absolute difference between the two distributions.**

In [None]:
# Top 10% of scored anime: despite the variable name, these are the rows whose
# percent rank by ascending score is >= 0.9.
anime_10th_ptile = (
    anime
    .filter('score is not null')
    .select('name', 'genres', 'score',
            f.percent_rank().over(Window.partitionBy().orderBy(anime['score'])).alias('pct_rank'))
    .filter('pct_rank >= 0.9')
)

# Same per-genre statistics as `popular_genres`, restricted to the top 10%.
# The sort comes last because the un-partitioned window shuffles all rows into one
# partition, so an earlier sort is not guaranteed to be preserved.
popular_genres_10th_ptile = (
    anime_10th_ptile
    .withColumn('genres_list', f.split('genres', ','))
    .withColumn('genre', f.explode('genres_list'))
    .groupBy('genre')
    .agg(f.count(f.col('name')).alias('genre_count'),
         f.avg(f.col('score')).alias('avg_score'))
    .withColumn('pct_of_total',
                f.col('genre_count') / f.sum('genre_count').over(Window.partitionBy()))
    .withColumn('pct_of_total', f.col('pct_of_total') * 100)
    .orderBy('genre_count', ascending=False)
    .cache()
)

# Left side: share among all anime; right side: share among the top 10%.
# The left join keeps genres absent from the top 10%; their right-hand share and
# abs_difference are null.
all_genre_share = popular_genres.select('genre', 'pct_of_total')\
    .withColumnRenamed('pct_of_total', 'pct_of_total_left')
top_genre_share = popular_genres_10th_ptile.select('genre', 'pct_of_total')\
    .withColumnRenamed('pct_of_total', 'pct_of_total_right')

popular_compare = (
    all_genre_share
    .join(top_genre_share, on='genre', how='left')
    .withColumn('abs_difference', f.abs(f.col('pct_of_total_right') - f.col('pct_of_total_left')))
    .orderBy('abs_difference', ascending=False)
)

popular_compare.show(10)

### Genres, which are not present among the top 10% anime (by score):

In [None]:
# Genres with no match on the top-10% side of the left join (null right-hand share).
popular_compare\
    .filter(f.col('pct_of_total_right').isNull())\
    .select('genre')\
    .show(n=10)

### Let's analyse really big table data, which was in *anime-recommendation-database-2020/animelist.csv* . This dataframe contains evaluations of anime titles by each user.

In [None]:
# Print the first five rows of the per-user anime list.
animelist.show(n=5)

### Let's group by anime_id and calculate some useful stats:

In [None]:
# Aggregates computed for every anime_id.
rating_aggregates = [
    f.count(f.col('user_id')).alias('user_cnt'),                          # non-null user_id entries
    f.mean(f.col('rating')).alias('mean_rating'),
    f.stddev(f.col('rating')).alias('std_rating'),
    f.percentile_approx(f.col('rating'), 0.5).alias('median_rating'),    # approximate median
    f.mean(f.col('watched_episodes')).alias('mean_num_episodes'),
]

# One row per anime, sorted by the number of list entries (descending) and cached.
most_watched_anime = (
    animelist
    .groupBy('anime_id')
    .agg(*rating_aggregates)
    .orderBy(f.col('user_cnt').desc())
    .cache()
)
    
    

In [None]:
%%time
most_watched_anime.show(10)

It really takes a long time to process...

### Let's join the resulting DataFrame with another one and sort the result by user count

In [None]:
# Attach the per-anime viewing statistics to the scored anime (inner join on
# anime_id), sort by the number of list entries and cache the result.
anime_joined = (
    best_anime
    .join(most_watched_anime, on='anime_id', how='inner')
    .orderBy(f.desc('user_cnt'))
    .cache()
)

# Vertical layout without truncation so the long text columns stay readable.
anime_joined.show(n=3, truncate=False, vertical=True)

Spark allows not only for pandas-like data wrangling, but also inference with standard ML-models.

For example, let's try to fit a simple linear regression model.

First, we encode the genres of each anime as 1/0 indicator values:

In [None]:
# Collect the genre names on the driver. Names are stripped of surrounding
# whitespace and then de-duplicated, so a genre that occurs both with and without
# padding is listed once; sorting gives the list a stable order.
genres_list = sorted({row['genre'].strip() for row in popular_genres.select('genre').collect()})

In [None]:
# Report how many entries the collected genre list holds.
print(f'Unique genres: {len(genres_list)}')

We'll use sklearn's `MultiLabelBinarizer`, fitting it on the genre names collected above (83 entries in `popular_genres`; the binarizer keeps only the distinct names).

In [None]:
# Fit on a single "sample" containing every genre so that each one becomes a class
# (fit returns the binarizer itself).
ml_bin = MultiLabelBinarizer().fit([genres_list])

# Peek at the first ten learned classes and the total number of classes.
print(ml_bin.classes_[:10])
print(len(ml_bin.classes_))

Next, we define a custom function that we'll wrap as a Spark UDF (User-Defined Function) and apply to the `genres` column with `withColumn`.

In [None]:
def binarize_genres(entry, binarizer):
    """Encode a comma-separated genre string as a list of integer flags.

    Args:
        entry: Genre string such as ``'Action, Comedy'``. ``None`` and empty
            strings are treated as "no genres" instead of raising.
        binarizer: Fitted object exposing ``transform``; it is called with a
            one-element list holding the tuple of genre names.

    Returns:
        list[int]: The first (only) row returned by ``binarizer.transform``,
        converted to plain Python ints.
    """
    if entry is None:
        # Null genre values (e.g. from a Spark column) carry no genres.
        genre_names = ()
    else:
        # Split on ',' alone and strip each token so that both 'A, B' and 'A,B'
        # are handled; blank tokens are dropped.
        stripped_tokens = (token.strip() for token in entry.split(','))
        genre_names = tuple(token for token in stripped_tokens if token)

    encoded_row = binarizer.transform([genre_names])[0]
    return [int(flag) for flag in encoded_row]

In [None]:
# Spark UDF wrapping binarize_genres with the fitted `ml_bin` bound in;
# it returns an array of integers per input value.
binarize_genres_udf = f.udf(
    lambda genres_str: binarize_genres(genres_str, ml_bin),
    ArrayType(IntegerType()),
)

In [None]:
# Base frame: identifier, binarized genre array, numeric predictors and the label.
data = (
    anime_joined
    .withColumn('genres_binarized', binarize_genres_udf(f.col('genres')))
    .select(['anime_id', 'genres_binarized', 'user_cnt', 'mean_num_episodes', 'score'])
    .cache()
)

# One integer column per binarizer class, named genre_<index of the class>.
# All columns are added in a single select: calling withColumn once per genre in a
# loop adds a projection per call and can generate a very large query plan.
genre_flag_columns = [
    f.col('genres_binarized').getItem(idx).alias(f'genre_{idx}')
    for idx in range(len(ml_bin.classes_))
]
data = data.select('*', *genre_flag_columns)

In [None]:
# List every column currently present in the feature frame.
print(list(data.columns))

In [None]:
# Keep every column except the raw array column, now that it has been expanded
# into separate genre_<idx> columns.
data = data.select([col_name for col_name in data.columns if col_name != 'genres_binarized'])

When using SparkML (Spark's built-in machine-learning classes), you have to supply your input features in the form of one big vector column.

In [None]:
# Columns that must not be used as model inputs: the row identifier, the label,
# and the raw genre array (in case it has not been dropped yet).
non_feature_columns = {'anime_id', 'score', 'genres_binarized'}
feature_columns = [col_name for col_name in data.columns if col_name not in non_feature_columns]

# Pack the remaining columns into the single vector column SparkML expects.
assembler = VectorAssembler(inputCols=feature_columns,
                            outputCol='features')

# Ordinary linear regression predicting `score` from the assembled features.
linreg = LinearRegression(featuresCol='features', labelCol='score')

# Assemble first, then fit the regression.
pipeline = Pipeline(stages=[assembler,
                           linreg])

In [None]:
# Fit both pipeline stages (feature assembly, then the regression) on the data.
pipeline_model = pipeline.fit(dataset=data)

In [None]:
# Apply the fitted pipeline to the same data it was trained on.
result = pipeline_model.transform(dataset=data)

The results seem to be mediocre, but the purpose of this exercise was to familiarize ourselves with PySpark functionality, not to build a flawless ML model.

In [None]:
# Show roughly 10% of the rows; the fixed seed keeps the displayed sample
# reproducible across re-runs instead of changing on every execution.
result.select('prediction', 'score').sample(fraction=0.1, seed=42).show()

### Don't forget to stop the SparkSession at the end!

In [None]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()