In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [0]:
!wget -q www-us.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz

In [0]:
!tar -xvf spark-2.4.3-bin-hadoop2.7.tgz

In [0]:
!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.3-bin-hadoop2.7"

In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pandas.plotting import scatter_matrix
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression

import warnings
warnings.simplefilter('ignore')

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

spark = SparkSession.builder.master("local[*]").getOrCreate()

## 7. Загрузить данные в Spark

In [0]:
df_data = spark.read.csv("u.data", sep='\t', header=None, inferSchema=True)
df_genre = spark.read.csv("u.genre", sep='|', header=None, inferSchema=True)
df_info = spark.read.csv("u.info", sep=' ', header=None, inferSchema=True)
df_occupation = spark.read.csv("u.occupation", sep=' ', header=None, inferSchema=True)
df_user = spark.read.csv("u.user", sep='|', header=None, inferSchema=True)
df_item = spark.read.csv("u.item", sep='|', header=None, inferSchema=True)
# encoding='latin_1'

In [0]:
new_names_df_data = ['user_id', 'movie_id', 'rating', 'timestamp']
df_data = df_data.toDF(*new_names_df_data)

new_names_df_genre = ['genres', 'genres_id']
df_genre = df_genre.toDF(*new_names_df_genre)

new_names_df_user = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
df_user = df_user.toDF(*new_names_df_user)

new_names_df_item = ['movie_id', 'movie_title', 'release_date', 
                     'video_release_date', 'IMDb_URL', 'unknown', 
                     'Action', 'Adventure', 'Animation', "Children's", 
                     'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
                     'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 
                     'Sci-Fi', 'Thriller', 'War', 'Western']
df_item = df_item.toDF(*new_names_df_item)

## 8. Средствами спарка вывести среднюю оценку для каждого фильма

In [0]:
df_data.show()

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    196|     242|     3|881250949|
|    186|     302|     3|891717742|
|     22|     377|     1|878887116|
|    244|      51|     2|880606923|
|    166|     346|     1|886397596|
|    298|     474|     4|884182806|
|    115|     265|     2|881171488|
|    253|     465|     5|891628467|
|    305|     451|     3|886324817|
|      6|      86|     3|883603013|
|     62|     257|     2|879372434|
|    286|    1014|     5|879781125|
|    200|     222|     5|876042340|
|    210|      40|     3|891035994|
|    224|      29|     3|888104457|
|    303|     785|     3|879485318|
|    122|     387|     5|879270459|
|    194|     274|     2|879539794|
|    291|    1042|     4|874834944|
|    234|    1184|     2|892079237|
+-------+--------+------+---------+
only showing top 20 rows



In [0]:
df_data.dtypes

[('user_id', 'int'),
 ('movie_id', 'int'),
 ('rating', 'int'),
 ('timestamp', 'int')]

In [0]:
df_data.describe().show()

+-------+------------------+------------------+------------------+-----------------+
|summary|           user_id|          movie_id|            rating|        timestamp|
+-------+------------------+------------------+------------------+-----------------+
|  count|            100000|            100000|            100000|           100000|
|   mean|         462.48475|         425.53013|           3.52986|8.8352885148862E8|
| stddev|266.61442012750905|330.79835632558473|1.1256735991443214|5343856.189502848|
|    min|                 1|                 1|                 1|        874724710|
|    max|               943|              1682|                 5|        893286638|
+-------+------------------+------------------+------------------+-----------------+



In [0]:
umr_mean = df_data.groupby('movie_id').avg('rating')
umr_mean.show(20)

+--------+------------------+
|movie_id|       avg(rating)|
+--------+------------------+
|     496| 4.121212121212121|
|     471|3.6108597285067874|
|     463| 3.859154929577465|
|     148|          3.203125|
|    1342|               2.5|
|     833| 3.204081632653061|
|    1088| 2.230769230769231|
|    1591|3.1666666666666665|
|    1238|             3.125|
|    1580|               1.0|
|    1645|               4.0|
|     392|3.5441176470588234|
|     623| 2.923076923076923|
|     540| 2.511627906976744|
|     858|               1.0|
|     737| 2.983050847457627|
|     243|2.4393939393939394|
|    1025|2.9318181818181817|
|    1084| 3.857142857142857|
|    1127| 2.909090909090909|
+--------+------------------+
only showing top 20 rows



## 9. В Spark получить 2 df с 5-ю самыми популярными и самыми непопулярными фильмами (по количеству оценок, либо по самой оценке)

In [0]:
rating_max = umr_mean.orderBy("avg(rating)", ascending=False)
rating_min = umr_mean.orderBy("avg(rating)", ascending=True)

rating_max.show(5)
rating_min.show(5)

+--------+-----------+
|movie_id|avg(rating)|
+--------+-----------+
|    1122|        5.0|
|    1653|        5.0|
|    1599|        5.0|
|    1201|        5.0|
|    1500|        5.0|
+--------+-----------+
only showing top 5 rows

+--------+-----------+
|movie_id|avg(rating)|
+--------+-----------+
|    1352|        1.0|
|    1618|        1.0|
|    1339|        1.0|
|    1580|        1.0|
|     858|        1.0|
+--------+-----------+
only showing top 5 rows



## 10. Средствами спарка соедините информацию по фильмам и жанрам (u.genre)

In [0]:
df_item.show()
df_genre.show()

+--------+--------------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|movie_id|         movie_title|release_date|video_release_date|            IMDb_URL|unknown|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|
+--------+--------------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|       1|    Toy Story (1995)| 01-Jan-1995|              null|http://us.imdb.co...|      0|     0|        0|        1|         1|     1|    0|          0|    0|      0|        0|     0|      0|      0|      0|     0|       0|  0|      0|
|       2|    GoldenEye (1995)| 01-Jan-1995|

Файл df_item уже содержит жанры, не понял данного задания + как соединять ее с df_genre, также не понятно, ведь в фильме может быть несколько жанров + не вижу ключей для соединения.

In [0]:
pass