<a href="https://colab.research.google.com/github/saisumanthkorada/PysprkTransformations/blob/main/Transforming_the_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark
!pip install openpyxl




In [2]:
from google.colab import drive

# Every input/output path used in this notebook lives under this mount point.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [24]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import split, col,length,right,expr,regexp_extract,regexp_replace,trim,explode,first,lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [4]:
# Reuse the active Spark session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Explicit schema for movies.csv: (column name, Spark type); every column is nullable.
# The third column keeps the name "generes" because later cells refer to it by that name.
movie_fields = [
    ("movieId", IntegerType()),
    ("title", StringType()),
    ("generes", StringType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in movie_fields])

# Read the raw movies file, treating its first line as a header row.
df1 = spark.read.csv(
    '/content/drive/MyDrive/ml-latest/movies.csv',
    header=True,
    schema=schema,
)
df1.show()

+-------+--------------------+--------------------+
|movieId|               title|             generes|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [5]:
# Split the trailing "(YYYY)" release year out of the title.
#
# Both patterns allow optional whitespace between the closing parenthesis and
# the end of the string, so a title such as "Some Movie (1999) " is handled the
# same way as "Some Movie (1999)". Without that tolerance such rows would get an
# empty year and keep the "(1999)" suffix in the title.
#
# The year must be extracted BEFORE the title is rewritten, because the second
# step removes the text the first step reads.
# regexp_extract yields an empty string (not null) when the pattern does not match.
year_suffix = r"\((\d{4})\)\s*$"
df1 = df1.withColumn("year", regexp_extract(col("title"), year_suffix, 1))

# Strip the year suffix (and any whitespace around it) from the title itself.
df1 = df1.withColumn(
    "title",
    trim(regexp_replace(col("title"), r"\s*\(\d{4}\)\s*$", "")),
)

In [6]:
# Turn the pipe-delimited genre string into an array column.
# The pattern also swallows any whitespace surrounding each "|" separator.
genre_tokens = F.split(F.col("generes"), r"\s*\|\s*")
df1 = df1.withColumn("genersarray", genre_tokens)

In [7]:
# Preview the first 20 rows (long cell values truncated) to check the new columns.
df1.show(n=20, truncate=True)

+-------+--------------------+--------------------+----+--------------------+
|movieId|               title|             generes|year|         genersarray|
+-------+--------------------+--------------------+----+--------------------+
|      1|           Toy Story|Adventure|Animati...|1995|[Adventure, Anima...|
|      2|             Jumanji|Adventure|Childre...|1995|[Adventure, Child...|
|      3|    Grumpier Old Men|      Comedy|Romance|1995|   [Comedy, Romance]|
|      4|   Waiting to Exhale|Comedy|Drama|Romance|1995|[Comedy, Drama, R...|
|      5|Father of the Bri...|              Comedy|1995|            [Comedy]|
|      6|                Heat|Action|Crime|Thri...|1995|[Action, Crime, T...|
|      7|             Sabrina|      Comedy|Romance|1995|   [Comedy, Romance]|
|      8|        Tom and Huck|  Adventure|Children|1995|[Adventure, Child...|
|      9|        Sudden Death|              Action|1995|            [Action]|
|     10|           GoldenEye|Action|Adventure|...|1995|[Action,

In [8]:
# Produce one output row per (movie, genre) pair: "generes" is overwritten with
# a single element of "genersarray" on each row.
single_genre = F.explode(F.col("genersarray"))
df_exploded = df1.withColumn("generes", single_genre)

In [9]:
# Preview the first 20 exploded rows (long cell values truncated).
df_exploded.show(n=20, truncate=True)

+-------+--------------------+---------+----+--------------------+
|movieId|               title|  generes|year|         genersarray|
+-------+--------------------+---------+----+--------------------+
|      1|           Toy Story|Adventure|1995|[Adventure, Anima...|
|      1|           Toy Story|Animation|1995|[Adventure, Anima...|
|      1|           Toy Story| Children|1995|[Adventure, Anima...|
|      1|           Toy Story|   Comedy|1995|[Adventure, Anima...|
|      1|           Toy Story|  Fantasy|1995|[Adventure, Anima...|
|      2|             Jumanji|Adventure|1995|[Adventure, Child...|
|      2|             Jumanji| Children|1995|[Adventure, Child...|
|      2|             Jumanji|  Fantasy|1995|[Adventure, Child...|
|      3|    Grumpier Old Men|   Comedy|1995|   [Comedy, Romance]|
|      3|    Grumpier Old Men|  Romance|1995|   [Comedy, Romance]|
|      4|   Waiting to Exhale|   Comedy|1995|[Comedy, Drama, R...|
|      4|   Waiting to Exhale|    Drama|1995|[Comedy, Drama, R

In [10]:
# Pivot the exploded genres into one column per distinct genre value.
# A (movie, genre) combination that exists is marked "yes"; combinations that do
# not exist come out as null.
genre_flag = F.first(F.lit("yes"))
df_final = (
    df_exploded
    .groupBy("movieId", "title", "year")
    .pivot("generes")
    .agg(genre_flag)
)

In [11]:
# Snapshot of the pivoted frame's column names; the name `l` is read by the
# column-dropping cell that follows.
l = list(df_final.columns)

In [12]:
# Remove every column whose name contains "(". Among the columns produced by
# the genre pivot this presumably targets a "(no genres listed)"-style
# placeholder — verify against the data.
parenthesised = [name for name in l if "(" in name]
if parenthesised:
    df_final = df_final.drop(*parenthesised)

In [13]:
# Replace the nulls left by the pivot with an explicit "no" flag.
#
# The fill is restricted to the genre flag columns. A bare fillna("no") applies
# to every string column, so a null title or year would also be rewritten to
# the literal "no", which is not a meaningful value for those columns.
key_columns = {"movieId", "title", "year"}
genre_flag_columns = [name for name in df_final.columns if name not in key_columns]
df_final = df_final.fillna("no", subset=genre_flag_columns)
df_final.show()

+-------+--------------------+----+------+---------+---------+--------+------+-----+-----------+-----+-------+---------+------+----+-------+-------+-------+------+--------+---+-------+
|movieId|               title|year|Action|Adventure|Animation|Children|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|IMAX|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|
+-------+--------------------+----+------+---------+---------+--------+------+-----+-----------+-----+-------+---------+------+----+-------+-------+-------+------+--------+---+-------+
|      4|   Waiting to Exhale|1995|    no|       no|       no|      no|   yes|   no|         no|  yes|     no|       no|    no|  no|     no|     no|    yes|    no|      no| no|     no|
|      5|Father of the Bri...|1995|    no|       no|       no|      no|   yes|   no|         no|   no|     no|       no|    no|  no|     no|     no|     no|    no|      no| no|     no|
|      6|                Heat|1995|   yes|       no|       no|      no|    

In [14]:
# Keep only fully distinct rows (all columns are compared).
df_final = df_final.distinct()

In [17]:
# Sort by movie id in ascending order so the output is deterministic and readable.
df_final = df_final.sort(F.col("movieId").asc())

In [18]:
# Preview the first 20 rows of the final movies table (long values truncated).
df_final.show(n=20, truncate=True)

+-------+--------------------+----+------+---------+---------+--------+------+-----+-----------+-----+-------+---------+------+----+-------+-------+-------+------+--------+---+-------+
|movieId|               title|year|Action|Adventure|Animation|Children|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|IMAX|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|
+-------+--------------------+----+------+---------+---------+--------+------+-----+-----------+-----+-------+---------+------+----+-------+-------+-------+------+--------+---+-------+
|      1|           Toy Story|1995|    no|      yes|      yes|     yes|   yes|   no|         no|   no|    yes|       no|    no|  no|     no|     no|     no|    no|      no| no|     no|
|      2|             Jumanji|1995|    no|      yes|       no|     yes|    no|   no|         no|   no|    yes|       no|    no|  no|     no|     no|     no|    no|      no| no|     no|
|      3|    Grumpier Old Men|1995|    no|       no|       no|      no|   y

In [19]:
# Persist the transformed movies table as CSV with a header row, replacing any
# previous output at the same location.
# Note: Spark's CSV writer creates a directory of part files at this path,
# not a single .csv file.
movies_out = '/content/drive/MyDrive/Silver_Layer/movies_transformed.csv'
(
    df_final.write
    .mode('overwrite')
    .option('header', 'True')
    .csv(movies_out)
)

In [20]:
# Read the raw ratings file; the first line is a header and column types are
# inferred from the data.
ratings_path = '/content/drive/MyDrive/ml-latest/ratings.csv'
df2 = spark.read.csv(ratings_path, header=True, inferSchema=True)

In [21]:
# Preview the first 20 raw rating rows (long values truncated).
df2.show(n=20, truncate=True)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      1|   4.0|1225734739|
|     1|    110|   4.0|1225865086|
|     1|    158|   4.0|1225733503|
|     1|    260|   4.5|1225735204|
|     1|    356|   5.0|1225735119|
|     1|    381|   3.5|1225734105|
|     1|    596|   4.0|1225733524|
|     1|   1036|   5.0|1225735626|
|     1|   1049|   3.0|1225734079|
|     1|   1066|   4.0|1225736961|
|     1|   1196|   3.5|1225735441|
|     1|   1200|   3.5|1225735861|
|     1|   1210|   4.5|1225735210|
|     1|   1214|   4.0|1225736426|
|     1|   1291|   5.0|1225734809|
|     1|   1293|   2.0|1225733842|
|     1|   1376|   3.0|1225733539|
|     1|   1396|   3.0|1225733534|
|     1|   1537|   4.0|1225736687|
|     1|   1909|   3.0|1225733717|
+------+-------+------+----------+
only showing top 20 rows



In [None]:
# Convert the epoch-seconds "timestamp" column into a real timestamp type:
# from_unixtime renders the epoch value as a datetime string, which is then
# cast to timestamp. The column is replaced in place, so this cell should be
# run exactly once per load of df2.
rated_at = F.from_unixtime(F.col("timestamp")).cast("timestamp")
df2 = df2.withColumn("timestamp", rated_at)

In [35]:
# Derive a calendar-date string (year-month-day) from the rating timestamp.
# In Spark datetime patterns "MM" is month-of-year while "mm" is minute-of-hour;
# the upper-case form is required here, otherwise the middle component of the
# date would contain the minute of the rating instead of the month.
df2 = df2.withColumn("date", F.date_format("timestamp", "yyyy-MM-dd"))

In [36]:
# Drop the "year" column from the ratings frame if it is present.
# NOTE(review): no visible cell adds a "year" column to df2, so this looks like
# a leftover from an earlier session; dropping an absent column is a no-op.
stale_columns = ["year"]
df2 = df2.drop(*stale_columns)

In [38]:
# Alias the transformed ratings frame under a descriptive name; this is the
# frame written out by the final cell. No copy is made — both names refer to
# the same DataFrame.
df_ratings = df2

In [None]:
# Persist the transformed ratings table as CSV with a header row, replacing any
# previous output at the same location.
# Note: Spark's CSV writer creates a directory of part files at this path,
# not a single .csv file.
ratings_out = '/content/drive/MyDrive/Silver_Layer/ratings_transformed.csv'
(
    df_ratings.write
    .mode('overwrite')
    .option('header', 'True')
    .csv(ratings_out)
)