## SparkSession Object 

In [28]:
from pyspark.sql import SparkSession
# Start a Spark session for this notebook, or reuse one that already exists.
# NOTE(review): the app name 'lin_reg' looks carried over from another
# notebook (the data here is movie ratings) — confirm it is intended.
spark = (
    SparkSession.builder
    .appName('lin_reg')
    .getOrCreate()
)

## Read the dataset 

In [29]:
# Load the ratings CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(
    './Data/movie_ratings_df.csv',
    header=True,
    inferSchema=True,
)

In [30]:
# Report the dataset shape as (row count, column count).
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))


(100000, 3)


In [31]:
# Show the column names and the types Spark inferred while reading the CSV.
df.printSchema()


root
 |-- userId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: integer (nullable = true)



In [35]:
# Ten most active users: number of rating rows per userId, largest first.
(
    df.groupBy('userId')
    .count()
    .orderBy('count', ascending=False)
    .show(n=10, truncate=False)
)


+------+-----+
|userId|count|
+------+-----+
|405   |737  |
|655   |685  |
|13    |636  |
|450   |540  |
|276   |518  |
|416   |493  |
|537   |490  |
|303   |484  |
|234   |480  |
|393   |448  |
+------+-----+
only showing top 10 rows



In [36]:
# Ten least active users: number of rating rows per userId, smallest first.
(
    df.groupBy('userId')
    .count()
    .orderBy('count', ascending=True)
    .show(n=10, truncate=False)
)

+------+-----+
|userId|count|
+------+-----+
|732   |20   |
|631   |20   |
|572   |20   |
|685   |20   |
|93    |20   |
|926   |20   |
|636   |20   |
|596   |20   |
|300   |20   |
|34    |20   |
+------+-----+
only showing top 10 rows



In [37]:
# Ten most-rated movies: number of rating rows per title, largest first.
# truncate=False keeps long titles fully visible in the printed table.
(
    df.groupBy('title')
    .count()
    .orderBy('count', ascending=False)
    .show(n=10, truncate=False)
)

+-----------------------------+-----+
|title                        |count|
+-----------------------------+-----+
|Star Wars (1977)             |583  |
|Contact (1997)               |509  |
|Fargo (1996)                 |508  |
|Return of the Jedi (1983)    |507  |
|Liar Liar (1997)             |485  |
|English Patient, The (1996)  |481  |
|Scream (1996)                |478  |
|Toy Story (1995)             |452  |
|Air Force One (1997)         |431  |
|Independence Day (ID4) (1996)|429  |
+-----------------------------+-----+
only showing top 10 rows



## Feature Engineering 

In [39]:
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer,IndexToString

# Encode each movie title as a numeric index stored in a new `title_new`
# column, leaving the original `title` column in place.
stringIndexer = StringIndexer(
    inputCol="title",
    outputCol="title_new",
)

# Learn the title -> index mapping from the full dataset, then apply it to
# that same dataset so every row carries its encoded title.
model = stringIndexer.fit(df)
indexed = model.transform(df)


In [41]:
# Preview the first 25 rows to check the new title_new column next to title.
indexed.show(n=25)

+------+------------+------+---------+
|userId|       title|rating|title_new|
+------+------------+------+---------+
|   196|Kolya (1996)|     3|    287.0|
|    63|Kolya (1996)|     3|    287.0|
|   226|Kolya (1996)|     5|    287.0|
|   154|Kolya (1996)|     3|    287.0|
|   306|Kolya (1996)|     5|    287.0|
|   296|Kolya (1996)|     4|    287.0|
|    34|Kolya (1996)|     5|    287.0|
|   271|Kolya (1996)|     4|    287.0|
|   201|Kolya (1996)|     4|    287.0|
|   209|Kolya (1996)|     4|    287.0|
|    35|Kolya (1996)|     2|    287.0|
|   354|Kolya (1996)|     5|    287.0|
|   199|Kolya (1996)|     5|    287.0|
|   113|Kolya (1996)|     2|    287.0|
|     1|Kolya (1996)|     5|    287.0|
|   173|Kolya (1996)|     5|    287.0|
|   360|Kolya (1996)|     4|    287.0|
|   234|Kolya (1996)|     4|    287.0|
|    14|Kolya (1996)|     4|    287.0|
|   309|Kolya (1996)|     4|    287.0|
|   331|Kolya (1996)|     4|    287.0|
|    21|Kolya (1996)|     3|    287.0|
|   111|Kolya (1996)|    

In [42]:
# Ten most frequent title indices: number of rating rows per title_new value,
# largest first.
(
    indexed.groupBy('title_new')
    .count()
    .orderBy('count', ascending=False)
    .show(n=10, truncate=False)
)

+---------+-----+
|title_new|count|
+---------+-----+
|0.0      |583  |
|1.0      |509  |
|2.0      |508  |
|3.0      |507  |
|4.0      |485  |
|5.0      |481  |
|6.0      |478  |
|7.0      |452  |
|8.0      |431  |
|9.0      |429  |
+---------+-----+
only showing top 10 rows



## Splitting the Dataset

In [43]:
# Split the indexed ratings roughly 75% / 25% into training and test sets.
# A fixed seed keeps the split the same across Restart & Run All, so the
# train/test row counts and any metrics computed later stay comparable
# between runs (an unseeded randomSplit draws a new random seed each time).
train, test = indexed.randomSplit([0.75, 0.25], seed=42)

In [44]:
 train.count()

75065

In [45]:
# Number of rows that landed in the test split.
test.count()

24935

## Build and Train 