In [1]:
import pyspark
# Obtain the SparkContext and SparkSession used by every later cell.
# getOrCreate() returns the already-running context/session when one exists,
# so re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# DataFrame from data source

In [2]:
# Remote location of the CSV, and the file name it is looked up under once fetched.
n = 'https://raw.githubusercontent.com/sehgalayush1/movie_revenue/master/ml_modules/data/BollywoodMovieDetail.csv'
f = 'BollywoodMovieDetail.csv'

# Register the remote file with the SparkContext, then resolve its local path.
sc.addFile(n)
local_csv = pyspark.SparkFiles.get(f)

# Read the CSV using its first line as the header and letting Spark infer column types.
movies = (
    spark.read
    .option("header", True)
    .option("inferSchema", "true")
    .csv(local_csv)
)

In [3]:
# Print the column names and types Spark inferred from the CSV.
movies.printSchema()

root
 |-- imdbId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- releaseYear: integer (nullable = true)
 |-- releaseDate: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- actors: string (nullable = true)
 |-- directors: string (nullable = true)
 |-- sequel: string (nullable = true)
 |-- hitFlop: integer (nullable = true)



In [4]:
# Preview the first rows of the DataFrame as a text table.
movies.show()

+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+
|   imdbId|               title|releaseYear|releaseDate|               genre|             writers|              actors|           directors|sequel|hitFlop|
+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+
|tt0118578|              Albela|       2001|20 Apr 2001|             Romance|Honey Irani (scre...|Govinda | Aishwar...|       Deepak Sareen|     0|      2|
|tt0169102|Lagaan: Once Upon...|       2001|08 May 2002|Adventure | Drama...|Ashutosh Gowarike...|Aamir Khan | Grac...|  Ashutosh Gowariker|     0|      6|
|tt0187279|Meri Biwi Ka Jawa...|       2004|02 Jul 2004|     Action | Comedy|                 N/A|Akshay Kumar | Sr...|Pankaj Parashar |...|     0|      1|
|tt0222024|Hum Tumhare Hain ...|       2002|24 May 2002|     Dra

# DataFrame from data source and samplingRatio

In [5]:
# Re-read the same CSV, this time asking schema inference to look at only a
# fraction of the rows (samplingRatio=0.01) instead of the whole file.
sampled_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", "true")
    .option("samplingRatio", 0.01)
)
movies = sampled_reader.csv(pyspark.SparkFiles.get(f))

In [7]:
# Print the schema inferred with samplingRatio set, for comparison with the
# schema inferred without it.
movies.printSchema()

root
 |-- imdbId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- releaseYear: integer (nullable = true)
 |-- releaseDate: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- actors: string (nullable = true)
 |-- directors: string (nullable = true)
 |-- sequel: integer (nullable = true)
 |-- hitFlop: integer (nullable = true)



# DataFrame from data source with an explicit schema

In [8]:
# Explicit schema for the movies CSV, written as a DDL string (column name + SQL type).
# NOTE(review): releaseYear is declared DATE although schema inference reported it
# as integer — confirm that parsing a bare year as a date is intended.
ddl = (
    'imdbId STRING, title STRING, releaseYear DATE, releaseDate DATE, '
    'genre STRING, writers STRING, actors STRING, directors STRING, '
    'sequel INT, hitFlop INT'
)

In [10]:
# Read the CSV with the explicit DDL schema instead of inference; dateFormat
# tells the reader how date strings in the file are written.
typed_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", "false")
    .option("dateFormat", "d MMM y")
    .schema(ddl)
)
movies = typed_reader.csv(pyspark.SparkFiles.get(f))

In [11]:
# Print the schema to confirm the column types declared in the DDL string were applied.
movies.printSchema()

root
 |-- imdbId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- releaseYear: date (nullable = true)
 |-- releaseDate: date (nullable = true)
 |-- genre: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- actors: string (nullable = true)
 |-- directors: string (nullable = true)
 |-- sequel: integer (nullable = true)
 |-- hitFlop: integer (nullable = true)



In [12]:
# Preview the rows as parsed under the explicit schema.
movies.show()

+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+
|   imdbId|               title|releaseYear|releaseDate|               genre|             writers|              actors|           directors|sequel|hitFlop|
+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+
|tt0118578|              Albela| 2001-01-01| 2001-04-20|             Romance|Honey Irani (scre...|Govinda | Aishwar...|       Deepak Sareen|     0|      2|
|tt0169102|Lagaan: Once Upon...| 2001-01-01| 2002-05-08|Adventure | Drama...|Ashutosh Gowarike...|Aamir Khan | Grac...|  Ashutosh Gowariker|     0|      6|
|tt0187279|Meri Biwi Ka Jawa...| 2004-01-01| 2004-07-02|     Action | Comedy|                 N/A|Akshay Kumar | Sr...|Pankaj Parashar |...|     0|      1|
|tt0222024|Hum Tumhare Hain ...| 2002-01-01| 2002-05-24|     Dra

# Write DataFrame and reread

In [13]:
# Persist the typed DataFrame as Parquet under data/moviesFile.
# mode('overwrite') replaces output left behind by a previous run; the default
# save mode raises when the target path already exists, which breaks re-running
# the notebook from a fresh kernel.
movies.write.format('parquet').mode('overwrite').save('data/moviesFile')

In [15]:
# Load the Parquet output back into a new DataFrame, mirroring the
# format(...)/save(...) form used when it was written.
movies1 = spark.read.format('parquet').load('data/moviesFile')

In [16]:
# Print the schema of the DataFrame read back from Parquet.
movies1.printSchema()

root
 |-- imdbId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- releaseYear: date (nullable = true)
 |-- releaseDate: date (nullable = true)
 |-- genre: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- actors: string (nullable = true)
 |-- directors: string (nullable = true)
 |-- sequel: integer (nullable = true)
 |-- hitFlop: integer (nullable = true)



In [17]:
# Preview the rows of the DataFrame read back from Parquet.
movies1.show()

+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+
|   imdbId|               title|releaseYear|releaseDate|               genre|             writers|              actors|           directors|sequel|hitFlop|
+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+
|tt0118578|              Albela| 2001-01-01| 2001-04-20|             Romance|Honey Irani (scre...|Govinda | Aishwar...|       Deepak Sareen|     0|      2|
|tt0169102|Lagaan: Once Upon...| 2001-01-01| 2002-05-08|Adventure | Drama...|Ashutosh Gowarike...|Aamir Khan | Grac...|  Ashutosh Gowariker|     0|      6|
|tt0187279|Meri Biwi Ka Jawa...| 2004-01-01| 2004-07-02|     Action | Comedy|                 N/A|Akshay Kumar | Sr...|Pankaj Parashar |...|     0|      1|
|tt0222024|Hum Tumhare Hain ...| 2002-01-01| 2002-05-24|     Dra