In [62]:
import findspark
findspark.init()
import pyspark

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [63]:
spark=SparkSession.builder.appName('Assignment_4_and_5').getOrCreate()

In [64]:
#READING DATASET 
data=(spark.read.option('header','true').csv('MoviesA3.csv', sep=";" , inferSchema=True)).persist()

In [65]:
data.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Length: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Actor: string (nullable = true)
 |-- Actress: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Popularity: integer (nullable = true)
 |-- Awards: string (nullable = true)
 |-- Image: string (nullable = true)



In [66]:
data.columns

['Year',
 'Length',
 'Title',
 'Genre',
 'Actor',
 'Actress',
 'Director',
 'Popularity',
 'Awards',
 'Image']

#  <font color='green'>Assignment 4 and 5 - Spark Data Frames and Clustering</font>

### Type of Each Attribute

In [67]:
data.dtypes

[('Year', 'int'),
 ('Length', 'int'),
 ('Title', 'string'),
 ('Genre', 'string'),
 ('Actor', 'string'),
 ('Actress', 'string'),
 ('Director', 'string'),
 ('Popularity', 'int'),
 ('Awards', 'string'),
 ('Image', 'string')]

### Nominal data include country, gender, race, hair color etc. of a group of people.
### Ordinal data include having a position in class as “First” or “Second”.
### An interval scale is one where there is order and the difference between two values is meaningful. Examples of interval variables include: temperature (Farenheit), temperature (Celcius), pH, SAT score (200-800), credit score (300-850).
### A ratio variable, has all the properties of an interval variable, and also has a clear definition of 0.0. When the variable equals 0.0, there is none of that variable. Examples of ratio variables include: enzyme activity, dose amount, reaction rate, flow rate, concentration, pulse, weight, length, temperature in Kelvin (0.0 Kelvin really does mean “no heat”), survival time.

| Attribute | Type | Reason |
| ----------|------|--------------------------------------------------------------------------------------------------------- |
| Year | Interval  | Since there is a difference between two values is meaningful and hence this attribute lies in the Interval Type and can never be 0 |
| Length | Ordinal | Since it is a measurable attribute and hence this attribute lies in the Ordinal Type |
| Title | Nominal  | Since the attribute is not ordered and it lies in the category of a group of something and hence this attribute lies in the Nominal Type |
| Genre | Nominal  | Since the attribute is not ordered and it lies in the category of a group of something and hence this attribute lies in the Nominal Type |
| Actor | Nominal  | Since the attribute is not ordered and it lies in the category of a group of something and hence this attribute lies in the Nominal Type |
| Actress | Nominal  | Since the attribute is not ordered and it lies in the category of a group of something and hence this attribute lies in the Nominal Type |
| Director | Nominal  | Since the attribute is not ordered and it lies in the category of a group of something and hence this attribute lies in the Nominal Type |
| Popularity | Ratio  | Since the attribute has all the properties of an interval variable, and also has a clear definition of 0.0 and hence this attribute lies in the Ratio Type |
| Awards | Nominal  | Since the attribute is not ordered and it lies in the category of a group of something and hence this attribute lies in the Nominal Type |
Image | Nominal  | Since the attribute is not ordered and it lies in the category of a group of something and hence this attribute lies in the Nominal Type |

### Pre-Processing

In [68]:
data.filter(data.Year.isNull()).show()
data.filter(data.Year.isNull()).count()

+----+------+-----+-----+-----+-------+--------+----------+------+-----+
|Year|Length|Title|Genre|Actor|Actress|Director|Popularity|Awards|Image|
+----+------+-----+-----+-----+-------+--------+----------+------+-----+
+----+------+-----+-----+-----+-------+--------+----------+------+-----+



0

In [69]:
data.filter(data.Length.isNull()).show()
data.filter(data.Length.isNull()).count()

+----+------+--------------------+-------+--------------------+--------------------+--------------------+----------+------+-------------------+
|Year|Length|               Title|  Genre|               Actor|             Actress|            Director|Popularity|Awards|              Image|
+----+------+--------------------+-------+--------------------+--------------------+--------------------+----------+------+-------------------+
|1980|  null|Happy Birthday to Me| Horror|         Ford, Glenn|Anderson, Melissa...|    Thompson, J. Lee|        88|    No|    glennFord.png,,|
|1962|  null|              Dr. No| Action|       Connery, Sean|     Andress, Ursula|      Young, Terence|         7|    No|  seanConnery.png,,|
|1989|  null|             Killjoy|Mystery|        Culp, Robert|       Basinger, Kim|Moxey, John Llewe...|        71|    No| NicholasCage.png,,|
|1982|  null|Catch a Rising St...| Comedy|     Belzer, Richard|        Benatar, Pat|                null|        18|    No| NicholasCage

67

In [70]:
data.filter(data.Genre.isNull()).show()
data.filter(data.Genre.isNull()).count()

+----+------+---------------+-----+-----------------+----------------+----------------+----------+------+------------------+
|Year|Length|          Title|Genre|            Actor|         Actress|        Director|Popularity|Awards|             Image|
+----+------+---------------+-----+-----------------+----------------+----------------+----------+------+------------------+
|1953|    61|White Lightning| null|Clements, Stanley|Blondell, Gloria|  Bernds, Edward|      null|    No|NicholasCage.png,,|
|1980|   180|     Wild Times| null|     Elliott, Sam|   Peyser, Penny|Compton, Richard|        75|    No|NicholasCage.png,,|
+----+------+---------------+-----+-----------------+----------------+----------------+----------+------+------------------+



2

In [71]:
data.filter(data.Actor.isNull()).show()
data.filter(data.Actor.isNull()).count()

+----+------+--------------------+------+-----+-------+----------------+----------+------+--------------------+
|Year|Length|               Title| Genre|Actor|Actress|        Director|Popularity|Awards|               Image|
+----+------+--------------------+------+-----+-------+----------------+----------+------+--------------------+
|1988|   100|Ciao Italia, Mado...| Music| null|Madonna|De Winter, Harry|        74|    No| NicholasCage.png,,,|
|1991|   118|Madonna, Truth or...| Music| null|Madonna|Keshishian, Alek|        54|    No| NicholasCage.png,,,|
|1990|    60|Immaculate Collec...| Music| null|Madonna|            null|        32|    No|NicholasCage.png,,,,|
|1987|    50|Madonna Live, The...| Music| null|Madonna|            null|        75|    No|NicholasCage.png,,,,|
|1990|     5|Madonna, Justify ...| Music| null|Madonna|            null|        77|    No|NicholasCage.png,,,,|
|1991|    16|Madonna, Like a V...| Music| null|Madonna|            null|        63|    No|NicholasCage.p

8

In [72]:
data.filter(data.Actress.isNull()).show()
data.filter(data.Actress.isNull()).count()

+----+------+--------------------+-------+--------------------+-------+--------------------+----------+------+--------------------+
|Year|Length|               Title|  Genre|               Actor|Actress|            Director|Popularity|Awards|               Image|
+----+------+--------------------+-------+--------------------+-------+--------------------+----------+------+--------------------+
|1953|    94|           Vera Cruz| Action|        Cooper, Gary|   null|     Aldrich, Robert|        71|    No| NicholasCage.png,,,|
|1954|    91|              Apache|Western|     Lancaster, Burt|   null|     Aldrich, Robert|        78|    No|burtLancaster.png,,,|
|1977|   146|Twilight's Last G...|  Drama|     Lancaster, Burt|   null|     Aldrich, Robert|        84|    No|burtLancaster.png,,,|
|1979|   119|     Frisco Kid, The| Comedy|        Wilder, Gene|   null|     Aldrich, Robert|        10|    No|  NicholasCage.png,,|
|1954|    30|   Bank on the Stars|  Drama|          Paar, Jack|   null|     

378

In [73]:
data.filter(data.Director.isNull()).show()
data.filter(data.Director.isNull()).count()

+----+------+--------------------+-------+-------------------+----------------+--------+----------+------+--------------------+
|Year|Length|               Title|  Genre|              Actor|         Actress|Director|Popularity|Awards|               Image|
+----+------+--------------------+-------+-------------------+----------------+--------+----------+------+--------------------+
|1992|    90|           Germicide|  Drama|        Taylor, Rod| Andersson, Bibi|    null|        36|    No| NicholasCage.png,,,|
|1954|   103|       Elephant Walk|  Drama|       Finch, Peter|   Andrews, Dana|    null|        11|    No| NicholasCage.png,,,|
|1985|    55|Gonzo Presents Mu...| Comedy|       Cleese, John|  Andrews, Julie|    null|        88|    No| NicholasCage.png,,,|
|1991|   115|              Bataan|    War|     Taylor, Robert|     Arnaz, Desi|    null|        68|    No| NicholasCage.png,,,|
|1991|    90|Henry, Portrait o...| Horror|    Rooker, Michael|   Arnold, Tracy|    null|        69|    N

253

In [74]:
data.filter(data.Popularity.isNull()).show()
data.filter(data.Popularity.isNull()).count()

+----+------+--------------------+-------+--------------------+----------------+-------------------+----------+------+-------------------+
|Year|Length|               Title|  Genre|               Actor|         Actress|           Director|Popularity|Awards|              Image|
+----+------+--------------------+-------+--------------------+----------------+-------------------+----------+------+-------------------+
|1953|    61|     White Lightning|   null|   Clements, Stanley|Blondell, Gloria|     Bernds, Edward|      null|    No| NicholasCage.png,,|
|1986|    90|  Knights & Emeralds|  Drama|    Leadbitter, Bill|  Hills, Beverly|          Emes, Ian|      null|    No| NicholasCage.png,,|
|1927|    62|      Drop Kick, The|  Drama|Barthelmess, Richard|   Kent, Barbara|      Webb, Millard|      null|    No|  NicholasCage.png,|
|1923|    57|        Desert Rider|Western|         Hoxie, Jack|  Nelson, Evelyn|Bradbury, Robert N.|      null|    No| NicholasCage.png,,|
|1954|    30|   Bank on the

6

In [75]:
data.filter(data.Awards.isNull()).show()
data.filter(data.Awards.isNull()).count()

+----+------+-----+-----+-----+-------+--------+----------+------+-----+
|Year|Length|Title|Genre|Actor|Actress|Director|Popularity|Awards|Image|
+----+------+-----+-----+-----+-------+--------+----------+------+-----+
+----+------+-----+-----+-----+-------+--------+----------+------+-----+



0

In [76]:
data.filter(data.Image.isNull()).show()
data.filter(data.Image.isNull()).count()

+----+------+-----+-----+-----+-------+--------+----------+------+-----+
|Year|Length|Title|Genre|Actor|Actress|Director|Popularity|Awards|Image|
+----+------+-----+-----+-----+-------+--------+----------+------+-----+
+----+------+-----+-----+-----+-------+--------+----------+------+-----+



0

## After checking for NULL values in the dataframe we have found out that "Length" has 67 null values, "Genre" has 2 null values, "Actor" has 8 null values, "Actress" has 378 Null values, "Director" has 253 Null values and "Popularity" has 6 null values, the rest has no null values.
## Now I will impute those null values using different techniques

1. Length 
#### Since the movies with the same genre has almost the same length on average, so I have decided to take the mean of length of each Genre and impute the values for the Length Attribute for the Genre it has

In [77]:
# First step is to check the Genre we need to calculate the mean of the movies length for
data.select('Genre').filter(data.Length.isNull()).distinct().show()

+--------+
|   Genre|
+--------+
|   Drama|
|     War|
| Mystery|
|   Music|
|  Horror|
| Western|
|  Comedy|
|  Action|
|Westerns|
+--------+



In [78]:
data.filter(data.Length.isNull()).count()

67

In [79]:
from pyspark.sql.functions import mean as _mean, col

drama = data.select(data.Length).filter(data.Genre == 'Drama')

df_stats = drama.select(
    _mean(col('Length')).alias('mean')
).collect()

mean_drama = df_stats[0]['mean']
mean_drama

113.30455259026688

In [80]:
war = data.select(data.Length).filter(data.Genre == 'War')

df_stats = war.select(
    _mean(col('Length')).alias('mean')
).collect()

mean_war = df_stats[0]['mean']
mean_war

116.90625

In [81]:
mystery = data.select(data.Length).filter(data.Genre == 'Mystery')

df_stats = mystery.select(
    _mean(col('Length')).alias('mean')
).collect()

mean_mystery = df_stats[0]['mean']
mean_mystery

103.00990099009901

In [82]:
music = data.select(data.Length).filter(data.Genre == 'Music')

df_stats = music.select(
    _mean(col('Length')).alias('mean')
).collect()

mean_music = df_stats[0]['mean']
mean_music

100.48780487804878

In [83]:
horror = data.select(data.Length).filter(data.Genre == 'Horror')

df_stats = horror.select(
    _mean(col('Length')).alias('mean')
).collect()

mean_horror = df_stats[0]['mean']
mean_horror

93.92727272727272

In [84]:
western = data.select(data.Length).filter(data.Genre == 'Western')

df_stats = western.select(
    _mean(col('Length')).alias('mean')
).collect()

mean_western = df_stats[0]['mean']
mean_western

93.0091743119266

In [85]:
comedy = data.select(data.Length).filter(data.Genre == 'Comedy')

df_stats = comedy.select(
    _mean(col('Length')).alias('mean')
).collect()

mean_comedy = df_stats[0]['mean']
mean_comedy

96.50540540540541

In [86]:
action = data.select(data.Length).filter(data.Genre == 'Action')

df_stats = action.select(
    _mean(col('Length')).alias('mean')
).collect()

mean_action = df_stats[0]['mean']
mean_action

104.5

In [87]:
westerns = data.select(data.Length).filter(data.Genre == 'Westerns')

df_stats = westerns.select(
    _mean(col('Length')).alias('mean')
).collect()

mean_westerns = df_stats[0]['mean']
mean_westerns

124.8

In [88]:
mean_drama

113.30455259026688

In [89]:
data.filter(data.Length.isNull()).filter(data.Genre == 'Drama').na.fill(value = mean_drama,subset=["Length"]).show()

+----+------+--------------------+-----+--------------------+-----------------+-------------------+----------+------+--------------------+
|Year|Length|               Title|Genre|               Actor|          Actress|           Director|Popularity|Awards|               Image|
+----+------+--------------------+-----+--------------------+-----------------+-------------------+----------+------+--------------------+
|1989|   113|         Good Fellas|Drama|     De Niro, Robert| Bracco, Lorraine|   Scorsese, Martin|        15|    No|  NicholasCage.png,,|
|1943|   113|     A Guy Named Joe|Drama|      Tracy, Spencer|     Dunne, Irene|    Fleming, Victor|        42|    No|  spencerTracy.png,,|
|1991|   113|         Coming Home|Drama|         Voight, Jon|      Fonda, Jane|               null|         1|   Yes| NicholasCage.png,,,|
|1988|   113|        Five Corners|Drama|        Robbins, Tim|    Foster, Jodie|               null|        88|    No| NicholasCage.png,,,|
|1955|   113|Blackboard Jun

In [90]:
mean_war

116.90625

In [91]:
data.filter(data.Length.isNull()).filter(data.Genre == 'War').na.fill(value = mean_war,subset=["Length"]).show()

+----+------+--------------------+-----+-----------+-------+--------+----------+------+-----------------+
|Year|Length|               Title|Genre|      Actor|Actress|Director|Popularity|Awards|            Image|
+----+------+--------------------+-----+-----------+-------+--------+----------+------+-----------------+
|1991|   116|John Wayne Collec...|  War|Wayne, John|   null|    null|        49|    No|  johnWayne.png,,|
|1992|   116|John Wayne Collec...|  War|Wayne, John|   null|    null|         3|    No|johnWayne.png,,,,|
+----+------+--------------------+-----+-----------+-------+--------+----------+------+-----------------+



In [92]:
mean_mystery

103.00990099009901

In [93]:
data.filter(data.Length.isNull()).filter(data.Genre == 'Mystery').na.fill(value = mean_mystery,subset=["Length"]).show()

+----+------+------------------+-------+---------------+---------------+--------------------+----------+------+-------------------+
|Year|Length|             Title|  Genre|          Actor|        Actress|            Director|Popularity|Awards|              Image|
+----+------+------------------+-------+---------------+---------------+--------------------+----------+------+-------------------+
|1989|   103|           Killjoy|Mystery|   Culp, Robert|  Basinger, Kim|Moxey, John Llewe...|        71|    No| NicholasCage.png,,|
|1988|   103|  White of the Eye|Mystery|   Keith, David|Moriarty, Cathy|     Cammell, Donald|        48|    No| NicholasCage.png,,|
|1988|   103|Still of the Night|Mystery|  Scheider, Roy|  Streep, Meryl|      Benton, Robert|        42|    No|  merylStreep.png,,|
|1992|   103| Mummy's Hand, The|Mystery|    Foran, Dick|           null|                null|        54|    No|NicholasCage.png,,,|
|1989|   103|        Saint, The|Mystery|   Moore, Roger|           null|    

In [94]:
mean_music

100.48780487804878

In [95]:
data.filter(data.Length.isNull()).filter(data.Genre == 'Music').na.fill(value = mean_music,subset=["Length"]).show()

+----+------+--------------------+-----+----------+-------------+--------+----------+------+-------------------+
|Year|Length|               Title|Genre|     Actor|      Actress|Director|Popularity|Awards|              Image|
+----+------+--------------------+-----+----------+-------------+--------+----------+------+-------------------+
|1984|   100|Gary Numan - Berz...|Music|Webb, John|Taylor, Karen|    null|        60|    No|NicholasCage.png,,,|
|1990|   100|Sounds of the Sev...|Music|Jones, Tom|Welch, Raquel|    null|        13|    No|NicholasCage.png,,,|
+----+------+--------------------+-----+----------+-------------+--------+----------+------+-------------------+



In [96]:
mean_horror

93.92727272727272

In [97]:
data.filter(data.Length.isNull()).filter(data.Genre == 'Horror').na.fill(value = mean_horror,subset=["Length"]).show()

+----+------+--------------------+------+--------------+--------------------+----------------+----------+------+-------------------+
|Year|Length|               Title| Genre|         Actor|             Actress|        Director|Popularity|Awards|              Image|
+----+------+--------------------+------+--------------+--------------------+----------------+----------+------+-------------------+
|1980|    93|Happy Birthday to Me|Horror|   Ford, Glenn|Anderson, Melissa...|Thompson, J. Lee|        88|    No|    glennFord.png,,|
|1988|    93|  Mama's Dirty Girls|Horror|Currie, Sondra|     Grahame, Gloria|            null|        62|    No|NicholasCage.png,,,|
+----+------+--------------------+------+--------------+--------------------+----------------+----------+------+-------------------+



In [98]:
mean_western

93.0091743119266

In [99]:
data.filter(data.Length.isNull()).filter(data.Genre == 'Western').na.fill(value = mean_western,subset=["Length"]).show()

+----+------+--------------------+-------+--------------+--------------+--------------+----------+------+-------------------+
|Year|Length|               Title|  Genre|         Actor|       Actress|      Director|Popularity|Awards|              Image|
+----+------+--------------------+-------+--------------+--------------+--------------+----------+------+-------------------+
|1931|    93|     Range Feud, The|Western|   Wayne, John|Fleming, Susan|Lederman, Ross|        51|    No|     johnWayne.png,|
|1989|    93|Death Valley Days...|Western|   Caan, James|          null|          null|         9|    No|NicholasCage.png,,,|
|1989|    93|Death Valley Days...|Western|Reagan, Ronald|          null|          null|         1|    No|NicholasCage.png,,,|
|1993|    93|Duke, The Films o...|Western|   Wayne, John|          null|          null|        70|    No|   johnWayne.png,,,|
|1932|    93|   Hurricane Express|Western|   Wayne, John|          null|          null|         7|    No|  johnWayne.p

In [100]:
mean_comedy

96.50540540540541

In [101]:
data.filter(data.Length.isNull()).filter(data.Genre == 'Comedy').na.fill(value = mean_comedy,subset=["Length"]).show()

+----+------+--------------------+------+--------------------+-----------------+------------------+----------+------+--------------------+
|Year|Length|               Title| Genre|               Actor|          Actress|          Director|Popularity|Awards|               Image|
+----+------+--------------------+------+--------------------+-----------------+------------------+----------+------+--------------------+
|1982|    96|Catch a Rising St...|Comedy|     Belzer, Richard|     Benatar, Pat|              null|        18|    No|  NicholasCage.png,,|
|1987|    96|           Mannequin|Comedy|    McCarthy, Andrew|    Cattrall, Kim| Gottlieb, Michael|        23|    No|  NicholasCage.png,,|
|1990|    96|          Party, The|Comedy|      Sellers, Peter|  Champion, Marge|    Edwards, Blake|        32|    No|   NicholasCage.png,|
|1991|    96|             Why Me?|Comedy|Lambert, Christopher|      Greist, Kim|              null|        74|    No| NicholasCage.png,,,|
|1990|    96|        Crazy 

In [102]:
mean_action

104.5

In [103]:
data.filter(data.Length.isNull()).filter(data.Genre == 'Action').na.fill(value = mean_action,subset=["Length"]).show()

+----+------+--------------------+------+-----------------+-------------------+--------------------+----------+------+--------------------+
|Year|Length|               Title| Genre|            Actor|            Actress|            Director|Popularity|Awards|               Image|
+----+------+--------------------+------+-----------------+-------------------+--------------------+----------+------+--------------------+
|1962|   104|              Dr. No|Action|    Connery, Sean|    Andress, Ursula|      Young, Terence|         7|    No|   seanConnery.png,,|
|1983|   104|Moon in the Gutte...|Action|Depardieu, G�rard|  Kinski, Nastassia|Beineix, Jean-Jac...|        29|    No|   NicholasCage.png,|
|1992|   104|     Pleasure Palace|Action|     Sharif, Omar|Principal, Victoria|                null|        45|    No| NicholasCage.png,,,|
|1990|   104|Hells Angels on W...|Action|  Nicholson, Jack|    Scharf, Sabrina|                null|         1|    No| NicholasCage.png,,,|
|1990|   104|      L

In [104]:
mean_westerns

124.8

In [105]:
data.filter(data.Length.isNull()).filter(data.Genre == 'Westerns').na.fill(value = mean_westerns,subset=["Length"]).show()

+----+------+--------------------+--------+---------------+-------+--------+----------+------+--------------------+
|Year|Length|               Title|   Genre|          Actor|Actress|Director|Popularity|Awards|               Image|
+----+------+--------------------+--------+---------------+-------+--------+----------+------+--------------------+
|1991|   124|Clint Eastwood Co...|Westerns|Eastwood, Clint|   null|    null|        11|    No|clintEastwood.png,,,|
+----+------+--------------------+--------+---------------+-------+--------+----------+------+--------------------+



2. Genre 
#### Genre has 2 missing values here I will check the Director and mode of the movies he/she directs and put it there

In [106]:
# First step is to check the Director we need to calculate the mode of the movies Genre for
data.select('Director').filter(data.Genre.isNull()).distinct().show()

+----------------+
|        Director|
+----------------+
|Compton, Richard|
|  Bernds, Edward|
+----------------+



In [124]:
from pyspark.sql.functions import col

cnts = data.filter(data.Director == 'Compton, Richard').groupBy("Genre").count()
cnts.show()

data.filter(data.Director == 'Compton, Richard').show()
cnts = data.filter(data.Director == 'Bernds, Edward').groupBy("Genre").count()
cnts.show()
data.filter(data.Director == 'Bernds, Edward').show()

#Since we only had one value here we can't use Director hence I have decided to drop these two values as only two values out of 1659 would not affect the dataset 

+-----+-----+
|Genre|count|
+-----+-----+
| null|    1|
+-----+-----+

+----+------+----------+-----+------------+-------------+----------------+----------+------+------------------+
|Year|Length|     Title|Genre|       Actor|      Actress|        Director|Popularity|Awards|             Image|
+----+------+----------+-----+------------+-------------+----------------+----------+------+------------------+
|1980|   180|Wild Times| null|Elliott, Sam|Peyser, Penny|Compton, Richard|        75|    No|NicholasCage.png,,|
+----+------+----------+-----+------------+-------------+----------------+----------+------+------------------+

+-----+-----+
|Genre|count|
+-----+-----+
| null|    1|
+-----+-----+

+----+------+---------------+-----+-----------------+----------------+--------------+----------+------+------------------+
|Year|Length|          Title|Genre|            Actor|         Actress|      Director|Popularity|Awards|             Image|
+----+------+---------------+-----+----------------

In [128]:
# Dropping Null Values
print(data.na.drop(subset=["Genre"]).count())
data.na.drop(subset=["Genre"]).filter(data.Genre.isNull()).count()

1657


0

3. Actor
#### Actor has 8 missing values here I will check the Genre where the Actor is same and for other values and put it in there

In [155]:
# First step is to check the Director we need to calculate the mode of the movies Genre for
data.filter(data.Actor.isNull()).show()
# Since we have Null values for the Actors working in Music Genre so I am trying to get the mode of the Actor who worked in Music Genre
data.select('Actor').filter(data.Genre == 'Music').distinct().show()
music_actors=data.select('*').filter(data.Genre == 'Music').groupBy('Actor').count().sort((col('count')).desc())
music_actors.show()

+----+------+--------------------+------+-----+-------+----------------+----------+------+--------------------+
|Year|Length|               Title| Genre|Actor|Actress|        Director|Popularity|Awards|               Image|
+----+------+--------------------+------+-----+-------+----------------+----------+------+--------------------+
|1988|   100|Ciao Italia, Mado...| Music| null|Madonna|De Winter, Harry|        74|    No| NicholasCage.png,,,|
|1991|   118|Madonna, Truth or...| Music| null|Madonna|Keshishian, Alek|        54|    No| NicholasCage.png,,,|
|1990|    60|Immaculate Collec...| Music| null|Madonna|            null|        32|    No|NicholasCage.png,,,,|
|1987|    50|Madonna Live, The...| Music| null|Madonna|            null|        75|    No|NicholasCage.png,,,,|
|1990|     5|Madonna, Justify ...| Music| null|Madonna|            null|        77|    No|NicholasCage.png,,,,|
|1991|    16|Madonna, Like a V...| Music| null|Madonna|            null|        63|    No|NicholasCage.p

In [158]:
# Taking mode of Actor with most movies and adding it to the null values
actors=data.groupBy('Actor').count().sort((col('count')).desc())
actors.show()

maximum_val=actors.agg({"count": "max"}).first()[0]

actor_with_most_movies=actors.select('Actor').filter((col('count')==maximum_val)).collect()[0][0] # Mode

data.filter(data.Actor.isNull()).na.fill(value = actor_with_most_movies,subset=["Actor"]).show()

+-----------------+-----+
|            Actor|count|
+-----------------+-----+
|      Wayne, John|   81|
|  Eastwood, Clint|   31|
|    Connery, Sean|   30|
|     Newman, Paul|   27|
|  Lancaster, Burt|   24|
|   Brando, Marlon|   24|
|   Tracy, Spencer|   24|
|      Ford, Glenn|   23|
|   Sellers, Peter|   23|
|Depardieu, G�rard|   18|
|  Nicholson, Jack|   17|
|Dreyfuss, Richard|   15|
|     Moore, Roger|   15|
| Douglas, Michael|   15|
|    Moore, Dudley|   13|
|  De Niro, Robert|   13|
| Hopkins, Anthony|   12|
|     Cleese, John|   12|
|     Allen, Woody|   11|
|  Williams, Robin|   11|
+-----------------+-----+
only showing top 20 rows

+----+------+--------------------+------+-----------+-------+----------------+----------+------+--------------------+
|Year|Length|               Title| Genre|      Actor|Actress|        Director|Popularity|Awards|               Image|
+----+------+--------------------+------+-----------+-------+----------------+----------+------+------------------

4. Actress
#### Actress has 378 missing values here I will check with the same technique as used for Actor and then see what can we do to impute the values first

In [161]:
# First step is to check the Director we need to calculate the mode of the movies Genre for
data.filter(data.Actress.isNull()).show()
# Since we have Null values for the Actors working in Music Genre so I am trying to get the mode of the Actor who worked in Music Genre
data.select('Genre').filter(data.Actress.isNull()).distinct().show()

+----+------+--------------------+-------+--------------------+-------+--------------------+----------+------+--------------------+
|Year|Length|               Title|  Genre|               Actor|Actress|            Director|Popularity|Awards|               Image|
+----+------+--------------------+-------+--------------------+-------+--------------------+----------+------+--------------------+
|1953|    94|           Vera Cruz| Action|        Cooper, Gary|   null|     Aldrich, Robert|        71|    No| NicholasCage.png,,,|
|1954|    91|              Apache|Western|     Lancaster, Burt|   null|     Aldrich, Robert|        78|    No|burtLancaster.png,,,|
|1977|   146|Twilight's Last G...|  Drama|     Lancaster, Burt|   null|     Aldrich, Robert|        84|    No|burtLancaster.png,,,|
|1979|   119|     Frisco Kid, The| Comedy|        Wilder, Gene|   null|     Aldrich, Robert|        10|    No|  NicholasCage.png,,|
|1954|    30|   Bank on the Stars|  Drama|          Paar, Jack|   null|     

In [165]:
drama_actresses=data.select('*').filter((data.Genre == 'Drama') & (data.Actress.isNotNull())).groupBy('Actress').count().sort((col('count')).desc())
drama_actresses.show()
war_actresses=data.select('*').filter((data.Genre == 'War') & (data.Actress.isNotNull())).groupBy('Actress').count().sort((col('count')).desc())
war_actresses.show()
mystery_actresses=data.select('*').filter((data.Genre == 'Mystery') & (data.Actress.isNotNull())).groupBy('Actress').count().sort((col('count')).desc())
mystery_actresses.show()
music_actresses=data.select('*').filter((data.Genre == 'Music') & (data.Actress.isNotNull())).groupBy('Actress').count().sort((col('count')).desc())
music_actresses.show()
sci_fi_actresses=data.select('*').filter((data.Genre == 'Science Fiction') & (data.Actress.isNotNull())).groupBy('Actress').count().sort((col('count')).desc())
sci_fi_actresses.show()
horror_actresses=data.select('*').filter((data.Genre == 'Horror') & (data.Actress.isNotNull())).groupBy('Actress').count().sort((col('count')).desc())
horror_actresses.show()
western_actresses=data.select('*').filter((data.Genre == 'Western') & (data.Actress.isNotNull())).groupBy('Actress').count().sort((col('count')).desc())
western_actresses.show()
comedy_actresses=data.select('*').filter((data.Genre == 'Comedy') & (data.Actress.isNotNull())).groupBy('Actress').count().sort((col('count')).desc())
comedy_actresses.show()
action_actresses=data.select('*').filter((data.Genre == 'Action') & (data.Actress.isNotNull())).groupBy('Actress').count().sort((col('count')).desc())
action_actresses.show()
westerns_actresses=data.select('*').filter((data.Genre == 'Westerns') & (data.Actress.isNotNull())).groupBy('Actress').count().sort((col('count')).desc())
westerns_actresses.show()

+------------------+-----+
|           Actress|count|
+------------------+-----+
|   Bergman, Ingrid|   23|
|      Garbo, Greta|   17|
| Taylor, Elizabeth|   16|
|     Loren, Sophia|   13|
|     Streep, Meryl|   12|
|Hepburn, Katharine|    7|
|  Woodward, Joanne|    6|
|      Field, Sally|    6|
|     Dunaway, Faye|    6|
|     Hannah, Daryl|    5|
|  Bardot, Brigitte|    5|
|   Andersson, Bibi|    5|
|       Ullman, Liv|    5|
|    Hayworth, Rita|    5|
|   McGillis, Kelly|    5|
|     Keaton, Diane|    4|
|      Hawn, Goldie|    4|
|      Aimee, Anouk|    4|
|   O'Hara, Maureen|    4|
|     Basinger, Kim|    4|
+------------------+-----+
only showing top 20 rows

+------------------+-----+
|           Actress|count|
+------------------+-----+
|    Neal, Patricia|    1|
|   Calvet, Corinne|    1|
|   Brewster, Diane|    1|
|     Mason, Marsha|    1|
|  Bardot, Brigitte|    1|
|   Kumagai, Denice|    1|
|        Bow, Clara|    1|
|    Bacall, Lauren|    1|
|       Arnaz, Desi|    1|
| 

In [None]:
# Taking mode of Actor with most movies and adding it to the null values
actors=data.groupBy('Actress').count().sort((col('count')).desc())
actors.show()

maximum_val=actors.agg({"count": "max"}).first()[0]

actor_with_most_movies=actors.select('Actor').filter((col('count')==maximum_val)).collect()[0][0] # Mode

data.filter(data.Actor.isNull()).na.fill(value = actor_with_most_movies,subset=["Actor"]).show()

### Question # 5: Find the average length of the movies of each genre.

In [11]:
data.groupBy(data.Genre).avg('Length').alias('Average Length').show(10) # Can't use Avg sum or any other function before group by just like in sql writing this down to remember next time

+---------------+------------------+
|          Genre|       avg(Length)|
+---------------+------------------+
|          Crime|              66.0|
|        Romance|             127.0|
|      Adventure|             119.0|
|           null|             120.5|
|          Drama|113.30455259026688|
|            War|         116.90625|
|        Fantasy|             102.0|
|        Mystery|103.00990099009901|
|          Music|100.48780487804878|
|Science Fiction|106.47368421052632|
+---------------+------------------+
only showing top 10 rows



### Question # 6: Find the actor and actress pair who have acted in more than three Comedies together.

In [12]:
from pyspark.sql.functions import col
data.filter(data.Genre=='Comedy').groupBy('Actor','Actress').count().alias('Count').filter((col('count')>3) & (data.Actor.isNotNull() & data.Actress.isNotNull())).show()

+--------------+------------------+-----+
|         Actor|           Actress|count|
+--------------+------------------+-----+
|  Allen, Woody|     Keaton, Diane|    5|
|Tracy, Spencer|Hepburn, Katharine|    6|
+--------------+------------------+-----+



### Question # 7: Find the names of actors who acted in movies of both ‘Comedy’ and ‘Drama’ Genre.

In [13]:
data.select('Actor').filter(data.Genre=='Comedy').distinct().intersect(data.select('Actor').filter(data.Genre=='Drama').distinct()).show()

+--------------------+
|               Actor|
+--------------------+
|      Boyer, Charles|
|      Taylor, Robert|
|        Lemmon, Jack|
|       Aiello, Danny|
|      Stewart, James|
|    Malmsten, Birger|
|         Taylor, Rod|
|       Russell, Kurt|
|     Hopkins, Harold|
|      Beatty, Warren|
|       Irons, Jeremy|
|        Fonda, Henry|
|      Ford, Harrison|
|   Depardieu, G�rard|
|     Eastwood, Clint|
|      Howard, Arliss|
|        Cleese, John|
|Lambert, Christopher|
|        Johnson, Ben|
|       Connery, Sean|
+--------------------+
only showing top 20 rows



### Question # 8: Find the names of actors who acted in movies of both ‘Comedy’ or ‘Drama’ Genre.

In [14]:
data.select('Actor').filter(data.Genre=='Comedy').distinct().union(data.select('Actor').filter(data.Genre=='Drama').distinct()).show()

+-----------------+
|            Actor|
+-----------------+
|  Matthau, Walter|
|Keillor, Garrison|
|Gauthier, Vincent|
|   Boyer, Charles|
|Finneran, Siohban|
|     Keach, James|
|Nagase, Masatoshi|
|   Taylor, Robert|
|    Mandel, Howie|
|     Chase, Chevy|
|   Wilson, George|
|      Arkin, Alan|
|     Lemmon, Jack|
|Banderas, Antonio|
|     Most, Donald|
|    Jaglom, Henry|
|    Rey, Fernando|
|    Aiello, Danny|
|     Reiner, Carl|
|     Cooper, Gary|
+-----------------+
only showing top 20 rows



### Question # 9: Find the names of actors who did not acted in any ‘Comedy’.

In [15]:
data.select('Actor').filter(data.Genre!='Comedy').distinct().show()

+--------------------+
|               Actor|
+--------------------+
|      Din, Ayub Khan|
|        Ferrer, Jose|
|       Harrison, Rex|
|      McCleery, Gary|
|         Busey, Gary|
|      Boyer, Charles|
|       Kime, Jeffrey|
|     McDowall, Roddy|
|          Race, Hugo|
|Luckinbill, Laurence|
|      Taylor, Robert|
|       Karyo, Tcheky|
|          Webb, John|
|         Ganus, Paul|
|       Longden, John|
|       Fisher, Eddie|
|        Keach, Stacy|
|   Bergen, Robert D.|
|    Sj�str�m, Victor|
|         Morrow, Vic|
+--------------------+
only showing top 20 rows



### Question # 10: Find each actor, find the mean, max, min ranking of his movies.

In [16]:
from pyspark.sql.functions import mean as _mean, col
from pyspark.sql import functions as f
data.groupBy("Actor").agg(_mean("Popularity") ,f.max("Popularity") ,f.min("Popularity")).filter(data.Actor.isNotNull()).orderBy((data.Actor).asc()).show()

+--------------------+-----------------+---------------+---------------+
|               Actor|  avg(Popularity)|max(Popularity)|min(Popularity)|
+--------------------+-----------------+---------------+---------------+
|        Abel, Alfred|             49.0|             49|             49|
|  Abraham, F. Murray|              6.0|              6|              6|
|    Adolphson, Edvin|             49.0|             72|             26|
|       Aherne, Brian|             57.0|             57|             57|
|     Ahlstedt, B�rje|             81.0|             81|             81|
|       Aiello, Danny|             12.5|             20|              5|
|         Akan, Tarik|             53.0|             53|             53|
|    Albaic�n, Rafael|             55.0|             55|             55|
|      Albert, Edward|             82.0|             82|             82|
|          Alda, Alan|             32.5|             53|             12|
|         Allen, Bill|             75.0|           

### Question # 11: List the number of movies released in each decade starting from 1960’s.

In [17]:
row1 = data.agg({"Year": "max"}).collect()[0]
print('Maximum Year to Determine total Decade is :',max(row1))
print("")
dataCollect=data.rdd.toLocalIterator()
first_decade=0
second_decade=0
third_decade=0
fourth_decade=0

for row in dataCollect:
    if (row['Year']>=1960 and row['Year']<=1969):
        first_decade=first_decade+1
    elif (row['Year']>=1970 and row['Year']<=1979):
        second_decade=second_decade+1
    elif (row['Year']>=1980 and row['Year']<=1989):
        third_decade=third_decade+1
    elif (row['Year']>=1990 and row['Year']<=1999):
        fourth_decade=fourth_decade+1

print("Number of movies in Decade Between 1960-1969 is : ",first_decade)
print("Number of movies in Decade Between 1970-1979 is : ",second_decade)
print("Number of movies in Decade Between 1980-1989 is : ",third_decade)
print("Number of movies in Decade Between 1990-1999 is : ",fourth_decade)
print("")
print("To Prove this is correct we check by following:-")
print("The sum of all decades = ",first_decade+second_decade+third_decade+fourth_decade)
print("The sum through pyspark query = ",data.filter(data.Year>=1960).count())


Maximum Year to Determine total Decade is : 1997

Number of movies in Decade Between 1960-1969 is :  154
Number of movies in Decade Between 1970-1979 is :  244
Number of movies in Decade Between 1980-1989 is :  607
Number of movies in Decade Between 1990-1999 is :  345

To Prove this is correct we check by following:-
The sum of all decades =  1350
The sum through pyspark query =  1350


### Question # 12: Find the number of movies released in each year.

In [18]:
data.groupBy("Year").count().show()

+----+-----+
|Year|count|
+----+-----+
|1959|    9|
|1990|  105|
|1975|   23|
|1977|   29|
|1924|    3|
|1974|   23|
|1927|    3|
|1955|   20|
|1978|   18|
|1925|    1|
|1961|   14|
|1942|    6|
|1944|    4|
|1939|   11|
|1952|   10|
|1956|   15|
|1934|    4|
|1988|   96|
|1997|    1|
|1968|   20|
+----+-----+
only showing top 20 rows



### Question # 13: Find the number of movies released in each year of each genre. Consider only the movies with length greater than 100 minutes.

In [19]:
data.filter(data.Length>100).groupBy("Year","Genre").count().show()

+----+---------------+-----+
|Year|          Genre|count|
+----+---------------+-----+
|1989|         Action|   10|
|1988|         Comedy|   10|
|1966|          Drama|    6|
|1985|         Comedy|    6|
|1957|        Western|    1|
|1928|          Drama|    2|
|1970|        Western|    2|
|1975|          Drama|    5|
|1969|         Action|    1|
|1940|         Action|    1|
|1939|        Western|    3|
|1987|         Comedy|   13|
|1982|Science Fiction|    1|
|1951|            War|    1|
|1965|          Music|    1|
|1939|         Comedy|    1|
|1964|          Music|    1|
|1987|            War|    1|
|1962|         Action|    1|
|1966|        Mystery|    3|
+----+---------------+-----+
only showing top 20 rows



### Question # 14: Sort the movie’s release before 1990 by the title.

In [20]:
data.select('Title').filter(data.Year<1990).sort(data.Title).show()

+--------------------+
|               Title|
+--------------------+
|2001: A Space Ody...|
|             48 Hrs.|
|               8 1/2|
|A Big Hand for th...|
|  A Child Is Waiting|
|A Chorus Line, Th...|
|  A Clockwork Orange|
|A Coeur Joie, (He...|
|   A Cry in the Dark|
|  A Dry White Season|
|      A Fine Madness|
| A Fish Called Wanda|
|A Fistful of Dollars|
|     A Guy Named Joe|
|    A Lesson in Love|
|A Little Night Music|
|     A Man & a Woman|
|A Man & a Woman: ...|
|A Man for All Sea...|
|    A Matter of Time|
+--------------------+
only showing top 20 rows



### Question # 15: Find the movies with long titles. A movie title is considered long if it is greater than 50 alphabets.

In [21]:
from pyspark.sql import functions as f
data.filter(f.length(data.Title)>50).select('Title').show()

+--------------------+
|               Title|
+--------------------+
|Fawlty Towers, Go...|
|Unnamable II, The...|
|Industrial Sympho...|
+--------------------+

