In [1]:
pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=3b44b7dae0b92cc7bd31c14b4d170b382c7fa8b26ed5ea00e0ba530f4cdfd01f
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create a Spark session
spark = SparkSession.builder.appName("MoviesAnalysis").getOrCreate()

# Load the dataset Movies.csv into a DataFrame
movies_df = spark.read.csv("Movies.csv", header=True)

movies_df.show()

+----+------+--------------------+-------+--------------------+-----------------+--------------------+----------+------+-------------------+
|Year|Length|               Title|  Genre|               Actor|          Actress|            Director|Popularity|Awards|              Image|
+----+------+--------------------+-------+--------------------+-----------------+--------------------+----------+------+-------------------+
|1990|   111|Tie Me Up! Tie Me...| Comedy|     BanderasAntonio|    AbrilVictoria|      AlmodóvarPedro|        68|    No|   NicholasCage.png|
|1991|   113|          High Heels| Comedy|          BoséMiguel|    AbrilVictoria|      AlmodóvarPedro|        68|    No|   NicholasCage.png|
|1983|   104|        Dead ZoneThe| Horror|   WalkenChristopher|      AdamsBrooke|     CronenbergDavid|        79|    No|   NicholasCage.png|
|1979|   122|                Cuba| Action|         ConnerySean|      AdamsBrooke|       LesterRichard|         6|    No|    seanConnery.png|
|1978|    94|

In [4]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("Movie Analysis").getOrCreate()

# Load the Movies.csv file into a DataFrame
movies_df = spark.read.csv("Movies.csv", header=True, inferSchema=True)

# 6. Find the title, year, and director of action films that won an award.
action_award_winners = movies_df.filter((movies_df.Genre == "Action") & (movies_df.Awards!= "No")) \
                               .select("Title", "Year", "Director")
action_award_winners.show()

# 7. For each award-winning actor, find the movies he acted in. Print the names of the movies and the director of the movie.
award_winning_actors = movies_df.filter(movies_df.Actor.contains("Yes")) \
                               .select("Title", "Director", "Actor")
award_winning_actors.show()

# 8. Find the top 10 most popular movies that did not win an award.
most_popular_no_award = movies_df.filter(movies_df.Awards == "No") \
                               .select("Title", "Popularity") \
                               .orderBy("Popularity", ascending=False) \
                               .limit(10)
most_popular_no_award.show()

# 9. Find the 10 least popular movies that were released before 1980.
least_popular_before_1980 = movies_df.filter(movies_df.Year < 1980) \
                                   .select("Title", "Popularity") \
                                   .orderBy("Popularity", ascending=True) \
                                   .limit(10)
least_popular_before_1980.show()

# 10. Sort the movie's release before 1990 by the title.
movies_before_1990 = movies_df.filter(movies_df.Year < 1990) \
                             .select("Title", "Year") \
                             .orderBy("Title", ascending=True)
movies_before_1990.show()

+-----+----+--------+
|Title|Year|Director|
+-----+----+--------+
+-----+----+--------+

+-----+--------+-----+
|Title|Director|Actor|
+-----+--------+-----+
+-----+--------+-----+

+--------------------+----------+
|               Title|Popularity|
+--------------------+----------+
|        Five Corners|        88|
|Ballad of Narayam...|        88|
|         Let It Ride|        88|
|        Final Notice|        88|
|      New Year's Day|        88|
| Guilty by Suspicion|        88|
|   Fellini Satyricon|        88|
|           Raw Nerve|        88|
|     Time MachineThe|        88|
| Long Voyage HomeThe|        88|
+--------------------+----------+

+------------------+----------+
|             Title|Popularity|
+------------------+----------+
|   White Lightning|      NULL|
|      Drop KickThe|      NULL|
|      Desert Rider|      NULL|
| Bank on the Stars|      NULL|
|           Shalako|         0|
|           Airport|         0|
|     Anna Christie|         0|
|Shout at the Devil| 

In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.ml.stat import Correlation

# Create a SparkSession
spark = SparkSession.builder.appName("Wine Analysis").getOrCreate()

# Load the wine.csv file into a DataFrame
wine_df = spark.read.csv("wine.csv", header=True, inferSchema=True)

# 11. Explore and preprocess the wine data set
# Check the first 10 rows of the DataFrame
wine_df.show(10)

# Check the summary statistics of the DataFrame
wine_df.describe().show()

# Check for missing values
wine_df.show()

# Drop rows with missing values
wine_df_no_na = wine_df.na.drop()

# Check for outliers or noise in the data
wine_df_no_na.createOrReplaceTempView("wine_table")
spark.sql("SELECT * FROM wine_table WHERE alcohol > 17 OR alcohol < 9").show()







+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+
|Alcohol|Malic_Acid| Ash|Ash_Alcanity|Magnesium|Total_Phenols|Flavanoids|Nonflavanoid_Phenols|Proanthocyanins|Color_Intensity| Hue|OD280|Proline|
+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+
|  14.23|      1.71|2.43|        15.6|      127|          2.8|      3.06|                0.28|           2.29|           5.64|1.04| 3.92|   1065|
|   13.2|      1.78|2.14|        11.2|      100|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|  3.4|   1050|
|  13.16|      2.36|2.67|        18.6|      101|          2.8|      3.24|                 0.3|           2.81|           5.68|1.03| 3.17|   1185|
|  14.37|      1.95| 2.5|        16.8|      113|         3.85|      3.49|                0.24|           2.18|            7.

In [6]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

# Assuming you have a DataFrame named 'wine_df_no_na'
# Create a vector assembler for the selected features
vector_assembler = VectorAssembler(inputCols=["Alcohol", "Malic_Acid"], outputCol="features")
wine_df_assembled = vector_assembler.transform(wine_df_no_na)

# Compute the correlation between Alcohol and Malic_Acid
corr_matrix = Correlation.corr(wine_df_assembled, "features", "pearson").collect()[0][0]

# Extract the correlation value
correlation_alcohol_malic_acid = corr_matrix[1, 0]

print("Correlation between Alcohol and Malic_Acid:", correlation_alcohol_malic_acid)


Correlation between Alcohol and Malic_Acid: 0.09439694091041397


In [8]:
# Preprocess the data by assembling features into a vector column
assembler = VectorAssembler(inputCols=["Alcohol", "Malic_Acid", "Ash",
                                       "Magnesium", "Total_Phenols", "Flavanoids", "Nonflavanoid_Phenols",
                                       "Proanthocyanins", "Color_Intensity", "Hue", "OD280",
                                       "Proline"], outputCol="features")
wine_vector_df = assembler.transform(wine_df_no_na)

In [9]:
# K-means clustering
kmeans = KMeans(k=3, featuresCol="features", predictionCol="cluster")
model = kmeans.fit(wine_vector_df)
wine_kmeans_df = model.transform(wine_vector_df)
wine_kmeans_df.show()


+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+--------------------+-------+
|Alcohol|Malic_Acid| Ash|Ash_Alcanity|Magnesium|Total_Phenols|Flavanoids|Nonflavanoid_Phenols|Proanthocyanins|Color_Intensity| Hue|OD280|Proline|            features|cluster|
+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+--------------------+-------+
|  14.23|      1.71|2.43|        15.6|      127|          2.8|      3.06|                0.28|           2.29|           5.64|1.04| 3.92|   1065|[14.23,1.71,2.43,...|      2|
|   13.2|      1.78|2.14|        11.2|      100|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|  3.4|   1050|[13.2,1.78,2.14,1...|      2|
|  13.16|      2.36|2.67|        18.6|      101|          2.8|      3.24|                 0.3|           2.81|           5.68

In [10]:
# Bisecting K-means clustering
bisectingKmeans = BisectingKMeans(k=3, featuresCol="features", predictionCol="cluster")
model_bisecting = bisectingKmeans.fit(wine_vector_df)
wine_bisecting_df = model_bisecting.transform(wine_vector_df)
wine_bisecting_df.show()

+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+--------------------+-------+
|Alcohol|Malic_Acid| Ash|Ash_Alcanity|Magnesium|Total_Phenols|Flavanoids|Nonflavanoid_Phenols|Proanthocyanins|Color_Intensity| Hue|OD280|Proline|            features|cluster|
+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+--------------------+-------+
|  14.23|      1.71|2.43|        15.6|      127|          2.8|      3.06|                0.28|           2.29|           5.64|1.04| 3.92|   1065|[14.23,1.71,2.43,...|      2|
|   13.2|      1.78|2.14|        11.2|      100|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|  3.4|   1050|[13.2,1.78,2.14,1...|      2|
|  13.16|      2.36|2.67|        18.6|      101|          2.8|      3.24|                 0.3|           2.81|           5.68