In [None]:
#Reading Data
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean

# Obtain the Spark session used by every later cell (getOrCreate reuses an
# already-active session instead of starting a second one).
spark = SparkSession.builder.appName('your_app_name').getOrCreate()

# All three inputs are CSV files read with the same options: the first row is
# treated as the header and column types are inferred from the data.
imdb_df = spark.read.options(header=True, inferSchema=True).csv('IMDB_movies_archive1.csv')
merged_data_df = spark.read.options(header=True, inferSchema=True).csv('merged_data.csv')
netflix_df = spark.read.options(header=True, inferSchema=True).csv('Netflix_movies_archive2.csv')

In [None]:
# Give every dataset a common 'title' column so they can be joined on it later.
netflix_df = netflix_df.withColumnRenamed('MovieTitle', 'title')

# The IMDB frame also has its vote column renamed to the plain 'avg_vote'.
imdb_df = (
    imdb_df
    .withColumnRenamed('title_x', 'title')
    .withColumnRenamed('avg_vote_x', 'avg_vote')
)

merged_data_df = merged_data_df.withColumnRenamed('movie', 'title')

In [None]:
# Halve the IMDB and merged-data scores.
# NOTE(review): presumably this maps a 10-point scale onto the 5-point scale
# used by the Netflix ratings -- confirm against the source data.
# Caution: re-running this cell halves the values again.
imdb_df = imdb_df.withColumn(
    'avg_vote',
    col('avg_vote') / 2,
)
merged_data_df = merged_data_df.withColumn(
    'rating',
    col('rating') / 2,
)

In [None]:
# Collapse the Netflix data to one row per title, holding the mean of that
# title's 'Rating' values in 'avg_rating_netflix'.
netflix_avg_rating = (
    netflix_df
    .groupBy('title')
    .agg(mean('Rating').alias('avg_rating_netflix'))
)


In [None]:
# Full outer joins on 'title': a title present in any one of the three
# datasets is kept, with nulls for the columns of the datasets lacking it.
combined_df = (
    netflix_avg_rating
    .join(imdb_df, 'title', 'outer')
    .join(merged_data_df, 'title', 'outer')
)

In [None]:
# Handling missing values for numeric columns: replace nulls in each rating
# column with that column's mean.
numeric_columns = ['avg_rating_netflix', 'avg_vote', 'rating']

# Compute every column mean in a single aggregation (one Spark job) instead of
# one collect() per column. A global aggregate always yields exactly one row.
column_means = combined_df.agg(
    *[mean(name).alias(name) for name in numeric_columns]
).first()

# mean() is null when a column has no non-null values; na.fill() rejects None,
# so such columns are left out and simply stay null.
fill_values = {
    name: column_means[name]
    for name in numeric_columns
    if column_means[name] is not None
}

if fill_values:
    # The dict form fills each listed column with its own replacement value.
    combined_df = combined_df.na.fill(fill_values)


In [None]:
# Overall score per row: the sum of the three rating columns divided by the
# number of entries in numeric_columns (defined in the missing-values cell).
rating_total = col('avg_rating_netflix') + col('avg_vote') + col('rating')
combined_df = combined_df.withColumn(
    'average_rating',
    rating_total / len(numeric_columns),
)


In [None]:
# Keep a single row for each distinct ('title', 'imdb_title_id') pair.
dedup_keys = ['title', 'imdb_title_id']
combined_df = combined_df.dropDuplicates(dedup_keys)


In [None]:
# Saving the final dataset.
# Spark writes a *directory* with this name containing one or more part files,
# not a single CSV file.
cleaned_merged_dataset_path = 'cleaned_merged_dataset_optimized_finale.csv'

# mode='overwrite' replaces the output of a previous run; the default mode
# raises an error when the path already exists, which breaks re-running the
# notebook from a fresh kernel.
combined_df.write.csv(cleaned_merged_dataset_path, header=True, mode='overwrite')

print("Cleaned and merged dataset saved to:", cleaned_merged_dataset_path)