In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os

In [34]:
# Define the analysis directory globally
analysis_dir = 'analysis_plots'
os.makedirs(analysis_dir, exist_ok=True)

In [35]:
# Load the data
df = pd.read_csv('processed_movies.csv')

In [36]:
# Display basic information about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24402 entries, 0 to 24401
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movie title   24402 non-null  object
 1   User Rating   24402 non-null  object
 2   Generes       24402 non-null  object
 3   Overview      24402 non-null  object
 4   Plot Kyeword  24402 non-null  object
 5   Director      24402 non-null  object
 6   Top 5 Casts   24402 non-null  object
 7   Writer        24402 non-null  object
 8   year          24402 non-null  object
 9   path          24402 non-null  object
 10  Rating_bin    24402 non-null  object
dtypes: object(11)
memory usage: 2.0+ MB
None


In [37]:
# Display the first few rows
df.head()

Unnamed: 0,movie title,User Rating,Generes,Overview,Plot Kyeword,Director,Top 5 Casts,Writer,year,path,Rating_bin
0,Top Gun: Maverick,['86quantilevalue'],"['Action', 'Drama']","['After', 'more', 'than', 'thirty', 'years', '...","['fighterjet', 'sequel', 'u.s.navy', 'fightera...",['JosephKosinski'],"['JackEppsJr.', 'PeterCraig', 'TomCruise', 'Je...",['JimCash'],['2022'],/title/tt1745960/,"['8.3333', '8.6667']"
1,Jurassic World Dominion,['78quantilevalue'],"['Action', 'Adventure', 'Sci-Fi']","['Four', 'years', 'after', 'the', 'destruction...","['dinosaur', 'jurassicpark', 'tyrannosaurusrex...",['ColinTrevorrow'],"['ColinTrevorrow', 'DerekConnolly', 'ChrisPrat...",['EmilyCarmichael'],['2022'],/title/tt8041270/,"['5.6667', '6.0000']"
2,Top Gun,['89quantilevalue'],"['Action', 'Drama']","['As', 'students', 'at', 'the', 'United', 'Sta...","['pilot', 'malecamaraderie', 'u.s.navy', 'grum...",['TonyScott'],"['JackEppsJr.', 'EhudYonay', 'TomCruise', 'Tim...",['JimCash'],['1986'],/title/tt0092099/,"['6.6667', '7.0000']"
3,Lightyear,['73quantilevalue'],"['Animation', 'Action', 'Adventure']","['While', 'spending', 'years', 'attempting', '...","['galaxy', 'spaceship', 'robot', 'rocket', 'sp...",['AngusMacLane'],"['JasonHeadley', 'MatthewAldrich', 'ChrisEvans...",['AngusMacLane'],['2022'],/title/tt10298810/,"['5.0000', '5.3333']"
4,Spiderhead,['69quantilevalue'],"['Action', 'Crime', 'Drama']","['In', 'the', 'near', 'future,', 'convicts', '...","['discover', 'medical', 'test', 'reality', 'fi...",['JosephKosinski'],"['RhettReese', 'PaulWernick', 'ChrisHemsworth'...",['GeorgeSaunders'],['2022'],/title/tt9783600/,"['5.3333', '5.6667']"


In [38]:
# Clean and preprocess the data
def parse_list(x):
    try:
        return ast.literal_eval(x)
    except:
        return []

def parse_user_rating(x):
    try:
        rating = ast.literal_eval(x)[0]
        return int(rating.replace('quantilevalue', ''))
    except:
        return None

In [39]:
df['Generes'] = df['Generes'].apply(parse_list)
df['Overview'] = df['Overview'].apply(parse_list)
df['Plot Kyeword'] = df['Plot Kyeword'].apply(parse_list)
df['Top 5 Casts'] = df['Top 5 Casts'].apply(parse_list)
df['User Rating'] = df['User Rating'].apply(parse_user_rating)
df['year'] = df['year'].apply(lambda x: int(parse_list(x)[0]) if parse_list(x) else None)


In [40]:
# Remove rows with None values
df = df.dropna()

In [41]:
# Analyze genre distribution
genres = [genre for genres in df['Generes'] for genre in genres]
genre_counts = pd.Series(genres).value_counts()

plt.figure(figsize=(12, 6))
sns.barplot(x=genre_counts.index, y=genre_counts.values)
plt.title('Distribution of Movie Genres')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(analysis_dir, 'genre_distribution.png'))
plt.close()

In [42]:
# Analyze year distribution
plt.figure(figsize=(12, 6))
df['year'].value_counts().sort_index().plot(kind='bar')
plt.title('Distribution of Movies by Year')
plt.xlabel('Year')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig(os.path.join(analysis_dir, 'year_distribution.png'))
plt.close()

In [43]:
# Analyze user ratings
plt.figure(figsize=(12, 6))
sns.histplot(df['User Rating'], bins=20, kde=True)
plt.title('Distribution of User Ratings')
plt.xlabel('User Rating (Quantile)')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig(os.path.join(analysis_dir, 'user_rating_distribution.png'))
plt.close()

In [44]:
# Analyze relationship between user rating and year
plt.figure(figsize=(12, 6))
sns.scatterplot(x='year', y='User Rating', data=df)
plt.title('User Ratings vs. Release Year')
plt.xlabel('Release Year')
plt.ylabel('User Rating (Quantile)')
plt.tight_layout()
plt.savefig(os.path.join(analysis_dir, 'user_rating_vs_year.png'))
plt.close()

In [45]:
# Analyze top cast members
top_cast = [cast for casts in df['Top 5 Casts'] for cast in casts]
top_cast_counts = pd.Series(top_cast).value_counts().head(10)

In [46]:
plt.figure(figsize=(12, 6))
sns.barplot(x=top_cast_counts.index, y=top_cast_counts.values)
plt.title('Top 10 Cast Members')
plt.xlabel('Cast Member')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(os.path.join(analysis_dir, 'top_cast_members.png'))
plt.close()

In [47]:
# Analyze correlation between features
numeric_df = df[['User Rating', 'year']]
correlation = numeric_df.corr()

In [48]:
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.savefig(os.path.join(analysis_dir, 'correlation_heatmap.png'))
plt.close()

In [49]:
# Print summary statistics
print(df.describe())

        User Rating          year
count  23433.000000  23433.000000
mean      43.308198   1999.212820
std       26.916357     22.133057
min        0.000000   1906.000000
25%       20.000000   1987.000000
50%       43.000000   2007.000000
75%       66.000000   2017.000000
max       90.000000   2028.000000


In [50]:
# Additional insights
print("Top 5 movies by User Rating:")
print(df.nlargest(5, 'User Rating')[['movie title', 'User Rating', 'year']])

print("\nMost common plot keywords:")
plot_keywords = [keyword for keywords in df['Plot Kyeword'] for keyword in keywords]
print(pd.Series(plot_keywords).value_counts().head())

print("\nAverage User Rating by Genre:")
genre_ratings = df.explode('Generes').groupby('Generes')['User Rating'].mean().sort_values(ascending=False)
print(genre_ratings)

# Save text-based insights to a file
with open(os.path.join(analysis_dir, 'text_insights.txt'), 'w') as f:
    f.write("Top 5 movies by User Rating:\n")
    f.write(df.nlargest(5, 'User Rating')[['movie title', 'User Rating', 'year']].to_string())
    f.write("\n\nMost common plot keywords:\n")
    f.write(pd.Series(plot_keywords).value_counts().head().to_string())
    f.write("\n\nAverage User Rating by Genre:\n")
    f.write(genre_ratings.to_string())

print(f"Analysis complete. All plots have been saved in the '{analysis_dir}' directory.")

Top 5 movies by User Rating:
                movie title  User Rating    year
13            Jurassic Park           90  1993.0
16               The Batman           90  2022.0
19           Jurassic World           90  2015.0
24  Spider-Man: No Way Home           90  2021.0
34                     Dune           90  1984.0

Most common plot keywords:
murder              1190
femalenudity        1152
psychotronicfilm     964
revenge              645
sexscene             635
Name: count, dtype: int64

Average User Rating by Genre:
Generes
Biography     57.129132
Reality-TV    56.000000
Music         54.930180
Sport         52.641399
Comedy        49.814220
Romance       49.801920
Fantasy       49.118182
Mystery       48.971513
History       46.882265
Musical       45.451104
Drama         45.264452
Animation     45.241379
War           45.205882
Horror        44.183153
Sci-Fi        43.697911
Family        42.561922
Adventure     42.530780
Thriller      41.265384
Crime         41.029067
Fil