In [None]:
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plots displayed in notebook
%matplotlib inline

In [None]:
# Load the movie metadata CSV (path is relative to the notebook's working
# directory).
movies_df = pd.read_csv('../data/movie.csv')

# Preview the first rows. Ending the cell with a bare expression lets the
# notebook render the frame as a rich table instead of print()'s plain-text
# dump.
movies_df.head()

### Movie Data Overview
The columns appear to be `movieId`, `title` (with the release year appended to the movie name) and `genres` (multiple genres separated by `|`).

In [None]:
# Build a long-format Series with one entry per (movie, genre) pair.
# Each 'genres' value is a '|'-separated string: split() turns it into a list
# and explode() gives every list element its own row. This avoids the wide,
# padding-filled intermediate frame produced by split(expand=True) and does not
# depend on stack() dropping that padding afterwards.
# dropna() discards rows whose 'genres' value is missing.
all_genres = movies_df['genres'].str.split('|').explode().dropna()

# Converting to DataFrame and resetting index (explode() repeats the original
# row label for every genre of a movie, so the old index is not unique).
all_genres_df = all_genres.to_frame(name='genre').reset_index(drop=True)

# Plotting the frequency of each genre, most frequent genre at the top
plt.figure(figsize=(10, 8))
sns.countplot(
    y='genre',
    data=all_genres_df,
    order=all_genres_df['genre'].value_counts().index,
)
plt.title('Movie Genre Distribution')
plt.xlabel('Number of Movies')
plt.ylabel('Genre')
plt.show()

### Genre Distribution Analysis
Processed the `genres` column by splitting each movie's `|`-separated genre string so that every genre of every movie gets its own row.

Then counted these rows per genre to find the frequency distribution of each genre, which is attached below.

![Movie genre frequency distribution](https://i.postimg.cc/63PBGWjR/temp-Image-KAu-Jx-N.avif)

In [None]:
# Load the genome score and genome tag CSVs (paths are relative to the
# notebook's working directory).
genome_scores_df = pd.read_csv('../data/genome_scores.csv')
genome_tags_df = pd.read_csv('../data/genome_tags.csv')

# Preview both tables. display() is provided by the IPython kernel and renders
# each frame as a rich table, unlike print() which emits a plain-text dump.
display(genome_scores_df.head(), genome_tags_df.head())

### Tag Data Overview
Data parameters for `genome_scores` seem to be `movieId`, `tagId` and `relevance`.

Data parameters for `genome_tags` seem to be `tagId` and `tag`.

We'll merge these two dataframes and use 'tagId' as the key value.

In [None]:
# Attach the tag text to every score row by joining the two tables on 'tagId'.
tag_info_df = genome_scores_df.merge(genome_tags_df, on='tagId')

# Count how many score rows carry each tag and keep the 20 largest counts.
# NOTE(review): if every movie is scored against every tag, all counts are
# identical and this "top 20" is arbitrary — confirm against the data.
most_common_tags = tag_info_df['tag'].value_counts().head(20)

# Horizontal bar chart: one bar per tag, bar length = number of occurrences.
plt.figure(figsize=(10, 8))
sns.barplot(x=most_common_tags.values, y=most_common_tags.index).set(
    title='Top 20 Most Common Tags',
    xlabel='Number of Occurrences',
    ylabel='Tags',
)
plt.show()

### Top Frequent Tag Analysis
Plotted the most frequent tags (attached below). This data seems to be skewed by user-generated, event-related tags such as the Oscars.

Since the raw frequencies are not very informative, we will also analyze the relevance scores of the tags to determine whether a tag is of any use at all.

![Top 20 Most Common Tags](https://i.postimg.cc/prw9tYKm/temp-Imagek-Tm4-YG.avif)

In [None]:
# Group the merged table by (movieId, tag) and average 'relevance' within each
# group; the result is a one-column frame indexed by (movieId, tag).
tag_relevance_df = tag_info_df.groupby(['movieId', 'tag'])[['relevance']].mean()

# The 20 highest mean relevance values, i.e. the top (movieId, tag) pairs.
# This Series is not used by the histogram below.
top_tags = tag_relevance_df['relevance'].sort_values(ascending=False).iloc[:20]

# Histogram of the per-row relevance scores taken straight from the merged
# table (not from the aggregated frame above).
plt.figure(figsize=(10, 6))
sns.histplot(tag_info_df['relevance'], bins=30).set(
    title='Distribution of Tag Relevance Scores',
    xlabel='Relevance Score',
    ylabel='Frequency',
)
plt.show()

### Tag Relevancy Scores

The graph (attached below) indicates that most tags have very low relevance scores, so it would make sense to set a relevance threshold to filter out the less relevant tags.

![Plot of Tag Relevance Scores](https://i.postimg.cc/9XZ7j0Ls/temp-Image8g-TJBT.avif)