The MovieLens dataset contains about 1 million ratings of roughly 4,000 movies, collected from roughly 6,000 users. It is spread across three tables: ratings, user information, and movie information. The data was collected from MovieLens users in the late 1990s and early 2000s, and it provides movie ratings, movie metadata (genres and year), and demographic data about the users (age, zip code, gender identification, and occupation).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# users.dat has no header row, so the column labels are supplied here.
usernames = ['user_id', 'gender', 'age', 'occupation', "zip"]
# Fields are delimited by "::"; a multi-character separator needs the
# python parsing engine.
users = pd.read_csv(
    'movielens/users.dat',
    sep="::",
    header=None,
    names=usernames,
    engine='python',
)
print(users.shape)

In [None]:
# ratings.dat has no header row, so the column labels are supplied here.
ratingnames = ["user_id", "movie_id", "rating", "timestamp"]
# Load the "::"-delimited file, then force the rating column to numeric;
# values that cannot be parsed become NaN (errors='coerce').
ratings = pd.read_csv(
    "movielens/ratings.dat",
    sep="::",
    header=None,
    names=ratingnames,
    engine="python",
).assign(rating=lambda frame: pd.to_numeric(frame["rating"], errors='coerce'))
print(ratings.shape)

In [None]:
# movies.dat has no header row, so the column labels are supplied here.
movienames = ['movie_id', "title", "genres"]
# NOTE(review): no encoding is passed, so the reader's default is used. If
# this raises UnicodeDecodeError, confirm the file's encoding and pass
# encoding=... explicitly.
movies = pd.read_csv(
    "movielens/movies.dat",
    sep="::",
    header=None,
    names=movienames,
    engine="python",
)
print(movies.shape)

In [None]:
# Preview both ends of the users table (head/tail default to 5 rows).
print("First 5 rows of users:", users.head(), sep="\n")
print("Last 5 rows of users:", users.tail(), sep="\n")

In [None]:
# Preview both ends of the ratings table (head/tail default to 5 rows).
print("First 5 rows of ratings:", ratings.head(), sep="\n")
print("Last 5 rows of ratings:", ratings.tail(), sep="\n")

In [None]:
# Preview both ends of the movies table (head/tail default to 5 rows).
print("First 5 rows of movies:", movies.head(), sep="\n")
print("Last 5 rows of movies:", movies.tail(), sep="\n")

In [None]:
# Number of missing values in each column of users.
users.isnull().sum()

In [None]:
# Number of missing values in each column of ratings (includes any rating
# that could not be parsed as a number when the table was loaded).
ratings.isnull().sum()

In [None]:
# Number of missing values in each column of movies.
movies.isnull().sum()

In [None]:
# Interpret the raw timestamp column as seconds since the Unix epoch and
# replace it with pandas datetimes.
epoch_seconds = ratings['timestamp']
ratings['timestamp'] = pd.to_datetime(epoch_seconds, unit='s')
print(ratings)

In [None]:
# Headline counts taken from the ratings table alone: distinct raters,
# distinct rated movies, and the number of rating rows.
distinct_users = ratings["user_id"].unique()
distinct_movies = ratings["movie_id"].unique()
print("Total number of users who have logged:", distinct_users.size)
print("Total number of movies logged:", distinct_movies.size)
print("Total number of ratings logged:", ratings.shape[0])

In [None]:
# How many times each rating value was given, ordered by rating value.
rating_counts = ratings['rating'].value_counts().sort_index()

# Line plot of count per rating value.
plt.plot(rating_counts.index, rating_counts.values, marker='o', linestyle='-', color='b')

# Label every point with its count.
for rating_value, n_votes in zip(rating_counts.index, rating_counts.values):
    plt.text(rating_value, n_votes, str(n_votes), ha='left', va='top')

plt.title("Ratings Distribution")
plt.xlabel("Rating")
plt.ylabel("Count")
plt.show()

In [None]:
# Turn each movie's pipe-delimited genre string into a list of genres.
movies['genre_split'] = movies['genres'].str.split('|')

# One row per (movie, genre) pair, then tally how many movies carry each genre.
all_genres = movies['genre_split'].explode()
genre_counts = all_genres.value_counts()

plt.figure(figsize=(10,6))
genre_plot = genre_counts.plot.bar(color='#4A4A4A', edgecolor='black')

# Write each count just above its bar, centred horizontally.
for patch in genre_plot.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    plt.text(bar_centre, bar_top, str(bar_top), ha='center', va='bottom')

# Trace the same counts as a red line over the bars.
plt.plot(genre_counts.index, genre_counts, color='red', marker='o', label='Line')

plt.title('Genre Frequency Count')
plt.xlabel('Genres')
plt.ylabel('Frequency')

# Slanted tick labels stay readable; tight_layout keeps them from being clipped.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Combine the three tables into one frame: ratings joined with users, then
# with movies. No join keys are given, so pandas joins on the column names
# the two frames have in common.
data = ratings.merge(users).merge(movies)
print("First 5 rows of the merged data:", data.head(), sep="\n")
print("Last 5 rows of the merged data:", data.tail(), sep="\n")
print("Shape of the merged data:", data.shape, sep="\n")

In [None]:
# Every title rated by user 1, with the rating given.
print("Log of user with id 1:")
is_user_one = data['user_id'] == 1
data.loc[is_user_one, ['title', 'rating']]

In [None]:
# Summary statistics over every rating row in the merged data.
all_ratings = data["rating"]
print("Median rating:", all_ratings.median())
# Series.mode() returns a Series (several values can tie for most frequent),
# so convert it to a plain list; printing the Series itself would emit its
# index and dtype along with the value.
print("Most frequent rating:", all_ratings.mode().tolist())
print("Deviation of ratings:", all_ratings.std())

In [None]:
sorted_data = data.sort_values("rating", ascending=False)
print("Top 25 movies:")
print(sorted_data[['title','rating','genre_split']].head(25))

In [None]:
print("Bottom 25 movies:")
print(sorted_data[['title','rating','genre_split']].tail(25))

In [None]:
# Keep only the columns needed to compare ratings by gender.
subset = data.loc[:, ['title', 'rating', 'gender']]
subset.head()

In [None]:
# Mean rating per (title, gender), reshaped so each title is a row and each
# gender value is a column.
rating_by_title_gender = subset.groupby(['title', 'gender'])['rating']
mean_ratings = rating_by_title_gender.mean().unstack()
print(mean_ratings)

In [None]:
# Number of rating rows per title, most-rated titles first.
ratings_by_title = data.groupby('title').size().sort_values(ascending=False)
ratings_by_title.head(10)

In [None]:
# Titles that received at least 250 ratings.
has_enough_ratings = ratings_by_title >= 250
active_titles = ratings_by_title.index[has_enough_ratings]
active_titles

In [None]:
# Restrict the per-gender means to the titles in active_titles.
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings

In [None]:
# How many distinct titles have no mean rating in the 'F' column.
lacks_female_mean = mean_ratings['F'].isnull()
titles_without_female_mean = mean_ratings.index[lacks_female_mean].unique()
len(titles_without_female_mean)

In [None]:
# Ten titles with the highest mean rating in the 'F' column.
ranked_by_female = mean_ratings.sort_values(by='F', ascending=False)
ranked_by_female.head(10)

In [None]:
# Ten titles with the highest mean rating in the 'M' column.
ranked_by_male = mean_ratings.sort_values(by='M', ascending=False)
ranked_by_male.head(10)

In [None]:
# Bar chart of the ten titles with the highest mean 'M' rating; every column
# of mean_ratings is drawn as its own bar per title.
mean_ratings_male = mean_ratings.sort_values(by='M', ascending=False).head(10)
ratings_plot_by_male = mean_ratings_male.plot.bar(figsize=(20, 10))
plt.title("Top 10 Movies by average male rating")
plt.xlabel('Movie Title')
plt.ylabel('Average Rating')
plt.xticks(rotation=45, ha='right')

# Print each bar's value (two decimals) just above it, centred on the bar.
for patch in ratings_plot_by_male.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    plt.text(bar_centre, bar_top, f'{bar_top:.2f}', ha='center', va='bottom')

In [None]:
# Bar chart of the ten titles with the highest mean 'F' rating; every column
# of mean_ratings is drawn as its own bar per title.
mean_ratings_female = mean_ratings.sort_values(by='F', ascending=False).head(10)
ratings_plot_by_female = mean_ratings_female.plot.bar(figsize=(20, 10))
plt.title("Top 10 Movies by average female rating")
plt.xlabel('Movie Title')
plt.ylabel('Average Rating')
plt.xticks(rotation=45, ha='right')

# Print each bar's value (two decimals) just above it, centred on the bar.
for patch in ratings_plot_by_female.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    plt.text(bar_centre, bar_top, f'{bar_top:.2f}', ha='center', va='bottom')

In [None]:
# Gap between the two gender means per title: positive when the 'M' mean is
# higher, negative when the 'F' mean is higher. Sorted ascending, so the
# titles with the most negative gap come first.
mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
sorted_diff = mean_ratings.sort_values(by='diff')
print(sorted_diff)

In [None]:
# Reverse the ascending order and take the first ten rows, i.e. the ten rows
# from the far end of sorted_diff.
sorted_diff.iloc[::-1].head(10)

In [None]:
# Bar chart of the ten titles with the largest 'diff' value; every column of
# mean_ratings (including 'diff') is drawn as its own bar per title.
mean_ratings_diff = mean_ratings.sort_values(by='diff', ascending=False).head(10)
ratings_plot_by_diff = mean_ratings_diff.plot.bar(figsize=(20, 10))
plt.title("Top 10 Movies by average difference in rating")
plt.xlabel('Movie Title')
plt.ylabel('Average Rating')
plt.xticks(rotation=45, ha='right')

# Print each bar's value (two decimals) just above it, centred on the bar.
for patch in ratings_plot_by_diff.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    plt.text(bar_centre, bar_top, f'{bar_top:.2f}', ha='center', va='bottom')

Disagreement among users about a movie can be measured by the standard deviation of that movie's ratings: the larger the standard deviation, the more the ratings are spread out.

In [None]:
# Standard deviation of the ratings per title, limited to the titles in
# active_titles and ordered from the widest spread to the narrowest.
rating_std_by_title = (
    data.groupby("title")['rating']
    .std()
    .loc[active_titles]
    .sort_values(ascending=False)
)
rating_std_by_title.head(10)