In [104]:
import pandas as pd

In [105]:
movies = pd.read_csv("movies.dat",
                    sep="::",
                    engine = "python",
                    names = ["MovieID", "Title", "Genres"],
                    encoding ="latin-1")
    

In [106]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [107]:
len(movies)

3883

In [108]:
movies.tail()

Unnamed: 0,MovieID,Title,Genres
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


In [109]:
movies.isnull().any()

MovieID    False
Title      False
Genres     False
dtype: bool

In [110]:
ratings = pd.read_csv("ratings.dat",
                     sep="::",
                     engine = "python",
                     names = ["UserID", "MovieID", "Rating", "Timestamp"])

In [111]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [112]:
ratings.drop("Timestamp", axis="columns")

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [113]:
num_of_ratings = ratings.groupby("MovieID")["Rating"].transform('count')
final_ratings = ratings[num_of_ratings > 5]
print(num_of_ratings)
print(final_ratings)

0          1725
1           525
2           636
3          1315
4          1703
           ... 
1000204     373
1000205    1229
1000206     478
1000207     344
1000208    2269
Name: Rating, Length: 1000209, dtype: int64
         UserID  MovieID  Rating  Timestamp
0             1     1193       5  978300760
1             1      661       3  978302109
2             1      914       3  978301968
3             1     3408       4  978300275
4             1     2355       5  978824291
...         ...      ...     ...        ...
1000204    6040     1091       1  956716541
1000205    6040     1094       5  956704887
1000206    6040      562       5  956704746
1000207    6040     1096       4  956715648
1000208    6040     1097       4  956715569

[999416 rows x 4 columns]


In [114]:
ratings_avg = final_ratings.groupby("MovieID")["Rating"].mean().reset_index()

In [115]:
ratings_avg.head()

Unnamed: 0,MovieID,Rating
0,1,4.146846
1,2,3.201141
2,3,3.016736
3,4,2.729412
4,5,3.006757


In [116]:
len(ratings_avg)

3377

In [117]:
movie_ratings = pd.merge(ratings_avg, movies, on="MovieID", how="left")

In [118]:
len(movie_ratings)

3377

In [119]:
movie_ratings.head()

Unnamed: 0,MovieID,Rating,Title,Genres
0,1,4.146846,Toy Story (1995),Animation|Children's|Comedy
1,2,3.201141,Jumanji (1995),Adventure|Children's|Fantasy
2,3,3.016736,Grumpier Old Men (1995),Comedy|Romance
3,4,2.729412,Waiting to Exhale (1995),Comedy|Drama
4,5,3.006757,Father of the Bride Part II (1995),Comedy


movie_ratings = movie_ratings.sort_values(by="Rating", ascending=False)

In [120]:
movie_ratings = movie_ratings.sort_values(by="Rating", ascending=False)

movie_ratings.head()

In [121]:
movie_ratings.head()

Unnamed: 0,MovieID,Rating,Title,Genres
50,53,4.75,Lamerica (1994),Drama
2113,2503,4.666667,"Apple, The (Sib) (1998)",Drama
2474,2905,4.608696,Sanjuro (1962),Action|Adventure
1668,2019,4.56051,Seven Samurai (The Magnificent Seven) (Shichin...,Action|Drama
293,318,4.554558,"Shawshank Redemption, The (1994)",Drama


In [125]:
def fix_title(title): 
    """will take a title as the input and fix the formatting when theres a comma""" 
    if "," in title: 
        parts = title.rsplit('(', 1) #should split the year off 
        name_part = parts[0].strip() #removes spaces 
        year_part = "(" + parts[1] #seperates the year from the rest of the title 
        if ',' in name_part: #split the name 
            name, article = name_part.rsplit(",", 1) 
            if article.lower() in {"the", "an", "a"}: #reformat the string for the title                      
                return f"{article.strip()} {name.strip()} {year_part.strip()}" 
                
    #if no changes are needed           
    return title

In [126]:
movie_ratings['Title'] = movie_ratings["Title"].apply(fix_title)
movie_ratings.to_csv("/Users/rkturer/Desktop/PersonalCS/movie-recommender/data/movie-ratings.csv")