In [51]:
#libraries and packages
import numpy as np
import numpy.ma as ma
import pandas as pd
import csv

In [52]:
ratings = pd.read_csv('../../Dataset/ml-latest-small/ratings.csv')
movies = pd.read_csv('../../Dataset/ml-latest-small/movies.csv')


#we are going to extract first 5 users for debugging causes
#the first 561 rows are the first 5 users
#code must be written flexible to change the data into the 25m dataset
#first we have to merge all the data movieid, userid, rating, year, genres, etc. so everything is all together

#goal is to:
#1- sort the data descending according to rating counts by their genres 
#2- divide each genre's score by genretotalcount and get a float number
#3- reverse all the float number scores and this score is each genre's score
#5- say if movie has 2 genres, drama and romance. sum romance and drama score and multiply by total count. this is how much 
#this movie should be repeated

#we need three files:
#1- item data: movieid | year | average rating(movie's) | 14 genres
#2- user data: userid | rating count | rating average(user's) | 14 genres
#3- true ratings of movies (y)

#to get these files it is easier to work with a dataset that has all the columns


In [53]:
#first we have to find how much rating each genre has
#movies = movieid | title("movie name" ("movie year")) | genres(seperated by "|") we have to one hot encode genres and seperate the year
#from the title

In [54]:
#checking for how many movies there are
print(ratings['movieId'].nunique())
print(movies['movieId'].nunique())

9724
9742


In [55]:
##dropping first 5 users ratings from ratings dataset for debugging purposes
#first_5_users_df = ratings.head(560)
#ratings = ratings.drop(ratings.index[:560])


In [56]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [57]:
#we have to one hot encode genres
genres = movies['genres'].str.get_dummies('|')
#dropping some columns
genres.drop(columns=['(no genres listed)', 'Film-Noir', 'IMAX', 'Musical', 'Western', 'War'] ,axis=1, inplace=True)

In [58]:
#merging one hot encoded genres with movies dataframe
movies = pd.concat([movies, genres], axis=1)
movies.drop(columns='genres', axis=1, inplace=True)
movies

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,1,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,1,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,1,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),1,0,1,0,1,0,0,0,1,0,0,0,0,0
9738,193583,No Game No Life: Zero (2017),0,0,1,0,1,0,0,0,1,0,0,0,0,0
9739,193585,Flint (2017),0,0,0,0,0,0,0,1,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),1,0,1,0,0,0,0,0,0,0,0,0,0,0


In [59]:
#extarct year from its title
pattern = '\((\d{4})\)'
movies['year'] = movies.title.str.extract(pattern, expand=False)


In [60]:
#finding movieid's has null values
null_ids = movies[movies['year'].isnull()]['movieId']
null_ids = pd.DataFrame(null_ids)
#dataframe  has null movieids
null_ids
null_id_array = null_ids['movieId'].values
#array has null movieids
null_id_array

array([ 40697, 140956, 143410, 147250, 149334, 156605, 162414, 167570,
       171495, 171631, 171749, 171891, 176601], dtype=int64)

In [61]:
#we got the years of the movies manually from internet and we need to merge it with movies dataset
years_of_null_movies = ["1994", "2018", "2015", "1979", "2016", "2016", "2016", "2016", "2019", "2017", "2006", "2017", "2011"]
null_ids['year'] = years_of_null_movies

null_ids

Unnamed: 0,movieId,year
6059,40697,1994
9031,140956,2018
9091,143410,2015
9138,147250,1979
9179,149334,2016
9259,156605,2016
9367,162414,2016
9448,167570,2016
9514,171495,2019
9515,171631,2017


In [62]:
#filled movies df's year columns with the values that we found manually
movies['year'] = movies['year'].fillna(null_ids['year'])

In [63]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [64]:
#gerelerı ratings dfle birleştir, groupbylı bi şeylerle topla
#genre countları bukl

In [65]:
ratings.drop(columns='timestamp', inplace=True, axis=1)

In [66]:
#reindex year column
movies = movies.reindex(columns=['movieId', 'title', 'year', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Horror',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller'])
movies

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,1,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
1,2,Jumanji (1995),1995,0,1,0,1,0,0,0,0,1,0,0,0,0,0
2,3,Grumpier Old Men (1995),1995,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3,4,Waiting to Exhale (1995),1995,0,0,0,0,1,0,0,1,0,0,0,1,0,0
4,5,Father of the Bride Part II (1995),1995,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),2017,1,0,1,0,1,0,0,0,1,0,0,0,0,0
9738,193583,No Game No Life: Zero (2017),2017,0,0,1,0,1,0,0,0,1,0,0,0,0,0
9739,193585,Flint (2017),2017,0,0,0,0,0,0,0,1,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),2018,1,0,1,0,0,0,0,0,0,0,0,0,0,0


In [67]:
#merge rating df with their genres
merged_ratings = pd.merge(ratings, movies, on='movieId', how='outer')
merged_ratings

Unnamed: 0,userId,movieId,rating,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,1.0,1,4.0,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
1,5.0,1,4.0,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
2,7.0,1,4.5,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
3,15.0,1,2.5,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
4,17.0,1,4.5,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100849,,30892,,In the Realms of the Unreal (2004),2004,0,0,1,0,0,0,1,0,0,0,0,0,0,0
100850,,32160,,Twentieth Century (1934),1934,0,0,0,0,1,0,0,0,0,0,0,0,0,0
100851,,32371,,Call Northside 777 (1948),1948,0,0,0,0,0,1,0,1,0,0,0,0,0,0
100852,,34482,,"Browning Version, The (1951)",1951,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [68]:
#drop unrated movies
merged_ratings = merged_ratings.drop(labels=range(100836, 100854), axis=0)
merged_ratings

Unnamed: 0,userId,movieId,rating,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,1.0,1,4.0,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
1,5.0,1,4.0,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
2,7.0,1,4.5,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
3,15.0,1,2.5,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
4,17.0,1,4.5,Toy Story (1995),1995,0,1,1,1,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610.0,160341,2.5,Bloodmoon (1997),1997,1,0,0,0,0,0,0,0,0,0,0,0,0,1
100832,610.0,160527,4.5,Sympathy for the Underdog (1971),1971,1,0,0,0,0,1,0,1,0,0,0,0,0,0
100833,610.0,160836,3.0,Hazard (2005),2005,1,0,0,0,0,0,0,1,0,0,0,0,0,1
100834,610.0,163937,3.5,Blair Witch (2016),2016,0,0,0,0,0,0,0,0,0,1,0,0,0,1


In [69]:
#find each genre's count
genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller']

genres_dict = {}

for i in genres:
    genres_dict[i] = merged_ratings[i].sum()

genres_dict = {k: v for k, v in sorted(genres_dict.items(), key=lambda item: item[1])}
genres_dict

{'Documentary': 1219,
 'Animation': 6988,
 'Horror': 7291,
 'Mystery': 7674,
 'Children': 9208,
 'Fantasy': 11834,
 'Crime': 16681,
 'Sci-Fi': 17243,
 'Romance': 18124,
 'Adventure': 24161,
 'Thriller': 26452,
 'Action': 30635,
 'Comedy': 39053,
 'Drama': 41928}

In [70]:
#count of each genres repeated is 258491 (genretotalcount)
#rating count is 100837

genre_scores = {}

for i in genres_dict.keys():
    genre_scores[i] = genres_dict[i] / 258491

genre_scores = {k: v for k, v in sorted(genre_scores.items(), key=lambda item: item[1], reverse=True)}
genre_scores


{'Drama': 0.16220293936732808,
 'Comedy': 0.15108069526598605,
 'Action': 0.11851476453725662,
 'Thriller': 0.10233238294563447,
 'Adventure': 0.09346940512435636,
 'Romance': 0.0701146268148601,
 'Sci-Fi': 0.0667063843615445,
 'Crime': 0.06453222742764739,
 'Fantasy': 0.045781091024445726,
 'Children': 0.03562212997744602,
 'Mystery': 0.029687687385634317,
 'Horror': 0.028206011041003363,
 'Animation': 0.027033823227888013,
 'Documentary': 0.0047158314989690165}

In [71]:
sorted_values = sorted(genre_scores.values())

new_dict = {k: v for k, v in zip(genre_scores.keys(), sorted_values)}

new_dict

{'Drama': 0.0047158314989690165,
 'Comedy': 0.027033823227888013,
 'Action': 0.028206011041003363,
 'Thriller': 0.029687687385634317,
 'Adventure': 0.03562212997744602,
 'Romance': 0.045781091024445726,
 'Sci-Fi': 0.06453222742764739,
 'Crime': 0.0667063843615445,
 'Fantasy': 0.0701146268148601,
 'Children': 0.09346940512435636,
 'Mystery': 0.10233238294563447,
 'Horror': 0.11851476453725662,
 'Animation': 0.15108069526598605,
 'Documentary': 0.16220293936732808}

In [72]:
genre_score_df = pd.DataFrame.from_dict(new_dict, orient='index')
#genre_score_df_new = genre_score_df.reset_index()
genre_score_df

Unnamed: 0,0
Drama,0.004716
Comedy,0.027034
Action,0.028206
Thriller,0.029688
Adventure,0.035622
Romance,0.045781
Sci-Fi,0.064532
Crime,0.066706
Fantasy,0.070115
Children,0.093469


In [73]:
genre_score_df = genre_score_df.transpose()

In [74]:
#assign each genres score to the one hot encoded genres df

for i in genre_score_df.columns:
    movies[i].replace(1, genre_score_df[i][0], inplace=True)

movies

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,1,Toy Story (1995),1995,0.000000,0.035622,0.151081,0.093469,0.027034,0.0,0.0,0.000000,0.070115,0.0,0.0,0.000000,0.0,0.0
1,2,Jumanji (1995),1995,0.000000,0.035622,0.000000,0.093469,0.000000,0.0,0.0,0.000000,0.070115,0.0,0.0,0.000000,0.0,0.0
2,3,Grumpier Old Men (1995),1995,0.000000,0.000000,0.000000,0.000000,0.027034,0.0,0.0,0.000000,0.000000,0.0,0.0,0.045781,0.0,0.0
3,4,Waiting to Exhale (1995),1995,0.000000,0.000000,0.000000,0.000000,0.027034,0.0,0.0,0.004716,0.000000,0.0,0.0,0.045781,0.0,0.0
4,5,Father of the Bride Part II (1995),1995,0.000000,0.000000,0.000000,0.000000,0.027034,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),2017,0.028206,0.000000,0.151081,0.000000,0.027034,0.0,0.0,0.000000,0.070115,0.0,0.0,0.000000,0.0,0.0
9738,193583,No Game No Life: Zero (2017),2017,0.000000,0.000000,0.151081,0.000000,0.027034,0.0,0.0,0.000000,0.070115,0.0,0.0,0.000000,0.0,0.0
9739,193585,Flint (2017),2017,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.004716,0.000000,0.0,0.0,0.000000,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),2018,0.028206,0.000000,0.151081,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0


In [75]:
#now we have to sum all the scores each movie has
movies['movie_score'] = movies.iloc[:,3:].sum(axis=1)


In [76]:

movies.sort_values(by=['movie_score']).head(85)#81 tane film gidicek bb movie score = 0
#sadece sildiğimiz genrelara sahip olan filmlere ne yapıcaz?
#tekrar sayısı çok yüksek?????
    #genre sayısına bölebiliriz
    #1000' gibi bi şeye bölebiliriz
    #başka bi şeyler?

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller,movie_score
700,918,Meet Me in St. Louis (1944),1944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3828,5372,Calamity Jane (1953),1953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
8782,129250,Superfast! (2015),2015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
178,210,Wild Bill (1995),1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4633,6911,"Jolson Story, The (1946)",1946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,416,Bad Girls (1994),1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4862,7265,"Dreamers, The (2003)",2003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004716,0.0,0.0,0.0,0.0,0.0,0.0,0.004716
4860,7263,Miracle (2004),2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004716,0.0,0.0,0.0,0.0,0.0,0.0,0.004716
1278,1695,Artemisia (1997),1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004716,0.0,0.0,0.0,0.0,0.0,0.0,0.004716


In [77]:
movies = movies.sort_values(by='movie_score')[81:]
movies.sort_values(by='movieId', inplace=True)

In [78]:
ratings.sort_values(by='movieId', inplace=True)

In [79]:
movie_rating_count = ratings[['movieId','rating']].groupby('movieId').count()
movie_rating_mean = ratings[['movieId','rating']].groupby('movieId').mean()

In [80]:
movie_analysis = movie_rating_count.merge(movie_rating_mean, on='movieId', how='inner')
movie_analysis.rename(columns={'rating_x':'rating_count'}, inplace=True)
movie_analysis.rename(columns={'rating_y':'avg_rating'}, inplace=True)
movie_analysis


Unnamed: 0_level_0,rating_count,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,215,3.920930
2,110,3.431818
3,52,3.259615
4,7,2.357143
5,49,3.071429
...,...,...
193581,1,4.000000
193583,1,3.500000
193585,1,3.500000
193587,1,3.500000


In [81]:
movie_analysis = movie_analysis[movie_analysis['rating_count'] > 30]
movie_analysis = movie_analysis.reset_index()
movie_analysis

Unnamed: 0,movieId,rating_count,avg_rating
0,1,215,3.920930
1,2,110,3.431818
2,3,52,3.259615
3,5,49,3.071429
4,6,102,3.946078
...,...,...,...
855,122904,54,3.833333
856,134130,48,4.000000
857,134853,43,3.813953
858,139385,31,3.903226


In [82]:
#merge movies with movie_analysis
movie_analysis = movie_analysis.merge(movies,on='movieId')

In [83]:
#movie score filmin içerdiği genre sayısına bölerek fazla genre sayısına sahip filmlerin skorunu dengeledik
counter = 0

for index, row in movie_analysis.iterrows():
    for i in range(5,19):
        if row[i] > 0:
            counter +=1
    movie_analysis['movie_score'][index] = row[19] / counter
    counter = 0
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_analysis['movie_score'][index] = row[19] / counter
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_analysis['movie_score'][index] = row[19] / counter
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_analysis['movie_score'][index] = row[19] / counter
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [84]:
movie_analysis = movie_analysis.assign(repeatCount=lambda x: (x.movie_score  * 57358 / 500) + x.rating_count)
movie_analysis = movie_analysis.astype({"repeatCount": int})
movie_analysis


Unnamed: 0,movieId,rating_count,avg_rating,title,year,Action,Adventure,Animation,Children,Comedy,...,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller,movie_score,repeatCount
0,1,215,3.920930,Toy Story (1995),1995,0.000000,0.035622,0.151081,0.093469,0.027034,...,0.0,0.000000,0.070115,0.0,0.0,0.000000,0.000000,0.000000,0.075464,223
1,2,110,3.431818,Jumanji (1995),1995,0.000000,0.035622,0.000000,0.093469,0.000000,...,0.0,0.000000,0.070115,0.0,0.0,0.000000,0.000000,0.000000,0.066402,117
2,3,52,3.259615,Grumpier Old Men (1995),1995,0.000000,0.000000,0.000000,0.000000,0.027034,...,0.0,0.000000,0.000000,0.0,0.0,0.045781,0.000000,0.000000,0.036407,56
3,5,49,3.071429,Father of the Bride Part II (1995),1995,0.000000,0.000000,0.000000,0.000000,0.027034,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.027034,52
4,6,102,3.946078,Heat (1995),1995,0.028206,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.029688,0.041533,106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,122904,54,3.833333,Deadpool (2016),2016,0.028206,0.035622,0.000000,0.000000,0.027034,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.064532,0.000000,0.038849,58
856,134130,48,4.000000,The Martian (2015),2015,0.000000,0.035622,0.000000,0.000000,0.000000,...,0.0,0.004716,0.000000,0.0,0.0,0.000000,0.064532,0.000000,0.034957,52
857,134853,43,3.813953,Inside Out (2015),2015,0.000000,0.035622,0.151081,0.093469,0.027034,...,0.0,0.004716,0.070115,0.0,0.0,0.000000,0.000000,0.000000,0.063673,50
858,139385,31,3.903226,The Revenant (2015),2015,0.000000,0.035622,0.000000,0.000000,0.000000,...,0.0,0.004716,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.020169,33


In [86]:
merge_for_rating = movie_analysis[['movieId','repeatCount']]

In [87]:
merge_for_rating

Unnamed: 0,movieId,repeatCount
0,1,223
1,2,117
2,3,56
3,5,52
4,6,106
...,...,...
855,122904,58
856,134130,52
857,134853,50
858,139385,33


In [88]:
ratings = ratings.merge(merge_for_rating, on='movieId')
ratings

Unnamed: 0,userId,movieId,rating,repeatCount
0,1,1,4.0,223
1,517,1,4.0,223
2,213,1,3.5,223
3,514,1,4.0,223
4,214,1,3.0,223
...,...,...,...,...
57353,525,152081,3.0,39
57354,21,152081,4.0,39
57355,114,152081,3.5,39
57356,596,152081,4.0,39


In [89]:
ratings.sort_values(by='userId')

Unnamed: 0,userId,movieId,rating,repeatCount
0,1,1,4.0,223
28887,1,2018,5.0,47
18635,1,1136,5.0,141
8523,1,362,5.0,40
29070,1,2028,4.0,189
...,...,...,...,...
16408,610,912,3.5,102
39747,610,3977,2.0,75
18349,610,1097,4.5,128
53730,610,60684,3.5,48


In [47]:
user_rating_count = ratings[['userId','rating']].groupby('userId').count()
user_rating_mean = ratings[['userId','rating']].groupby('userId').mean()

In [50]:
user_analysis = user_rating_count.merge(user_rating_mean, on='userId', how='inner')
user_analysis.rename(columns={'rating_x':'rating_count'}, inplace=True)
user_analysis.rename(columns={'rating_y':'avg_rating'}, inplace=True)
user_analysis


Unnamed: 0_level_0,rating_count,avg_rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,157,4.401274
2,19,3.868421
3,19,1.736842
4,116,3.431034
5,37,3.540541
...,...,...
606,363,3.663912
607,126,3.730159
608,479,3.339248
609,25,3.280000


In [90]:
ratings = ratings.merge(user_analysis, on='userId', how='left')
ratings

Unnamed: 0,userId,movieId,rating,repeatCount,rating_count,avg_rating
0,1,1,4.0,223,157,4.401274
1,517,1,4.0,223,220,2.550000
2,213,1,3.5,223,47,4.021277
3,514,1,4.0,223,196,3.767857
4,214,1,3.0,223,17,2.823529
...,...,...,...,...,...,...
57353,525,152081,3.0,39,312,3.639423
57354,21,152081,4.0,39,226,3.398230
57355,114,152081,3.5,39,27,3.481481
57356,596,152081,4.0,39,249,3.483936


In [92]:
ratings = ratings.loc[ratings.index.repeat(ratings['repeatCount'])]
ratings

Unnamed: 0,userId,movieId,rating,repeatCount,rating_count,avg_rating
0,1,1,4.0,223,157,4.401274
0,1,1,4.0,223,157,4.401274
0,1,1,4.0,223,157,4.401274
0,1,1,4.0,223,157,4.401274
0,1,1,4.0,223,157,4.401274
...,...,...,...,...,...,...
57357,401,152081,4.0,39,48,3.458333
57357,401,152081,4.0,39,48,3.458333
57357,401,152081,4.0,39,48,3.458333
57357,401,152081,4.0,39,48,3.458333


In [None]:
#user df için her genreya verdiği ortalama ratingi bulacağız.