In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

In [124]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [125]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [127]:
movies['year'] = movies['title'].str.extract('(\d{4})')
movies['title'] = movies['title'].apply(lambda x: x[:-7])

In [132]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
 3   year     9742 non-null   int32 
dtypes: int32(1), int64(1), object(2)
memory usage: 266.5+ KB


In [131]:
movies.year.fillna(0, inplace=True)
movies['year'] = movies.year.astype('int')

In [133]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

movies['genres'] = movies['genres'].apply(change_string)

In [134]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995
1,2,Jumanji,Adventure Children Fantasy,1995
2,3,Grumpier Old Men,Comedy Romance,1995
3,4,Waiting to Exhale,Comedy Drama Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [135]:
ratings = ratings.groupby('movieId').mean()[['rating']]
ratings.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.92093
2,3.431818
3,3.259615
4,2.357143
5,3.071429


In [136]:
df = movies.join(ratings, on='movieId')
df.head()

Unnamed: 0,movieId,title,genres,year,rating
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995,3.92093
1,2,Jumanji,Adventure Children Fantasy,1995,3.431818
2,3,Grumpier Old Men,Comedy Romance,1995,3.259615
3,4,Waiting to Exhale,Comedy Drama Romance,1995,2.357143
4,5,Father of the Bride Part II,Comedy,1995,3.071429


In [137]:
tags = tags.groupby('movieId').agg(lambda x: x.values.tolist())
tags['tags'] = tags['tag'].apply(lambda x: ','.join(x).replace(' ', '').replace('-', '').replace(',', ' ').lower())
tags = tags[['tags']]
tags.head()

Unnamed: 0_level_0,tags
movieId,Unnamed: 1_level_1
1,pixar pixar fun
2,fantasy magicboardgame robinwilliams game
3,moldy old
5,pregnancy remake
7,remake


In [138]:
df = df.join(tags, on='movieId')

In [140]:
# df['features']= df["genres"].astype(str) +" "+ df["tags"]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   title    9742 non-null   object 
 2   genres   9742 non-null   object 
 3   year     9742 non-null   int32  
 4   rating   9724 non-null   float64
 5   tags     1572 non-null   object 
dtypes: float64(1), int32(1), int64(1), object(3)
memory usage: 418.7+ KB


In [141]:
from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer()
x = v.fit_transform(df['genres'])

df1 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())
df1.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
df['tags'].fillna('notags', inplace=True)

In [143]:
x = v.fit_transform(df['tags'])

df2 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())
df2.head()

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
data = pd.concat([df, df1, df2], axis = 1).drop(['title', 'genres', 'tags'], axis = 1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Columns: 1496 entries, movieId to zooeydeschanel
dtypes: float64(1494), int32(1), int64(1)
memory usage: 111.2 MB


In [148]:
X = data.drop(['rating'], axis = 1)
Y = data.rating.fillna(0)

In [149]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [156]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(max_depth=10)
model.fit(X_train, y_train)
pred = model.predict(X_test)

In [157]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, pred)

0.6732197681893046

In [158]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
pred = model.predict(X_test)
mean_squared_error(y_test, pred)

0.677032293739654