In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random as rnd

#### Load data

In [2]:
train_path = '../rating_train.csv'
user_path = '../users.csv'
movie_path = '../movies.csv'
test_path = "../rating_test.csv"

In [3]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

### Merge other .csv data into one dataframe

In [4]:
user = pd.read_csv(user_path)
movie = pd.read_csv(movie_path)

In [5]:
def merge(df):  
    """
    Merge three dataframe into one 
    """
    df_user = (
                df.set_index("UserID").join(user.set_index("UserID"),how="left")
            ).reset_index()
        
    df_movie = (
                df.set_index("MovieID").join(movie.set_index("MovieID"), how="left")
                ).reset_index()
    

    
    df = pd.merge(df_user,movie, how='left', on='MovieID')
    
    return df

In [6]:
train = merge(train)
test = merge(test)
train.head()

Unnamed: 0,UserID,MovieID,timestamps,Rating,Gender,Age,OccupationID,Zip-code,Title,Genres
0,1,1836,978300172,5,F,1,10,48067,"Last Days of Disco, The (1998)",Drama
1,1,1097,978301953,4,F,1,10,48067,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
2,1,2028,978301619,5,F,1,10,48067,Saving Private Ryan (1998),Action|Drama|War
3,1,527,978824195,5,F,1,10,48067,Schindler's List (1993),Drama|War
4,1,2918,978302124,4,F,1,10,48067,Ferris Bueller's Day Off (1986),Comedy


#### Feature Engineering

In [7]:
def get_movie_year(title):
    return title[-5:-1]

def map_age(age):
    if age == 1:
        return 0
    elif age == 18:
        return 1
    elif age == 25:
        return 2
    elif age == 35:
        return 3
    elif age == 45:
        return 4
    elif age == 50:
        return 5
    else:
        return 6
train['movie_year'] =train['Title'].apply(get_movie_year)
test['movie_year'] =test['Title'].apply(get_movie_year)

train['Age'] = train['Age'].map(map_age)
test['Age'] = test['Age'].map(map_age)

#### Drop column

In [8]:
drop = ['Zip-code','timestamps','Title']
train = train.drop(drop,axis=1)
test = test.drop(drop,axis=1)

In [9]:
train.head()

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,OccupationID,Genres,movie_year
0,1,1836,5,F,0,10,Drama,1998
1,1,1097,4,F,0,10,Children's|Drama|Fantasy|Sci-Fi,1982
2,1,2028,5,F,0,10,Action|Drama|War,1998
3,1,527,5,F,0,10,Drama|War,1993
4,1,2918,4,F,0,10,Comedy,1986


#### Model

In [10]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(train[['UserID','MovieID','Rating']], reader)

# Use the famous SVD algorithm.
algo = SVD(n_factors=200,n_epochs=40)

# Run 2-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=2, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 2 split(s).

                  Fold 1  Fold 2  Mean    Std     
RMSE (testset)    0.9353  0.9356  0.9355  0.0002  
MAE (testset)     0.7361  0.7362  0.7362  0.0000  
Fit time          40.08   40.74   40.41   0.33    
Test time         1.81    1.51    1.66    0.15    


{'test_rmse': array([0.93529072, 0.93561081]),
 'test_mae': array([0.73614557, 0.73623391]),
 'fit_time': (40.07932376861572, 40.73696708679199),
 'test_time': (1.8095588684082031, 1.5074470043182373)}

In [11]:
### Make Prediction on test data
algo = SVD(n_factors=200,n_epochs=40)
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x15a32a370>

In [12]:
pred = []
for i in test.index:
    uid = test.iloc[i]['UserID']
    mid = test.iloc[i]['MovieID']
    pred.append(algo.predict(uid,mid).est)

In [13]:
result = test[['UserID','MovieID']]
result['Rating'] = pred
result.to_csv("../Q5_output/Q5_output.csv",index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Rating'] = pred
