In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error as mse
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import linear_model

In [None]:
# Matrix of genome scores of all attributes, loaded from disk.
# Rows are indexed by movie id in the cells further down.
Movie_Genome = np.load('./MovieGenome.npy')

# Problem sizes.
num_users, num_movies = 9970, 9998

# Hyper-parameters.
pca_length = 512   # number of PCA components kept
gen_len = 19       # not referenced in the visible cells
clusters = 10      # number of KMeans clusters

In [None]:
# Cluster the movies into `clusters` groups using the raw genome vectors.
# `labels[m]` is the cluster id assigned to row m of Movie_Genome.
kmeans = KMeans(random_state=0, n_clusters=clusters).fit(Movie_Genome)
labels = kmeans.labels_

In [None]:
# Standardise every genome column, then project onto `pca_length` principal
# components. `Master` ends up with one row per row of Movie_Genome.
Master = StandardScaler().fit_transform( Movie_Genome )
# random_state pins the randomized SVD solver (when scikit-learn selects it),
# so Restart & Run All reproduces the same components, like the seeded KMeans.
pca = PCA( n_components = pca_length, random_state = 0 )
Master = pca.fit_transform( Master )
print( Master.shape )

In [None]:
# Training data as a plain NumPy array. Column 1 is used throughout this
# notebook as the row index into Master / labels / C.
train = np.asarray(pd.read_csv('./train.csv'))

# Distinct values of column 1 that occur in the training data.
movie_column = train[:, 1].astype(int)
set_movies = set(movie_column)

In [None]:
# Feature matrix for the regression: the PCA row of every movie that occurs in
# the training data, in the iteration order of `set_movies` (the target cell
# iterates the same set, so rows stay aligned).
#
# A single fancy-index gather replaces the per-movie np.vstack, which recopied
# the whole array on every iteration and needed a dummy first row.
movie_order = np.fromiter( set_movies, dtype = int, count = len( set_movies ))
# float64 matches what stacking onto the np.ones seed row used to produce.
X_train = np.asarray( Master[ movie_order ], dtype = np.float64 )

In [None]:
# Regression target: mean of column 2 of `train` for every movie in
# `set_movies`, in the set's iteration order so rows line up with X_train.
#
# One bincount pass over the data replaces a full scan of `train` per movie.
# Values can differ from np.mean in the last floating-point bits because the
# summation order is different.
# NOTE(review): np.bincount needs non-negative ids; they are already used as
# row indices into Master, so this is presumably true -- confirm.
movie_order = np.fromiter( set_movies, dtype = int, count = len( set_movies ))
train_movie_ids = train[ : , 1 ].astype( int )
rating_sums = np.bincount( train_movie_ids, weights = train[ : , 2 ].astype( float ))
rating_counts = np.bincount( train_movie_ids )

# Every id in movie_order occurs in train, so its count is >= 1.
Y_train = ( rating_sums[ movie_order ] / rating_counts[ movie_order ] ).reshape( -1, 1 )

In [None]:
# Ordinary least squares from the PCA features (X_train) to the per-movie
# target (Y_train). The fitted estimator is the cell's displayed value.
reg = linear_model.LinearRegression()
reg.fit(X=X_train, y=Y_train)

In [None]:
# C[m]: regression prediction for row m of Master (the per-movie base score).
# Y_train is built with shape (n, 1), so predict() returns a column vector.
# Flatten it so C[m] is a scalar: otherwise later statements of the form
# `x[j] = ... + C[m]` assign a 1-element array to a scalar slot, which NumPy
# has deprecated.
C = np.ravel( reg.predict( Master ))

# B[u, k]: additive offset of user u for cluster k, trained further down.
# Zero doubles as the "never updated" marker in the imputation cell.
B = np.zeros(( num_users, clusters ))

In [None]:
# Squared norm of every genome row; a value of 0 means the row is all zeros.
temp = np.square(Movie_Genome).sum(axis=1)
print(temp.shape)

# Indices of the all-zero genome rows, as a plain Python list.
temp_movie_0 = np.flatnonzero(temp == 0).tolist()

# For those rows, overwrite the regression output in C with the mean of
# column 2 over that movie's rows in `train`, or with the overall mean of
# column 2 when the movie has no rows in `train`.
mean = np.mean(train[:, 2])
movie_column = train[:, 1]
for j in tqdm(temp_movie_0):
    temp_movie = train[movie_column == j]
    C[j] = np.mean(temp_movie[:, 2]) if len(temp_movie) else mean

In [None]:
# Step size for the updates of B.
# ita = 0.01 till training error reduces significantly, then we used
# ita = 0.005 and later ita = 0.001.
ita = 0.01

# Rows of `train` consumed per update step, and the resulting step count per
# epoch (floor division; the training loop gives leftover rows to the last
# batch).
batch_size = 1
num_batches = len(train) // batch_size

In [None]:
# Train B so that  rating ~= B[user, cluster(movie)] + C[movie].
#
# Everything that does not change between epochs is computed once up front:
# integer user/movie indices, the cluster of each rated movie, and the
# per-row base score taken from C.
user_idx = train[ : , 0 ].astype( int )
movie_idx = train[ : , 1 ].astype( int )
cluster_idx = labels[ movie_idx ]
# np.ravel: C may be a column vector (Y_train is built with shape (n, 1)).
# Flattening keeps every term a scalar instead of a 1-element array.
movie_base = np.ravel( C )[ movie_idx ]
# Explicit float dtype: np.zeros_like( train column ) would inherit an integer
# dtype from `train` and silently truncate the predictions.
true = train[ : , -1 ].astype( float )

for epoch in range( 10 ):
    for i in tqdm( range( num_batches )):
        # The last batch also takes the rows left over by the floor division.
        start = i * batch_size
        stop = len( train ) if i == num_batches - 1 else start + batch_size
        rows = slice( start, stop )

        u = user_idx[ rows ]
        k = cluster_idx[ rows ]

        # Predictions for the whole batch use B as it was before this step.
        residual = true[ rows ] - ( B[ u, k ] + movie_base[ rows ] )

        # np.add.at accumulates when the same (user, cluster) pair occurs
        # more than once in a batch, exactly like a row-by-row `+=` loop.
        # A plain fancy-indexed `+=` would keep only one of the updates.
        np.add.at( B, ( u, k ), ita * residual )

    # Training MSE after the epoch, evaluated in one vectorised pass.
    scores = B[ user_idx, cluster_idx ] + movie_base
    print( mse( true, scores ))

In [None]:
# Persist B as it stands at this point in the notebook (i.e. before the
# zero-filling cell that follows). np.save appends ".npy" to the file name.
np.save("Users_2404_2_2",B)

In [None]:
# Fill the entries of B that are still exactly 0 (treated as "never trained").
#
# 1. Users with some, but not all, clusters at 0: each zero entry becomes the
#    mean of that user's non-zero entries.
# 2. Users with every cluster at 0: the row becomes the per-cluster mean over
#    the non-zero entries of all users (computed after step 1).
zero_mask = ( B == 0 )
asum = zero_mask.sum( axis = 1 )              # zero entries per user

partial = ( asum > 0 ) & ( asum < clusters )
# The zeros contribute nothing to the row sum, so sum / (#non-zero) is the
# mean of the trained entries. partial rows always have >= 1 non-zero entry.
row_mean = B[ partial ].sum( axis = 1 ) / ( clusters - asum[ partial ] )
B[ partial ] = np.where( zero_mask[ partial ], row_mean[ : , None ], B[ partial ] )

sums = np.sum( B, axis = 0 )
nonzero_per_cluster = np.count_nonzero( B, axis = 0 )
# Guarded division: a cluster with no non-zero entry would otherwise produce
# 0/0 = NaN, which would then be copied to every user without ratings. Such a
# cluster keeps a mean of 0 (no adjustment on top of C).
means = np.divide( sums, nonzero_per_cluster,
                   out = np.zeros_like( sums ), where = nonzero_per_cluster > 0 )

B[ asum == clusters ] = means

In [None]:
# Test data as a plain NumPy array, plus a zero-filled buffer with one
# prediction slot per test row.
test = np.asarray(pd.read_csv('./test.csv'))
predict_test = np.zeros(test.shape[0])

In [None]:
# Prediction for every test row:  C[movie] + B[user, cluster(movie)].
# Column 0 of `test` is the user index and column 1 the movie index.
# Vectorised; np.ravel keeps the result 1-D even when C is a column vector,
# avoiding the deprecated assignment of a 1-element array to a scalar slot.
test_user_idx = test[ : , 0 ].astype( int )
test_movie_idx = test[ : , 1 ].astype( int )
predict_test = np.ravel( C )[ test_movie_idx ] + B[ test_user_idx, labels[ test_movie_idx ] ]

In [None]:
# Sanity check on the raw predictions: their range, and how many exceed 10.
lowest, highest = min(predict_test), max(predict_test)
print(lowest, highest)
print(np.count_nonzero(predict_test > 10))

In [None]:
# Clamp the predictions to [0.5, 5.0] in place, then round to one decimal.
np.clip(predict_test, 0.5, 5.0, out=predict_test)
predict_test = np.around(predict_test, decimals=1)

In [None]:
# Submission table: one row per test row, with 'Id' being the row position.
id1 = np.arange(len(test), dtype=int)

mysubmission = pd.DataFrame(
    {'Id': id1, 'Prediction': predict_test},
    index=id1,
    columns=['Id', 'Prediction'],
)
mysubmission.head()

In [None]:
# Summary statistics of the submission columns, shown as the cell output.
mysubmission.describe()

In [None]:
# Write the submission without the DataFrame index column.
# Plain string literal: the former f-string had no placeholders.
mysubmission.to_csv( 'Submission.csv', index = False )