In [2]:
import pandas as pd
import zipfile,io
import os
import shutil
import requests
import numpy as np

In [3]:
#Download the file
url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

In [3]:
# make directory:
try:
    if not os.path.exists('rawdata'):
        os.makedirs('rawdata', mode=0o777)
    else:
        shutil.rmtree(os.path.join(os.path.dirname(__file__),'rawdata'),ignore_errors=False)
        os.makedirs('rawdata', mode=0o777)
except Exception as e:
    exit()

In [1]:
# Download zipfile
try:
    rzip = requests.get(url)
    zf = zipfile.ZipFile(io.BytesIO(rzip.content))
    zf.extractall('rawdata')
except Exception as e:
    print(e)

In [6]:
#Detecting Anomalies
#Handling Missing Values
df_links = pd.read_csv('rawdata/ml-latest-small/links.csv')

In [7]:
df_links.isnull().sum()

movieId     0
imdbId      0
tmdbId     13
dtype: int64

In [8]:
df_movies = pd.read_csv('rawdata/ml-latest-small/movies.csv')

In [9]:
df_movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [10]:
df_ratings = pd.read_csv('rawdata/ml-latest-small/ratings.csv')

In [11]:
df_ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [12]:
df_tags = pd.read_csv('rawdata/ml-latest-small/tags.csv')

In [13]:
df_tags.isnull().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [14]:
df_links[pd.isnull(df_links).any(axis=1)]

Unnamed: 0,movieId,imdbId,tmdbId
607,720,118114,
608,721,114103,
640,769,116992,
910,1133,111357,
2282,2851,81454,
3246,4051,56600,
5826,26587,92337,
7408,72781,1104746,
7438,73759,495212,
7588,79299,874957,


In [15]:
df_tags[pd.isnull(df_tags).any(axis=1)]

Unnamed: 0,userId,movieId,tag,timestamp


In [16]:
df_links.dropna(inplace=True)

In [17]:
df_tags.dropna(inplace=True)

In [18]:
df_links.isnull().sum()

movieId    0
imdbId     0
tmdbId     0
dtype: int64

In [19]:
df_tags.isnull().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [20]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [21]:
# Splitting  title and year
df_movies['year'] = df_movies.title.str.extract("\((\d{4})\)")

  


In [22]:
genres_unique = pd.DataFrame(df_movies.genres.str.split('|').tolist()).stack().unique()

In [23]:
genres_unique

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'Documentary', 'IMAX', 'War', 'Musical',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [24]:
df_movies = df_movies.join(df_movies.genres.str.get_dummies().astype(int))

In [25]:
df_movies.drop('genres',inplace=True,axis=1)

In [26]:
df_movies.head()

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),1995,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),1995,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [28]:
df_ratings.timestamp = pd.to_datetime(df_ratings.timestamp, infer_datetime_format=True)

In [29]:
df_ratings.timestamp = df_ratings.timestamp.dt.year

In [30]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1970
1,1,1029,3.0,1970
2,1,1061,3.0,1970
3,1,1129,2.0,1970
4,1,1172,4.0,1970


In [32]:
df_tags.isnull().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [33]:
df_links.isnull().sum()

movieId    0
imdbId     0
tmdbId     0
dtype: int64

In [34]:
df_movies['year'].fillna(0,inplace=True)

In [35]:
df_movies['year'] = df_movies['year'].astype(int)

In [36]:
df_movies.isnull().sum()

movieId               0
title                 0
year                  0
(no genres listed)    0
Action                0
Adventure             0
Animation             0
Children              0
Comedy                0
Crime                 0
Documentary           0
Drama                 0
Fantasy               0
Film-Noir             0
Horror                0
IMAX                  0
Musical               0
Mystery               0
Romance               0
Sci-Fi                0
Thriller              0
War                   0
Western               0
dtype: int64

In [37]:
df_movies.sort_values(by='movieId', inplace=True)

In [38]:
df_ratings.sort_values(by='movieId', inplace=True)

In [39]:
# zip the csvs and log files
def zipdir(path,ziph):
    #ziph.write(os.path.join('cleansed_df_genome_scores.csv'))
    #ziph.write(os.path.join('cleansed_df_genome_tags.csv'))
    ziph.write(os.path.join('cleansed_df_tags.csv'))
    ziph.write(os.path.join('cleansed_df_links.csv'))
    ziph.write(os.path.join('cleansed_df_movies.csv'))
    ziph.write(os.path.join('cleansed_df_ratings.csv'))

In [47]:
#df_genome_scores.to_csv('cleansed_df_genome_scores.csv',index = False)
#print('1')
#df_genome_tags.to_csv('cleansed_df_genome_tags.csv',index = False)
#print('2')
df_tags.to_csv('cleansed_df_tags.csv',index = False)
print('3')
df_links.to_csv('cleansed_df_links.csv',index = False)
print('4')
df_movies.to_csv('cleansed_df_movies.csv',index = False)
print('5')
df_ratings.to_csv('cleansed_df_ratings.csv',index = False)
print('6')

3
4
5
6


In [48]:
zipf = zipfile.ZipFile('CleansedData.zip','w',zipfile.ZIP_DEFLATED)
zipdir('/',zipf)
zipf.close()

In [40]:
## Distinct Users and Movies 
n_users = df_ratings.userId.unique().shape[0]
n_movies = df_ratings.movieId.unique().shape[0]

In [41]:
print(str(n_users) + ' users')

671 users


In [42]:
print(str(n_movies) + ' movies')

9066 movies


In [43]:
df_ratings_try = df_ratings

In [44]:
#adjusted ratings
Mean = df_ratings.groupby(['userId'],as_index=False,sort=False).mean().rename(columns = {'rating':'rating_mean'})

In [45]:
Mean.head(2)

Unnamed: 0,userId,movieId,rating_mean,timestamp
0,68,7169.325203,3.626016,1970.0
1,261,2092.3,3.63,1970.0


In [46]:
df_ratings_try1 = pd.merge(df_ratings,Mean[['userId','rating_mean']],on='userId')

In [47]:
df_ratings_try1.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,rating_mean
0,68,1,4.0,1970,3.626016
1,68,2,3.0,1970,3.626016


In [48]:
df_ratings_try1['rating_adjusted'] = df_ratings_try1['rating'] - df_ratings_try1['rating_mean']

In [49]:
df_ratings_try1.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,rating_mean,rating_adjusted
0,68,1,4.0,1970,3.626016,0.373984
1,68,2,3.0,1970,3.626016,-0.626016


In [50]:
df_ratings_try2 = df_ratings_try1[['userId','movieId','rating_adjusted','timestamp']]

In [51]:
df_ratings_try3 = df_ratings_try2[['userId','rating_adjusted']]

In [52]:
df_ratings_try3.set_index('userId',inplace=True)

In [53]:
df_ratings_try3.head(2) 

Unnamed: 0_level_0,rating_adjusted
userId,Unnamed: 1_level_1
68,0.373984
68,-0.626016


In [54]:
from sklearn.preprocessing import MinMaxScaler

In [55]:
scaler = MinMaxScaler()

In [56]:
df_ratings_try3_scaled= pd.DataFrame(scaler.fit_transform(df_ratings_try3),index=df_ratings_try3.index,columns=df_ratings_try3.columns)

In [57]:
df_ratings_try3_scaled.head(2)

Unnamed: 0_level_0,rating_adjusted
userId,Unnamed: 1_level_1
68,0.581788
68,0.449728


In [58]:
df_ratings_try3_scaled[['rating_adjusted']] = df_ratings_try3_scaled[['rating_adjusted']].apply(lambda x:5*x)

In [59]:
df_ratings_try3_scaled.head(2)

Unnamed: 0_level_0,rating_adjusted
userId,Unnamed: 1_level_1
68,2.90894
68,2.248639


In [60]:
df_ratings_try3_scaled['userId'] = df_ratings_try3_scaled.index

In [61]:
df_ratings_try3_scaled = df_ratings_try3_scaled.reset_index(drop=True)

In [62]:
df_ratings_try3_scaled.head(2)

Unnamed: 0,rating_adjusted,userId
0,2.90894,68
1,2.248639,68


In [63]:
df_ratings_try2 = df_ratings_try2.drop('rating_adjusted',axis=1)

In [64]:
df_ratings_try2.head(3)

Unnamed: 0,userId,movieId,timestamp
0,68,1,1970
1,68,2,1970
2,68,11,1970


In [65]:
df_ratings_try3_scaled.head(3)

Unnamed: 0,rating_adjusted,userId
0,2.90894,68
1,2.248639,68
2,2.578789,68


In [66]:
df_ratings_try2['rating_adjusted'] = df_ratings_try3_scaled['rating_adjusted']

In [67]:
df_ratings_try2.head(3)

Unnamed: 0,userId,movieId,timestamp,rating_adjusted
0,68,1,1970,2.90894
1,68,2,1970,2.248639
2,68,11,1970,2.578789


In [68]:
df_ratings_try2['rating'] = df_ratings_try2['rating_adjusted']

In [69]:
df_ratings_try2 = df_ratings_try2.drop('rating_adjusted',axis=1)

In [70]:
df_ratings_try2.head(3)

Unnamed: 0,userId,movieId,timestamp,rating
0,68,1,1970,2.90894
1,68,2,1970,2.248639
2,68,11,1970,2.578789


In [71]:
# train test split
from sklearn import cross_validation as cv
train_data,test_data = cv.train_test_split(df_ratings_try2,test_size=0.25)



In [72]:
train_data.shape

(75003, 4)

In [73]:
test_data.shape

(25001, 4)

In [74]:
df_ratings_try2.head(3)

Unnamed: 0,userId,movieId,timestamp,rating
0,68,1,1970,2.90894
1,68,2,1970,2.248639
2,68,11,1970,2.578789


In [104]:
df_ratings_try2.to_csv('cleaneddata.csv',index=False)

In [105]:
def zipdir1(path,ziph):
    ziph.write(os.path.join('cleaneddata.csv'))

In [106]:
zipf = zipfile.ZipFile('FurtherData.zip','w',zipfile.ZIP_DEFLATED)
zipdir1('/',zipf)
zipf.close()

## Upload to S3

In [115]:
AWS_ACCESS_KEY_ID = 'Enter Access Key'
AWS_SECRET_ACCESS_KEY = 'Enter Secret Accesss Key'
inputLocation = 'us-east-1'
filepaths='FurtherData.zip'

In [116]:
from boto.s3.key import Key
from boto.s3.connection import S3Connection
import boto.s3
import boto3

In [117]:
def upload_to_s3(Inputlocation,Access_key,Secret_key):            
    print("Uploading files to amazon")
    try:

        buck_name="testonfiles"

        S3_client = boto3.client('s3',Inputlocation,aws_access_key_id= Access_key, aws_secret_access_key= Secret_key)
    
        if Inputlocation == 'us-east-1':
            S3_client.create_bucket(Bucket=buck_name)
        else:
            S3_client.create_bucket(Bucket=buck_name,CreateBucketConfiguration={'LocationConstraint': Inputlocation})

        print("connection successful")
        S3_client.upload_file("FurtherData.zip", buck_name,"FurtherData.zip"),
        #Callback=ProgressPercentage("CleansedData.zip")
    
        print("Files uploaded successfully")
    
    except Exception as e:
        print("Error uploading files to Amazon s3" + str(e))

In [118]:
upload_to_s3(inputLocation,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY)

Uploading files to amazon
connection successful
Files uploaded successfully


# surprise Library Python

In [80]:
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import Reader
from surprise import evaluate, print_perf

In [115]:
from surprise import KNNBasic

In [86]:
from surprise import dataset

In [81]:
df_ratings_try2.head(3)

Unnamed: 0,userId,movieId,timestamp,rating
0,68,1,1970,2.90894
1,68,2,1970,2.248639
2,68,11,1970,2.578789


In [83]:
reader = Reader(line_format='user item rating', rating_scale=(0, 5))

In [84]:
check2 = df_ratings_try2[['userId','movieId','rating']]

In [87]:
class MyDataset(dataset.DatasetAutoFolds):
    def __init__(self, df, reader):
        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in
                            zip(df['userId'], df['movieId'], df['rating'])]
        self.reader=reader

In [88]:
data = MyDataset(check2,reader)

In [89]:
data.split(n_folds=5)

### SVD
Singular Value Decomposition is a matrix factorization method which is used in various domains of science and technology. 

In [90]:
algo = SVD()

In [None]:
#link for concepts
https://blog.statsbot.co/singular-value-decomposition-tutorial-52c695315254

In [91]:
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.5919
MAE:  0.4567
------------
Fold 2
RMSE: 0.5845
MAE:  0.4506
------------
Fold 3
RMSE: 0.5924
MAE:  0.4575
------------
Fold 4
RMSE: 0.5977
MAE:  0.4599
------------
Fold 5
RMSE: 0.5976
MAE:  0.4616
------------
------------
Mean RMSE: 0.5928
Mean MAE : 0.4573
------------
------------


In [92]:
print_perf(perf)

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.5919  0.5845  0.5924  0.5977  0.5976  0.5928  
MAE     0.4567  0.4506  0.4575  0.4599  0.4616  0.4573  


In [93]:
train = data.build_full_trainset()

In [94]:
algo.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11925ca58>

In [105]:
from surprise import dump

In [106]:
file_name = os.path.expanduser('~/dump_file')

In [107]:
dump.dump(file_name, algo=algo)

In [108]:
_, loaded_algo = dump.load(file_name)

In [111]:
## Creation of test dataframe-

In [96]:
test_df = check2.head(3)

In [97]:
test_df = test_df[['userId','movieId','rating']]

In [98]:
data = MyDataset(test_df,reader)

In [101]:
test = data.build_full_trainset()

In [102]:
predictions = algo.test(test.build_testset())

In [103]:
predictions 

[Prediction(uid=68, iid=1, r_ui=1.9089395525500699, est=1.696843642937886, details={'was_impossible': False}),
 Prediction(uid=68, iid=2, r_ui=1.2486392713680443, est=1.3619949663212765, details={'was_impossible': False}),
 Prediction(uid=68, iid=11, r_ui=1.578789411959057, est=1.5785831310104772, details={'was_impossible': False})]

In [112]:
#prediction made on dump file
predictions_loaded_algo = loaded_algo.test(test.build_testset())

In [113]:
predictions_loaded_algo

[Prediction(uid=68, iid=1, r_ui=1.9089395525500699, est=1.696843642937886, details={'was_impossible': False}),
 Prediction(uid=68, iid=2, r_ui=1.2486392713680443, est=1.3619949663212765, details={'was_impossible': False}),
 Prediction(uid=68, iid=11, r_ui=1.578789411959057, est=1.5785831310104772, details={'was_impossible': False})]

### KNNBasic

In [116]:
algo1 = KNNBasic()

In [117]:
data = MyDataset(check2,reader)

In [118]:
data.split(n_folds=5)

In [119]:
perf = evaluate(algo1, data, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.5906
MAE:  0.4524
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.5984
MAE:  0.4577
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.6005
MAE:  0.4598
------------
Fold 4
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.5971
MAE:  0.4581
------------
Fold 5
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.6002
MAE:  0.4595
------------
------------
Mean RMSE: 0.5974
Mean MAE : 0.4575
------------
------------


In [120]:
print_perf(perf)

        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
RMSE    0.5906  0.5984  0.6005  0.5971  0.6002  0.5974  
MAE     0.4524  0.4577  0.4598  0.4581  0.4595  0.4575  
