In [118]:
#!pip install --user tensorflow==2.6.0 --ignore-installed
#!pip install pyarrow
#!pip install numpy
#!pip install pandas
#!pip install dask
# Importing Libraries
import pandas as pd
import numpy as np
import string
import re
from sklearn.metrics.pairwise import cosine_similarity
import dask.dataframe as dd
import pyarrow as pa
from sklearn.feature_extraction.text import CountVectorizer
from oauth2client.service_account import ServiceAccountCredentials
from google.cloud import storage
import os


In [144]:
# 1 - Retreives .csv file from Google Cloud Storage Bucket or GitHub
def ReadData(url):
    movies = pd.read_csv(url)
    return movies
# 2 - Cleans text features
def CleanText(text):
    print("Cleaning Text Features... ")
    clean_features = text.fillna(' ', inplace=False)
    for col in clean_features:
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace(' ', ''))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace(',', ' '))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).lower())
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('&', ''))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('.', ''))
    return clean_features


# 3 - Creates a Bag-of-Words for each movie
def CreateBow(clean_text):
    print("Creating Bag-of-Words... ")
    bow_list = clean_text.agg(' '.join, axis=1)
    # for loop removes duplicate words within each movie's BoW
    for bow in bow_list:
        re.sub(r'\b(.+)\s+\1\b', r'\1', str(bow_list))
    return bow_list

# 4 - Creates a word-count matrix from the bag-of-words list
def CountVectorize(bow_list):
    print("Converting BoW to Matrix...")
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit(bow_list)
    count_transform = count_vectorizer.transform(bow_list)
    soup_count_array = count_transform.toarray()
    count_matrix = pd.DataFrame(soup_count_array)
    return count_matrix


# 5 - Trims matrix to only include terms seen more than x amount of time (count_treshold)
def TrimMatrix(count_matrix, count_treshold):
    print("Trimming Matrix (May Take a Few Minutes)...")
    compact_matrix = count_matrix.loc[:, (count_matrix.sum(axis=0) > count_treshold)]
    return compact_matrix

# 6 - Calculates similarities between each movie
def SimilarityScores(matrix):
    print("Calculating Similarity Scores (Also May Take a Few Minutes)...")
    sim_df = cosine_similarity(matrix,matrix)
    print("Done!")
    return sim_df


# 7 - Sorts and saves top 15 similarity scores for each movie,
# then sorts the top 15 by Tomatometer Rating (High to Low).
# The function will then provide top 5 ratings from the most similar movies
def CreateRecDf(sim_df, ratings):
    print("Creating Top Recommendations From Sim Scores...")
    index_df=pd.DataFrame([])
    ratings_list= ratings.values.tolist()
    for col in range(len(sim_df)):
        sim_list=sim_df[col]
        print(sim_list)
        movie_df=pd.DataFrame({
        'sim_score': sim_list,
        'rating': ratings_list
        })
        sim_sort = movie_df.sort_values(by = 'sim_score', ascending=False)
        top15 = sim_sort[1:16]
        top15_sort = top15.sort_values(by = 'rating', ascending=False)
        top5 = top15_sort[0:5]
        idx_list = pd.DataFrame(top5.index.values)
        idx_list = idx_list.transpose()
        index_df = index_df.append(idx_list)
    index_df=pd.DataFrame(index_df)
    index_df.columns=['rec1','rec2','rec3','rec4','rec5']
    index_df.reset_index(inplace=True,drop=True)
    print("Done!")
    return index_df

# 8 - Saves as .csv and Uploads to Google Cloud Storage Bucket
def SaveUploadCsv(index_df):
    print("Saving to .csv File and Uploading to Google Storage Bucket...")
    index_df.to_csv('./RecIndices.csv')
    print("Done! Saved to Bucket at gs://data_bucket_rr1 !")
    return

In [126]:
def main():
        # Defining GitHub URL to read .csv file
    url = 'gs://data_bucket_rr1/rotten_tomatoes_movies.csv'
    movies = ReadData(url)
        # Defining text features for Word-Count Matrix
    text_features = movies[['genres', 'directors', 'authors', 'actors', 'production_company']]
        # Defining Rotten Tomatoes Ratings
    ratings = movies[['tomatometer_rating']]

        # 1 - CleanText()
    clean_text = CleanText(text_features)

        # 2 -  CreateBow()
    bow_list = CreateBow(clean_text)

        # 3 - CountVectorize()
    count_matrix = CountVectorize(bow_list)

        # 4 - TrimMatrix() - Only includes words that appear X amount of times across all movies
    minimum_freq = 5
    trimmed_matrix = TrimMatrix(count_matrix, minimum_freq)

        # 5 - SimilarityScores()
    sim_df = SimilarityScores(trimmed_matrix)

        # 6 - CreateRecDf()
    recommendation_df = CreateRecDf(sim_df, ratings)

        # 7 - SaveUploadCsv()
    SaveUploadCsv(index_df)

if __name__ == "__main__":
    main()


Cleaning Text Features... 
Creating Bag-of-Words... 
Converting BoW to Matrix...
Trimming Matrix (May Take a While)...
Calculating Similarity Scores...
Done!
Creating Top Recommendations Data Frame From Sim Scores...


IndexError: arrays used as indices must be of integer (or boolean) type

In [145]:
##print(recommendation_df)
    # 7 - SaveUploadCsv()
SaveUploadCsv(index_df)

Saving to .csv File and Uploading to Google Storage Bucket...
Done! Saved to Bucket at gs://data_bucket_rr1 !


In [31]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    
    bucket_name = "data_bucket_rr1"
    source_file_name = r"C:\Users\Robert Rindos\sim_matrix.csv"
    destination_blob_name = "sim_matrix.csv"

    storage_client = storage.Client(project='Movie Recommendation')
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

In [34]:
bucket_name = "data_bucket_rr1"
source_file_name = r"C:\Users\Robert Rindos\sim_matrix.csv"
destination_blob_name = "sim_matrix.csv"
upload_blob(bucket_name, source_file_name, destination_blob_name)

SSLError: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /upload/storage/v1/b/data_bucket_rr1/o?uploadType=resumable&upload_id=ADPycdtRX1crt4TkqXlZrxVe7hLA0i8A-9FmcFAKFwF3N0YVi45iANPkndewjPixwW_11sOSo-gNeyIOhtSINprbNZk (Caused by SSLError(SSLWantWriteError(3, 'The operation did not complete (write) (_ssl.c:2474)')))

In [33]:
!pip install --upgrade google-api-python-client
!pip install --upgrade google-auth-oauthlib
!pip install --upgrade google-auth-httplib2
!pip install --upgrade google-auth

Collecting google-api-python-client
  Downloading google_api_python_client-2.19.1-py2.py3-none-any.whl (7.4 MB)
Collecting google-auth-httplib2>=0.1.0
  Downloading google_auth_httplib2-0.1.0-py2.py3-none-any.whl (9.3 kB)
Collecting uritemplate<4dev,>=3.0.0
  Downloading uritemplate-3.0.1-py2.py3-none-any.whl (15 kB)
Installing collected packages: uritemplate, google-auth-httplib2, google-api-python-client
Successfully installed google-api-python-client-2.19.1 google-auth-httplib2-0.1.0 uritemplate-3.0.1


In [15]:
print("Converting to .csv file (Almost Done)...")

credentials_dict = {
  "type": "service_account",
  "project_id": "cellular-fold-316319",
  "private_key_id": "72bfe1491eabae568945c86b84faa72b7d7355f4",
  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDQ9P5a4S2iiQh9\nUAopQmg07mTGogxYjaaJ/F4fQ9xaw+S+bsLSjmz7wLTnM0/DlE7HzawDywdSzYit\nvrDAghT1mMYHbjmkXp835+EUW973+bPLajK/5APTkcf65qYEOaiPJsba5X8NayNw\nEs09dA0GxpyM3gjdXpInGhCAVgYGwVvbRxHXLKzF9nk2+cry0qqL5he485p/xO8+\nNTPCzyzZGr8vVj01V8bZBjqjhAy8retetfCj+X6VkqVMjrl1hLgLVieE/PjkF1I+\nq3f6hB5kNqoGMcm8esne+sTjxl7iHpizaw22kzVyW/F0XBUa04YyY27Ir/m+nEid\nIZy9GRrRAgMBAAECggEAAvyMbd636vG504YvRf6RLBf/7FucsO+R2rkm3HwpICo6\neMsPGWiwF7NN9pu78y35iAcpUyrNw44E2wRdpPu6+lVlF644cFAqpwc3NyX2wG0z\nB9hQSkAzWozT6pLtOqRbE5Q0nYRwEeBXEuMO6EbiSIdyTXxfLLBNRxYrGBV6q7Il\nRsYbUEwcD/RDJRCKwwjPGBE29ukfK+a+bUmmatm+zvrKfMpJ4iwMRwu9iT9ThTFU\nphDukkR7f6YRINrrOXNtDba8jQs2UEqIDFZJ4gQeTdpSM6dflQOIA+ShcYIax6Mx\nETq3rguUPPcObL3gPZIz4IuE2AD8OyaSwjnEScDB4QKBgQD93TL0UnQ6Pb00eCdC\nna35ZQ6GUUJ647lwtnJdsv2hpGeeDC6MJ17S0sIafJwRAvbic5lmn/QJdCwBXci6\nLDf4e6oExsN+XEpMAH7yaXLMUb0Cy1I1Pqt2Q2n/RUUkCX4mHNYmasFxa1VRLSAX\nxzbeuPXONmhacnffQHnhw1vvYQKBgQDStxGUld/KS98eofjTTKJtgj781JAMYvb1\nQUdq8ZdSIFzXxG5g5uineU4PypE0YSd1CcSMjr4JFqfmubj5+bXVEBAXqn3pbCd2\npAYqO/NKDT+LU2G+pSszLG2CXZljxouWH0ZwXxoYvU6jNAD6WhEyuR6JSHu5kNQ9\nJEeGFf8RcQKBgErNlgkWDrdmn8zJJtBjQ/M7ytbbNNqfJXJH4FUpu8CYgAu7DTAx\nHajDf3xsgVQl6nd5lpb9xNvQRnM0nkP1aZynW8bChrJCIYZZkw588d2wrCAZpySX\nTdCwNpiIYcInpes3DgcN5+gyt1YPFOjMw6dHGu7ja0ZTemznQ8K5XOqhAoGAN89B\nGsevZFdAG07vJpfvlW2f3Ts3g/BaVI8bqVmfqKLltEhUq9m82U6pvEj8uJDa8IbL\nxiawcpvLkLK0KZFX9byxZnNDsiVxNl0vPNRwFbb4k6BcDTCWBpHH13WPAPw/UF2s\nPGsT8GWDhHprn+NnFrYtPlMyYe3YKPGI+ByvWCECgYEAmKV2hL1+wfxMLrWEueqp\nnpq3RbspM//2BIxCQUbPf1awsONZpD6XnWM6Auy4YPAWoKx8eJBClmVSG5gUEIg0\ngj0Y+0G3YJ9ak5rOiLflroE4meVP9JZ1498iNfXclWqs+6l+UML7mK3xDXLY9tIj\n77rJcrPn6/k8gR+gqsFEpNM=\n-----END PRIVATE KEY-----\n",
  "client_email": "service-account-rr1@cellular-fold-316319.iam.gserviceaccount.com",
  "client_id": "101903527339742429473",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/service-account-rr1%40cellular-fold-316319.iam.gserviceaccount.com"
}
credentials = ServiceAccountCredentials.from_json_keyfile_dict(
    credentials_dict
)

client = storage.Client(project = 'Movie Recommendation',credentials=credentials)
bucket = client.get_bucket('data_bucket_rr1')
bucket.blob('sim_matrix.csv').upload_from_string(sim_matrix.to_csv(), 'text/csv')
    
print("Done!")


Converting to .csv file (This Will at)...


SSLError: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /upload/storage/v1/b/data_bucket_rr1/o?uploadType=resumable&upload_id=ADPycdsj0_ge4N0Fij7fevTCwhi3aq6DcYcDSWxaFddzgUSTrG78mg6DhLTcfIE2q3bvB6kfyP-f3diK6Rq3tOrRb3w (Caused by SSLError(SSLWantWriteError(3, 'The operation did not complete (write) (_ssl.c:2474)')))