In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [None]:
# Read the three columns used from the full ratings file, then keep the first
# million rows as the working set and display a random sample of it.
df_ratings_all = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv', usecols=['userId', 'movieId', 'rating'])
# .copy() decouples the working set from df_ratings_all, whose movieId column
# is overwritten in place (cast to str) in the hybrid section further down.
df_ratings = df_ratings_all.iloc[:1000000].copy()
df_ratings.sample(10)

In [None]:
# Sanity checks on the working set.
# missing_data    -> number of COLUMNS that contain at least one null value
# duplicated_data -> number of rows repeating an earlier (userId, movieId, rating)
# The vectorised .sum() avoids iterating over the rows in Python.
missing_data = df_ratings.isnull().any().sum()
duplicated_data = df_ratings.duplicated(['userId', 'movieId', 'rating']).sum()
print("Missing data: {} and duplicated data: {}".format(missing_data, duplicated_data))
df_ratings.dtypes

In [None]:
# Split the dataset into 80% train and 20% test.
# A fixed random_state makes the split (and the plots built on it) reproducible
# across kernel restarts.
df_train = df_ratings.sample(frac=0.8, random_state=42)
# Everything not sampled into the training set becomes the test set.
df_test = df_ratings.drop(df_train.index)
print("Train data: {}, test data: {}".format(len(df_train), len(df_test)))

In [None]:
# Bar chart of how many training ratings fall on each rating value.
fig, ax = plt.subplots()
# Pass the data by keyword and draw on the axes created above: recent seaborn
# releases no longer accept the data vector as a positional argument.
sns.countplot(x=df_train.rating, ax=ax)
ax.set_title('Distribution of ratings over training dataset', fontsize=10)
# The axis shows raw counts, not millions.
ax.set_ylabel('No. of Ratings')
plt.show()

In [None]:
# Scatter the first 100 training ratings: one point per (movie, user) pair,
# coloured by the rating the user gave.
first_100 = df_train.head(100)
x, y = first_100.movieId, first_100.userId

plt.scatter(x, y, c=first_100.rating, alpha=0.5)
plt.colorbar()
plt.xlabel("Movie IDs")
plt.ylabel("User IDs")
plt.show()

In [None]:
# Example inputs used when demonstrating each recommender below.
CONSTANTS_RECOMENDATION = dict(
    USER_ID_MODEL=1,
    MOVIE_ID_MODEL=260,
    ITEM_MOVIE="Rocky III",
    CONTENT_MOVIE="Toy Story",
)
CONSTANTS_RECOMENDATION["CONTENT_MOVIE"]

# **Collaborative Filtering Model**

In [None]:
from surprise import Reader, Dataset,SVD
from surprise.model_selection import cross_validate
# Describe the rating scale (0.5 to 5.0) and wrap the working ratings frame in
# a Surprise Dataset. The columns are passed in user, item, rating order.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
data


In [None]:
svd_benchmark = []

# Collaborative-filtering model shared by the rest of the notebook.
algorithm = SVD(n_epochs=15, lr_all=0.01)  # epc=15 ,lr=0.01test_rmse 0.867418

# Perform 5-fold cross validation, reporting RMSE and MAE per fold
results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Average every reported metric over the folds, then append the algorithm name.
tmp = pd.DataFrame.from_dict(results).mean(axis=0)
# Series.append was removed in pandas 2.0; pd.concat gives the same result on
# old and new pandas alike.
tmp = pd.concat([tmp, pd.Series([type(algorithm).__name__])])
svd_benchmark.append(tmp)
print(svd_benchmark)

In [None]:
from surprise.model_selection import train_test_split
from surprise import accuracy
# Hold out 25% of the ratings, fit the SVD model on the rest and report RMSE
# on the held-out part. The fixed random_state pins which ratings are held out;
# the model's own random initialisation is not seeded here.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)
algorithm.fit(trainset)
predictions = algorithm.test(testset)
accuracy.rmse(predictions)

In [None]:
def get_collabration_based_model_recommendations(userId, movieId):
    """Return the rating the fitted `algorithm` estimates for this user/movie pair.

    Args:
        userId: user id passed straight to `algorithm.predict`.
        movieId: movie id passed straight to `algorithm.predict`.

    Returns:
        The `est` attribute of the prediction (the estimated rating).
    """
    prediction = algorithm.predict(userId, movieId)
    return prediction.est

In [None]:
# Estimated rating of the configured example user for the configured example movie.
get_collabration_based_model_recommendations(CONSTANTS_RECOMENDATION["USER_ID_MODEL"],CONSTANTS_RECOMENDATION["MOVIE_ID_MODEL"]) # other movie ids to try: 260 2767 258 2761940 1628

# Content-based Recommendation

In [None]:
# Read the three files used for the content-based features (credits, movie
# metadata, keywords) and print each file's column dtypes so the type of the
# 'id' join key can be compared across them.
credits = pd.read_csv('/kaggle/input/the-movies-dataset/credits.csv')
print("Type of credit columns: \n", credits.dtypes)
movies = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')
print("Type of movies columns: \n", movies.dtypes)
keywords = pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')
print("Type of keywords columns: \n", keywords.dtypes)

In [None]:
# Merge the three files into one dataframe on the movie 'id'.
# The 'id' column of the movies file is of type object, while in the other two
# files it is int64, so cast those two to string before merging.
keywords['id'] = keywords['id'].astype('str')
credits['id'] = credits['id'].astype('str')
df_merge = (credits.merge(movies,on='id')).merge(keywords,on='id')
# Work on the first 10,000 merged rows only.
df = df_merge[:10000]
df.dtypes


In [None]:
# Keep only the columns used to build the content features and drop rows with
# a missing value in any of them. The two prints show the number of columns
# containing nulls before and after the drop.
df = df[['title','cast', 'crew', 'genres','keywords','original_language']]
print(df.isnull().any().sum())
# Build a new frame instead of dropna(inplace=True) on a derived slice, and
# reset the index so row labels equal row positions: the recommenders below
# look movies up by position in the similarity matrix.
df = df.dropna().reset_index(drop=True)
print(df.isnull().any().sum())
df.head()

In [None]:
# The data may contain inconsistent upper/lower case, accents and special characters.
# Normalise it with Unicode decomposition and a regular expression.
import unicodedata
import re
def clean_data(value):
    """Normalise a name or title into a lowercase, underscore-separated ASCII token.

    Accents are stripped (NFD decomposition, then non-ASCII characters are
    dropped), surrounding whitespace is removed, every run of characters other
    than letters and digits becomes a single underscore, and the result is
    lowercased.

    Args:
        value: the text to normalise. None and float NaN (how pandas represents
            a missing string) are accepted; any other non-string is converted
            with str() first.

    Returns:
        The normalised token, or a single space " " when the input is missing
        or consists only of separator characters.
    """
    # Missing values carry no text; treat them like an all-punctuation string
    # instead of letting unicodedata.normalize raise a TypeError.
    if value is None or (isinstance(value, float) and value != value):
        return " "
    if not isinstance(value, str):
        value = str(value)

    ascii_text = unicodedata.normalize('NFD', value).encode('ascii', 'ignore').decode('ascii')
    # Spaces are non-alphanumeric too, so the regex alone already turns them
    # into underscores; no separate replace(" ", "_") is needed.
    token = re.sub(r"[^a-zA-Z0-9]+", '_', ascii_text.strip()).lower()
    if token != '_':
        return token
    return " "

In [None]:
# Look at the cast column before it is parsed in the next cell.
df.cast

In [None]:
import ast
# Parse each stringified cast list and keep only the cleaned actor names.
df['cast'] = df['cast'].apply(
    lambda raw: [clean_data(person['name']) for person in ast.literal_eval(raw)]
)
df.cast

In [None]:
# Show the titles as loaded, then replace them with their cleaned form.
print(df.title)
df['title'] = df['title'].map(clean_data)
df.title

In [None]:
# Show the raw genres, then parse each stringified list and keep the cleaned genre names.
print(df.genres)
df['genres'] = df['genres'].apply(
    lambda raw: [clean_data(entry['name']) for entry in ast.literal_eval(raw)]
)
df.genres

In [None]:
# Show the raw keywords, then parse each stringified list and keep the cleaned
# names of at most the first five keywords per movie.
print(df.keywords)
df['keywords'] = df['keywords'].apply(
    lambda raw: [clean_data(entry['name']) for entry in ast.literal_eval(raw)[:5]]
)
df.keywords

In [None]:
# Crew members other than the director may mislead the model, so keep only the
# cleaned names of entries whose job is "Director".
print(df.crew)
df['crew'] = df['crew'].apply(
    lambda raw: [
        clean_data(person['name'])
        for person in ast.literal_eval(raw)
        if person["job"] == "Director"
    ]
)
df.crew

In [None]:
print(df.original_language)
# original_language is left unchanged; the cell output below is the cleaned title column.
df.title

In [None]:
# Load the IMDB review train/test files used by the Naive Bayes cells further down.
test_csv = pd.read_csv('../input/imdb-movie-reviews-dataset/test_data (1).csv') # path to file
train_csv = pd.read_csv('../input/imdb-movie-reviews-dataset/train_data (1).csv') # path to file

# Collect every token of a movie (cast, keywords, genres, directors) into one
# space-separated string and append the cleaned title: the "bag of words".
df['bag_of_words'] = (
    df['cast'] + df['keywords'] + df['genres'] + df['crew']
).apply(' '.join) + " " + df['title']
df.bag_of_words.head()


 ## Extracting the Keywords from the Bag of Words using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Learn the vocabulary of the bags of words and turn every movie into a TF-IDF vector.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df.bag_of_words)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# available and fall back to the old name on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
feature_names

In [None]:
# Compute cosine similarity between all movie descriptions
similarity = cosine_similarity(tfidf_matrix)
# Remove self-similarity by subtracting 1 from the diagonal only. This gives
# the same matrix as `similarity -= np.eye(n)` without allocating a second
# dense n x n array.
similarity[np.diag_indices_from(similarity)] -= 1

In [None]:
def get_content_based_recommendations(title, n_plot=10):
    """Return the titles most similar to `title` according to `similarity`.

    Args:
        title: movie title in any spelling; it is normalised with `clean_data`
            before being compared with the cleaned titles in `df`.
        n_plot: how many similar titles to return (default 10).

    Returns:
        A list of up to `n_plot` cleaned titles, most similar first, or None
        when no movie in `df` has that title.
    """
    # Work with row POSITIONS, because `similarity` is indexed by position.
    # Masking a reset-index copy of df with a mask that still carries df's own
    # labels (as before) mis-aligns or raises once df's index has gaps.
    matches = np.flatnonzero((df.title == clean_data(title)).to_numpy())
    if len(matches) == 0:
        return None

    # When several rows share the title, use the first one.
    scores = similarity[matches[0]]
    top_positions = np.argsort(scores)[::-1][:n_plot]
    return df.title.iloc[top_positions].tolist()

In [None]:
# Titles most similar to the configured example movie.
get_content_based_recommendations(CONSTANTS_RECOMENDATION["CONTENT_MOVIE"])

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [None]:
# Column '0' holds the review text and column '1' the label
# (1 - positive and 0 - negative), in both the train and the test file.
train_X, train_y = train_csv['0'], train_csv['1']
test_X, test_y = test_csv['0'], test_csv['1']

In [None]:
# Fit the TF-IDF vocabulary on the training reviews only, then reuse that same
# fitted vectorizer to transform the test reviews.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfid = tfidf_vectorizer.fit_transform(train_X)
X_test_tfid = tfidf_vectorizer.transform(test_X)

In [None]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features of the
# training reviews.
naive_bayes_classifier_tfidf = MultinomialNB()
naive_bayes_classifier_tfidf.fit(X=X_train_tfid, y=train_y)

In [None]:
y_pred_tfidf = naive_bayes_classifier_tfidf.predict(X_test_tfid)
# compute the performance measures

print("accuracy of TfIdfVectorizer:")

# target_names are matched to the labels in sorted order. The labels are
# 0 = negative and 1 = positive, so 'Negative' has to come first; the previous
# order swapped the two rows of the report.
print(metrics.classification_report(test_y, y_pred_tfidf,
                                    target_names=['Negative', 'Positive']))

# Item-based Recommendation

In [None]:
from sklearn.decomposition import TruncatedSVD
# Attach movie titles to the small ratings file.
# NOTE(review): this joins the ratings' movieId directly to the metadata 'id';
# confirm that both columns use the same id space.
movies_item = movies
df_ratings_small = pd.read_csv('/kaggle/input/the-movies-dataset/ratings_small.csv')
# The metadata 'id' is a string column, so cast movieId to match before merging.
df_ratings_small['movieId'] = df_ratings_small['movieId'].astype('str')
df_ratings_mer = df_ratings_small.merge(
    movies_item[['title', 'id']], left_on='movieId', right_on='id'
).loc[:, ['userId', 'movieId', 'rating', 'title']]


In [None]:
# Compare the size of the working ratings set with the merged small set.
print(len(df_ratings))
print(len(df_ratings_mer))
# Only the last expression of a cell is displayed, so dtypes and shape have to
# be printed explicitly to show up in the output.
print(df_ratings_mer.dtypes)
print(df_ratings_mer.shape)
df_ratings_mer

In [None]:
# User x title matrix of ratings; pairs with no rating are filled with 0.
rating_cross = df_ratings_mer.pivot_table(values="rating",index="userId",columns="title",fill_value=0)
rating_cross.sample(10)


In [None]:
# Transpose so that each row is a title and each column a user.
train_matrix = rating_cross.T

In [None]:
# Reduce every title row to 12 components with a fixed random_state.
SVD_Truncated = TruncatedSVD(n_components=12, random_state=5)
resultant_matrix = SVD_Truncated.fit_transform(train_matrix)
resultant_matrix.shape

In [None]:
### correlation matrix
# Each row of resultant_matrix is one title, so corr_mat is title x title,
# in the same order as rating_cross.columns.
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape

In [None]:
# Row of the correlation matrix belonging to the configured example movie.
col_idx = rating_cross.columns.get_loc(CONSTANTS_RECOMENDATION["ITEM_MOVIE"])
corr_specific = corr_mat[col_idx]
# Ten titles with the highest correlation to it.
(
    pd.DataFrame({'corr_specific': corr_specific, 'Movies': rating_cross.columns})
    .sort_values('corr_specific', ascending=False)
    .head(10)
)

In [None]:
# Surprise dataset built from the merged small ratings, for cross-validation.
reader = Reader(rating_scale=(0.5, 5.0))
# Despite its name this is Surprise's SVD (19 epochs), not sklearn's TruncatedSVD.
Truncated_svd =  SVD(n_epochs = 19)
data_trun_test = Dataset.load_from_df(df_ratings_mer[['userId', 'movieId', 'rating']], reader)

In [None]:
trun_svd = []

# Perform 5-fold cross validation, reporting RMSE and MAE per fold
result = cross_validate(Truncated_svd, data_trun_test, measures=['RMSE','MAE'], cv=5, verbose=True)

# Average every reported metric over the folds, then append the algorithm name.
trun_svd_summary = pd.DataFrame.from_dict(result).mean(axis=0)
# Series.append was removed in pandas 2.0; pd.concat works on old and new pandas.
trun_svd_summary = pd.concat([trun_svd_summary, pd.Series([type(Truncated_svd).__name__])])
# Store the summary in the list created above. Previously the list was
# overwritten by the Series and the result of the final append was discarded.
trun_svd.append(trun_svd_summary)
print(trun_svd)

# Hybrid System (Content,Model)

In [None]:
# Attach movie titles to the full ratings file. The metadata 'id' is a string
# column, so movieId is cast to str (in place, on df_ratings_all) before merging.
# NOTE(review): this joins the ratings' movieId directly to the metadata 'id';
# confirm that both columns use the same id space.
df_ratings_all['movieId'] = df_ratings_all['movieId'].astype('str')
df_ratings_merge = df_ratings_all.merge(
    movies[['title', 'id']], left_on='movieId', right_on='id'
).loc[:, ['userId', 'movieId', 'rating', 'title']]
df_ratings_merge

In [None]:
# Number of ratings before and after the merge with the movie metadata.
print(len(df_ratings_all))
print(len(df_ratings_merge))

In [None]:
# Clean the titles of the merged ratings.
# NOTE: the assignment below does not copy, so both names refer to the same
# dataframe and df_ratings_merge['title'] ends up cleaned as well. The
# item-based lookup further down compares these titles with the cleaned df.title.
df_ratings_merge_with_different_title = df_ratings_merge
df_ratings_merge_with_different_title['title'] = df_ratings_merge_with_different_title['title'].apply(lambda x: clean_data(x))
print(df_ratings_merge_with_different_title)

In [None]:
def get_content_based_recommendation_for_user(userId, top_n=None):
    """Collect content-based candidates for every movie a user has rated.

    Args:
        userId: id of the user, matched against `df_ratings_merge['userId']`.
        top_n: when given, only the user's `top_n` highest-rated movies are used
            as seeds; the default None uses every rated movie.

    Returns:
        A dataframe with one row per candidate title, carrying the columns of
        `df_ratings_merge_with_different_title` (first row per title). It is
        empty when the user is unknown or none of the rated movies is found by
        the content-based lookup.
    """
    user_ratings = df_ratings_merge[df_ratings_merge['userId'] == userId]
    # sort_values returns a new frame; the previous code discarded the result,
    # so the ratings were never actually ordered.
    highest_ranked_movies = user_ratings.sort_values(by=['rating'], ascending=False)
    if top_n is not None:
        highest_ranked_movies = highest_ranked_movies.head(top_n)

    similar_movies = set()
    for rated_title in highest_ranked_movies['title']:
        current_similar_movies = get_content_based_recommendations(rated_title)
        if current_similar_movies is None:
            continue
        similar_movies.update(current_similar_movies)

    # Build the frame from a sorted list rather than the set itself, so the
    # row order is deterministic.
    result_df = pd.DataFrame(sorted(similar_movies), columns=['title'])

    without_duplicate = df_ratings_merge_with_different_title.drop_duplicates(subset='title', keep='first')
    return result_df.merge(without_duplicate, on='title')

In [None]:
def get_recommendations_hybrid_content_model(userId, top_n=10):
    """Rank a user's content-based candidates by the collaborative model's estimate.

    Args:
        userId: id of the user to recommend for.
        top_n: number of recommendations to return (default 10).

    Returns:
        A dataframe with columns title, movieId and predicted_rating, sorted by
        predicted_rating in descending order and limited to `top_n` rows.
    """
    candidates = get_content_based_recommendation_for_user(userId)[['title', 'movieId']]

    # movieId is stored as a string in the merged ratings; it is converted back
    # to int before being handed to the collaborative model.
    predicted_rating = [
        get_collabration_based_model_recommendations(userId, int(movie_id))
        for movie_id in candidates['movieId']
    ]

    # assign() returns a new frame, avoiding a column write on a slice of
    # another dataframe (SettingWithCopyWarning).
    ranked = (
        candidates
        .assign(predicted_rating=predicted_rating)
        .sort_values(by=['predicted_rating'], ascending=False)
    )
    return ranked[:top_n]

In [None]:
# Hybrid (content + collaborative model) recommendations for user 1.
get_recommendations_hybrid_content_model(1)

# Hybrid System (Item,Model)

In [None]:
# Zero each title's correlation with itself so a movie is not recommended for
# itself. fill_diagonal assigns rather than subtracts, so re-running this cell
# does not keep lowering the diagonal.
np.fill_diagonal(corr_mat, 0)

# corr_mat rows follow rating_cross.columns. Keep a cleaned copy of those
# titles so lookups and results use the same spelling as the cleaned titles in
# the merged ratings.
item_titles_clean = [clean_data(item_title) for item_title in rating_cross.columns]


def get_item_based_recommendations(title, n_plot=10):
    """Return the titles whose rating pattern correlates most with `title`.

    Args:
        title: movie title in any spelling; it is normalised with `clean_data`
            before being compared with the cleaned `rating_cross` titles.
        n_plot: how many correlated titles to return (default 10).

    Returns:
        A list of up to `n_plot` cleaned titles, most correlated first, or None
        when the title is not among the `rating_cross` columns.
    """
    wanted = clean_data(title)
    # The position has to come from the titles behind corr_mat. The previous
    # code searched the content dataframe `df`, whose row order is unrelated
    # to corr_mat.
    matches = [pos for pos, name in enumerate(item_titles_clean) if name == wanted]
    if not matches:
        return None

    # When several titles clean to the same token, use the first one.
    scores = corr_mat[matches[0]]
    top_positions = np.argsort(scores)[::-1][:n_plot]
    return [item_titles_clean[pos] for pos in top_positions]

In [None]:
def get_item_based_recommendation_for_user(userId, top_n=None):
    """Collect item-based candidates for every movie a user has rated.

    Args:
        userId: id of the user, matched against `df_ratings_merge['userId']`.
        top_n: when given, only the user's `top_n` highest-rated movies are used
            as seeds; the default None uses every rated movie.

    Returns:
        A dataframe with one row per candidate title, carrying the columns of
        `df_ratings_merge_with_different_title` (first row per title). It is
        empty when the user is unknown or none of the rated movies is found by
        the item-based lookup.
    """
    user_ratings = df_ratings_merge[df_ratings_merge['userId'] == userId]
    # sort_values returns a new frame; the previous code discarded the result,
    # so the ratings were never actually ordered.
    highest_ranked_movies = user_ratings.sort_values(by=['rating'], ascending=False)
    if top_n is not None:
        highest_ranked_movies = highest_ranked_movies.head(top_n)

    similar_movies = set()
    for rated_title in highest_ranked_movies['title']:
        current_similar_movies = get_item_based_recommendations(rated_title)
        if current_similar_movies is None:
            continue
        similar_movies.update(current_similar_movies)

    # Build the frame from a sorted list rather than the set itself, so the
    # row order is deterministic.
    result_df = pd.DataFrame(sorted(similar_movies), columns=['title'])

    without_duplicate = df_ratings_merge_with_different_title.drop_duplicates(subset='title', keep='first')
    return result_df.merge(without_duplicate, on='title')

In [None]:
def get_recommendations_hybrid_item_model(userId, top_n=10):
    """Rank a user's item-based candidates by the collaborative model's estimate.

    Args:
        userId: id of the user to recommend for.
        top_n: number of recommendations to return (default 10).

    Returns:
        A dataframe with columns title, movieId and predicted_rating, sorted by
        predicted_rating in descending order and limited to `top_n` rows.
    """
    candidates = get_item_based_recommendation_for_user(userId)[['title', 'movieId']]

    # movieId is stored as a string in the merged ratings; it is converted back
    # to int before being handed to the collaborative model.
    predicted_rating = [
        get_collabration_based_model_recommendations(userId, int(movie_id))
        for movie_id in candidates['movieId']
    ]

    # assign() returns a new frame, avoiding a column write on a slice of
    # another dataframe (SettingWithCopyWarning).
    ranked = (
        candidates
        .assign(predicted_rating=predicted_rating)
        .sort_values(by=['predicted_rating'], ascending=False)
    )
    return ranked[:top_n]

In [None]:
# Hybrid (item-based + collaborative model) recommendations for user 1.
get_recommendations_hybrid_item_model(1)