# 1. ML Project 2

# 1.1 Introduction

The objective of this project is to recommend movies to users.

# 1.2 Agenda

1. Data Set Selection
2. EDA
3. Demographic filtering
4. Content-based recommenders
5. Collaborative filtering 
6. AutoML

# 1.3 Team Members

1. Eden Zere
2. Essey Abraham Tezare
3. Mario Arismendi Matos

# 2 1. Data Set Selection And EDA

# 2.1 1.1 Import libraries

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
%matplotlib inline

from datetime import datetime
import datetime
import wordcloud as wc
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import os
# Flag read by the plotting helpers below: when True, figures are also written
# to disk. It is defined before the loop so that it exists even when no input
# files are found.
saving = False

# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



# 2.2 1.2 Reading the data

In [None]:
# Load the two TMDB CSV files: df1 holds the credits, df2 the movie metadata.
df1=pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')
df2=pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')

The first dataset contains the following features:-

* movie_id - A unique identifier for each movie.
* cast - The name of lead and supporting actors.
* crew - The name of Director, Editor, Composer, Writer etc.

The second dataset has the following features:- 

* budget - The budget in which the movie was made.
* genre - The genre of the movie, Action, Comedy ,Thriller etc.
* homepage - A link to the homepage of the movie.
* id - This is in fact the movie_id as in the first dataset.
* keywords - The keywords or tags related to the movie.
* original_language - The language in which the movie was made.
* original_title - The title of the movie before translation or adaptation.
* overview - A brief description of the movie.
* popularity - A numeric quantity specifying the movie popularity.
* production_companies - The production house of the movie.
* production_countries - The country in which it was produced.
* release_date - The date on which it was released.
* revenue - The worldwide revenue generated by the movie.
* runtime - The running time of the movie in minutes.
* status - "Released" or "Rumored".
* tagline - Movie's tagline.
* title - Title of the movie.
* vote_average - The average rating the movie received.
* vote_count - The number of votes the movie received.




Let's join the two dataset on the 'id' column

In [None]:
# Prepare the credits table for the join: rename its key column to `id`.
df1=df1.rename({'movie_id': 'id'},axis=1)
# Relabel all four credits columns by position (this also covers the rename
# above). `title2` presumably holds the credits title, renamed so it does not
# collide with df2's own `title` column -- TODO confirm the CSV column order.
df1.columns = ['id','title2','cast','crew']
# Join the credits onto the movie metadata on `id` (inner join by default).
df2= df2.merge(df1,on='id')

# 2.2 1.3 Training Data Info

In [None]:
df2.info()

Just a peek at our data.

In [None]:
df2.drop(columns=['title2'],inplace=True)
df2.head(5)

# 2.2 1.4 Checking for Null Data

In [None]:
df2.isnull()

In [None]:
sns.heatmap(df2.isnull(),yticklabels=False,cbar=False,cmap='viridis')

As we can see here, there are a lot of null values in the homepage and tagline columns.

# 2.3 1.5 Distinct values

In [None]:
df2.select_dtypes('object').nunique()

# 2.4 1.6  Checking Distribution 

In [None]:
# Distribution plots for six numeric features, laid out on a 2x3 grid.
# Each entry is (column, cast_to_int): the last three columns have their
# missing values replaced by 0 and are truncated to integers before plotting.
plt.figure(figsize=(25, 6))

panels = [
    ('revenue', False),
    ('vote_count', False),
    ('budget', False),
    ('vote_average', True),
    ('runtime', True),
    ('popularity', True),
]

for position, (column_name, cast_to_int) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    values = df2[column_name]
    if cast_to_int:
        values = values.fillna(0).astype(int)
    sns.distplot(values)

plt.suptitle('Checking for Skewness', fontsize=15)
plt.show()

# 2.5 1.7 Revenue vs Movies

In [None]:
# Horizontal bar chart of the six movies with the largest revenue.
import matplotlib.pyplot as plt

pop = df2.sort_values(by='revenue', ascending=False)
top6 = pop.head(6)

plt.figure(figsize=(12, 4))
plt.barh(top6['title'], top6['revenue'], align='center', color='skyblue')
# Flip the y axis so the first (largest) bar is drawn at the top.
plt.gca().invert_yaxis()
plt.xlabel("revenue")
plt.title("revenue Movies")


as above we can tell that Avatar has the highest revenue

# 2.6 1.8 Popularity vs Movies

In [None]:
# Horizontal bar chart of the six movies with the highest popularity score.
import matplotlib.pyplot as plt

pop = df2.sort_values(by='popularity', ascending=False)
top6 = pop.head(6)

plt.figure(figsize=(12, 4))
plt.barh(top6['title'], top6['popularity'], align='center', color='skyblue')
# Flip the y axis so the first (largest) bar is drawn at the top.
plt.gca().invert_yaxis()
plt.xlabel("Popularity")
plt.title("Popular Movies")


As above we can see that Minions is the most popular movie 

# 2.7 1.9 Languages vs Movies

In [None]:
# `movies` is an alias of df2 (not a copy): the parsed column is written back
# to df2 as well.
movies = df2


def _language_names(raw):
    """Return the list of language names stored in one `spoken_languages` cell.

    String cells are parsed with literal_eval; cells that are already parsed
    are used as they are, so re-running this cell does not fail.
    """
    parsed = literal_eval(raw) if isinstance(raw, str) else raw
    return [entry['name'] for entry in parsed] if isinstance(parsed, list) else []


movies['spoken_languages'] = movies['spoken_languages'].fillna('[]').apply(_language_names)

# One row per (movie, language) pair. Movies with an empty list produce NaN,
# which is dropped. The index still refers to the movie row.
s = movies['spoken_languages'].explode().dropna()
s.name = 'spoken_languages_count'

# Number of movies per language, most frequent first. The index carries the
# language name and is deliberately left unnamed.
language_counts = s.value_counts()
con_df = pd.DataFrame(
    {
        'num_spoken_language': language_counts.values,
        'spoken_language': language_counts.index.values,
    },
    index=pd.Index(language_counts.index.values),
)



In [None]:
# Replace the language-name index with a plain 0..n-1 index. `drop=True`
# discards the old index directly, so this also works when the index is named
# (a named index would not produce an 'index' column to drop afterwards).
con_df = con_df.reset_index(drop=True)
con_df.head(100)

Among the spoken languages, English is the most frequent, with a count of 4485.

In [None]:
con_df = con_df[:5]

fig = plt.figure(figsize=(12,7))
sns.barplot(data = con_df, x='spoken_language', y = 'num_spoken_language')

plt.tight_layout()

The chart is telling us the highest language is English

# 2.8 2.0 Countries Vs Movies

In [None]:
# `movies` is an alias of df2 (not a copy): the parsed column is written back
# to df2 as well.
movies = df2


def _country_names(raw):
    """Return the list of country names stored in one `production_countries` cell.

    String cells are parsed with literal_eval; cells that are already parsed
    are used as they are, so re-running this cell does not fail.
    """
    parsed = literal_eval(raw) if isinstance(raw, str) else raw
    return [entry['name'] for entry in parsed] if isinstance(parsed, list) else []


movies['production_countries'] = movies['production_countries'].fillna('[]').apply(_country_names)

# One row per (movie, country) pair. Movies with an empty list produce NaN,
# which is dropped. The index still refers to the movie row.
s = movies['production_countries'].explode().dropna()
s.name = 'countries'

In [None]:
# Number of movies per production country, most frequent first. `s` holds one
# entry per (movie, country) pair, so counting it directly gives the same
# numbers as joining it back onto `movies` first.
country_counts = s.value_counts()

# Build the table with a plain 0..n-1 index instead of resetting and dropping
# an 'index' column, which fails when the counts index is named.
con_df = pd.DataFrame({
    'num_movies': country_counts.values,
    'country': country_counts.index.values,
})
con_df.head(20)


The table shows us most movies are made in USA

In [None]:
# Overwrite the USA count with 700 so that the colour scale of the map below
# leaves visible contrast between the other countries (see the map title).
con_df.loc[con_df.country == 'United States of America', 'num_movies'] = 700
con_df.head(20)
# Save the (capped) per-country counts to a CSV file in the working directory.
con_df.to_csv('mycsvfile.csv')

In [None]:
# Choropleth trace: one entry per production country, coloured by the number
# of movies (white = fewest, magenta = most).
data = [ dict(
        type = 'choropleth',
        locations = con_df['country'],
        # Countries are matched by their English names, not ISO codes
        locationmode = 'country names',
        z = con_df['num_movies'],
        text = con_df['country'],
        colorscale = [[0,'rgb(255, 255, 255)'],[1,'rgb(255, 0,255)']],
        autocolorscale = False,
        reversescale = False,
        marker = dict(
            line = dict (
                color = 'rgb(0,0,0)',
                width = 0.5
            ) ),
        colorbar = dict(
            autotick = False,
            tickprefix = '',
            title = 'Production Countries'),
      ) ]

# Map layout: no frame or coastlines.
layout = dict(
    title = 'Production Countries for the Movies (USA is being 700+ to be apple to watch other countries)',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        projection = dict(
            type = 'Mercator'
        )
    )
)

fig = dict( data=data, layout=layout )
# NOTE(review): validate=False skips plotly's property validation, so
# `autotick` and the capitalised 'Mercator' are passed through unchecked --
# confirm they are still honoured by the installed plotly version.
py.iplot( fig, validate=False, filename='d3-world-map' )

# 2.9 2.1 Years Vs Movies

In [None]:
# helper functions to deal with multi-hot features
def _parse_multi(value):
    """Return the list of dicts stored in one multi-hot cell.

    Cells are normally string literals such as ``'[{"id": 1, "name": "x"}]'``.
    They are parsed with ``literal_eval`` (never ``eval``), so cell content
    cannot execute code. Values that are already parsed are returned unchanged.
    """
    return literal_eval(value) if isinstance(value, str) else value


def group_indices(series, index="id"):
    """Map every distinct ``index`` value to the row positions containing it.

    Args:
        series: pandas Series whose cells are (stringified) lists of dicts.
        index: dict key used to identify an item (e.g. ``"id"`` or ``"name"``).

    Returns:
        dict ``{key value: [row positions]}``; keys appear in first-seen order
        and positions are 0-based offsets into ``series``.

    Raises:
        ValueError / SyntaxError: if a string cell is not a valid literal.
    """
    d = {}
    for position, cell in enumerate(series):
        for item in _parse_multi(cell):
            d.setdefault(item[index], []).append(position)
    return d


def get_groups(series, index="name"):
    """Return the distinct ``index`` values found in ``series`` (unordered)."""
    found = set()
    for cell in series:
        found.update(item[index] for item in _parse_multi(cell))
    return list(found)


def multi_count(series, index="id"):
    """Return ``{key value: number of rows containing it}`` for ``series``."""
    return {k: len(v) for (k, v) in group_indices(series, index).items()}


def expand_multi_feature(df, column, index="id"):
    """Expand a multi-hot column into one row per (row, item) pair.

    Args:
        df: source DataFrame.
        column: name of the multi-hot column to expand.
        index: dict key whose value replaces the cell in the expanded rows.

    Returns:
        A new DataFrame in which ``column`` holds a single item value per row.
        Rows are grouped by item in first-seen order and keep their original
        index labels. When no row contains any item, an empty frame with the
        columns of ``df`` is returned.
    """
    groups = group_indices(df[column], index=index)
    if not groups:
        return df.iloc[:0].copy()

    # Collect the pieces and concatenate once: appending to a growing frame
    # copies it on every iteration, and DataFrame.append no longer exists in
    # recent pandas versions.
    pieces = []
    for name, positions in groups.items():
        rows = df.iloc[positions].copy()
        rows[column] = name
        pieces.append(rows)
    return pd.concat(pieces)


def multi_groupby(df1, column, index="id"):
    """Group ``df1`` by the individual items of the multi-hot ``column``."""
    return expand_multi_feature(df1, column, index).groupby(column)

In [None]:
# numbers of movies released in each decade
def count_pie(series,filename):
    """Draw a pie chart of the relative frequency of each value in ``series``.

    Slices below 1% get neither a text label nor a percentage. The figure is
    also saved to ``filename`` (150 dpi) when the global ``saving`` flag is
    truthy.
    """
    shares = series.value_counts()
    shares = shares / shares.sum()

    def _percent_text(pct):
        # pct is the slice size in percent, as passed by matplotlib
        return '{:1.0f}%'.format(pct) if pct > 1 else ''

    slice_labels = [str(value) if not share < 0.01 else '' for value, share in shares.items()]
    # Shares are fractions, so the `< 100` test always holds and every slice
    # is offset by 0.02.
    offsets = [0.02 if share < 100 else 0.001 for share in shares]

    plt.subplots(figsize=(8, 8))
    plt.pie(shares, labels=slice_labels, autopct=_percent_text, explode=offsets)
    if saving:
        plt.savefig(filename, dpi=150)
    plt.show()

def count_decade_pie(df,filename):
    """Plot the share of movies released in each decade.

    Args:
        df: DataFrame with a ``release_date`` column of strings that start
            with a four-digit year; missing dates are ignored.
        filename: target path handed to ``count_pie`` for optional saving.
    """
    # "1994-07-06" -> 1994 -> 1990 -> "1990s"
    decades = df.release_date.dropna().apply(lambda x: str(int(x[:4]) // 10 * 10) + 's')
    count_pie(decades, filename)

count_decade_pie(df2,filename="pie_decade.png")

The pie chart tells us that a lot of movies were released in the 2000s.

# 2.1.1 2.2 Genres

In [None]:
# wordcloud of genres and keywords
def multi_wordcloud(series,filename):
    """Render a word cloud of the item names found in a multi-hot ``series``.

    Word sizes follow the counts returned by ``multi_count(series, "name")``.
    The image is also written to ``filename`` when the global ``saving`` flag
    is truthy.
    """
    cloud = wc.WordCloud(
        background_color="white",
        margin=20,
        width=800,
        height=600,
        prefer_horizontal=0.7,
        max_words=50,
        scale=2,
    )
    frequencies = multi_count(series, "name")
    cloud.generate_from_frequencies(frequencies)
    if saving:
        cloud.to_file(filename)

    plt.subplots(figsize=(16, 8))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()

multi_wordcloud(df2.genres,filename="wordcloud_genres.png")
multi_wordcloud(df2.keywords,filename="wordcloud_genres2.png")

The word clouds above tell us that most movies are Drama, Comedy and Thriller, and that the most repeated keyword is "independent".

# 2.1.2 2.3 Popularity vs Genres

In [None]:
# distribution of popularity and runtime groupby genres
def plotby_box(df,x,y,filename,yscale="linear"):
    """Boxen plot of ``y`` for every item of the multi-hot column ``x``.

    Args:
        df: source DataFrame; zeros are replaced by NaN before plotting.
        x: multi-hot column, expanded by item name for the x axis.
        y: numeric column shown on the y axis.
        filename: path used when the global ``saving`` flag is truthy.
        yscale: matplotlib y-axis scale (e.g. "linear" or "log").
    """
    sns.set(style="whitegrid")
    cleaned = df.replace(0, np.nan).copy()

    plt.subplots(figsize=(20, 10))
    long_form = expand_multi_feature(cleaned, x, "name")
    sns.boxenplot(data=long_form, x=x, y=y)

    plt.yscale(yscale)
    plt.yticks(fontsize=20)
    plt.xticks(rotation=55, fontsize=20)
    plt.xlabel(x, fontsize=30)
    plt.ylabel(y, fontsize=30)

    if saving:
        plt.savefig(filename, bbox_inches="tight", dpi=150)
    plt.show()
    
plotby_box(df2,"genres","popularity",yscale="log",filename="genres_popularity.png")

# 2.1.3 2.4 Vote_Average vs Genres

In [None]:
def plotby_bar(df,x,y,filename):
    """Bar plot of ``y`` for every item of the multi-hot column ``x``.

    Args:
        df: source DataFrame; zeros are replaced by NaN before plotting.
        x: multi-hot column, expanded by item name for the x axis.
        y: numeric column aggregated by seaborn for the bar heights.
        filename: path used when the global ``saving`` flag is truthy.
    """
    sns.set(style="whitegrid")
    cleaned = df.replace(0, np.nan).copy()

    plt.subplots(figsize=(20, 10))
    long_form = expand_multi_feature(cleaned, x, "name")
    sns.barplot(data=long_form, x=x, y=y)

    plt.yticks(fontsize=20)
    plt.xticks(rotation=55, fontsize=20)
    plt.xlabel(x, fontsize=30)
    plt.ylabel(y, fontsize=30)

    if saving:
        plt.savefig(filename, bbox_inches="tight", dpi=150)
    plt.show()
    
plotby_bar(df2,"genres","vote_average",filename="genres_vote.png")


The genres with the highest average vote are War, History and Documentary.

# 2.1.4 2.5 Rate vs Movies

In [None]:
# Keep only the ratings whose movieId matches an id in the movie metadata.
# NOTE(review): confirm that `movieId` in ratings_small.csv and `id` in df2
# come from the same id space; otherwise this join pairs unrelated movies.
ratings = pd.read_csv('../input/the-movies-dataset/ratings_small.csv')
ratings_df = ratings.merge(df2[['id']], how='inner', left_on='movieId', right_on='id')

# time_dt: the rating timestamp (seconds since the epoch) as a local datetime
ratings_df['time_dt'] = ratings_df['timestamp'].map(datetime.datetime.fromtimestamp)
# year: calendar year in which the rating was given
ratings_df['year'] = ratings_df['time_dt'].dt.year

In [None]:
dt = ratings_df.groupby(['year'])['rating'].mean().reset_index()
fig, (ax) = plt.subplots(ncols=1, figsize=(12,5))
plt.plot(dt['year'],dt['rating']);
plt.xlabel('Year');
plt.ylabel('Average ratings');
plt.title('Average ratings per year')
plt.show()

We can tell that the average rating is highest in 2015.

# 2.1.3 2.4 Correlation Matrix

In [None]:
plt.figure(figsize=(10,7))
plt.title('Correlation Matrix')
# Correlate the numeric (and boolean) columns only. df2 also contains text and
# list columns, on which DataFrame.corr() raises in recent pandas versions.
numeric_columns = df2.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(), annot=True)
plt.show()

The matrix shows that popularity and vote_count, and revenue and vote_count, have the highest correlation, which is 0.78.

# 3.1 Demographic Filtering

The most basic form of a recommendation engine would be where the engine recommends the most popular items to all the users. That would be generalized as everyone would be getting similar recommendations as we didn’t personalize the recommendations.

In [None]:
df2.head()
df2.columns

In [None]:
C = df2['vote_average'].mean()
m = df2['vote_count'].quantile(0.9)
C, m

This means the mean vote_average is 6.09, and we will only consider movies with a vote count of at least 1838.4 (the 90th percentile of vote_count).

In [None]:
# Filter out movies that don't have 90 % of vote count
q_movies = df2.copy().loc[df2['vote_count'] >= m]
q_movies.shape

There are 481 movies whose vote count is at or above the 90th percentile.

In [None]:
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own average vote with the global mean vote.

    Args:
        x: row (or mapping) with ``vote_count`` and ``vote_average``.
        m: vote-count threshold; the larger it is, the more weight ``C`` gets.
        C: mean vote that the score is pulled towards for movies with few votes.

    Returns:
        ``v/(v+m) * R + m/(m+v) * C`` with ``v`` the vote count and ``R`` the
        movie's average vote.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_part = votes / (votes + m) * rating
    mean_part = m / (m + votes) * C
    return own_part + mean_part

q_movies['score'] = q_movies.apply(weighted_rating, axis=1)
q_movies = q_movies.sort_values('score', ascending=False)

q_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)


In [None]:
# Horizontal bar chart of the six movies with the highest popularity score.
import matplotlib.pyplot as plt

pop = df2.sort_values(by='popularity', ascending=False)
top6 = pop.head(6)

plt.figure(figsize=(12, 4))
plt.barh(top6['title'], top6['popularity'], align='center')
# Flip the y axis so the first (largest) bar is drawn at the top.
plt.gca().invert_yaxis()
plt.xlabel('Popularity')
plt.title('Popular Movies')

# 4.1 Content Based

In this technique, the users are recommended the similar content which they have used/watched/liked the most before.

TfidfVectorizer turns each overview into a vector with one column per word, weighted by TF-IDF, and drops common English stop words such as "the".

In [None]:
df2['overview'].head(5)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words='english')

df2['overview'] = df2['overview'].fillna('')

tfidf_matrix = tfidf.fit_transform(df2['overview'])

tfidf_matrix.shape

cosine_sim = linear_kernel(tfidf_matrix)

In [None]:
# Trying cosine similarity on a small toy corpus

documents = [
    'alpine snow winter boots.',
    'snow winter jacket.',
    'active swimming briefs',
    'active running shorts',
    'alpine winter gloves'
]

cntvt = CountVectorizer(stop_words='english')

# The toy bag-of-words matrix gets its own name so that `tfidf_matrix` (the
# TF-IDF matrix of the movie overviews) is not overwritten by this demo.
demo_matrix = cntvt.fit_transform(documents)
# Vocabulary terms in column order. They are read from `vocabulary_`, which is
# available in every scikit-learn version (`get_feature_names` was removed).
sorted(cntvt.vocabulary_, key=cntvt.vocabulary_.get)
demo_matrix.todense()

# Pairwise cosine similarity between the five toy documents
cos_sim = cosine_similarity(demo_matrix)
cos_sim

cosine_similarity measures how similar two documents are as the cosine of the angle between their word-count vectors. As shown above, 'alpine snow winter boots' is similar to 'snow winter jacket' because both contain 'snow' and 'winter', which gives a score of 0.577.

In [None]:
indices = pd.Series(df2.index, index=df2['title']).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to ``title``.

    Args:
        title: movie title; must be a label of the global ``indices`` Series.
        cosine_sim: square similarity matrix whose rows and columns follow the
            row order of ``df2``.

    Returns:
        pandas Series with up to 10 titles, most similar first. The queried
        movie itself is never part of the result.

    Raises:
        KeyError: if ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # When several movies share a title the lookup returns a Series of
    # positions. Use the first match so that `idx` is always a scalar.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried movie by position instead of assuming it sorts
    # first: ties (or an all-zero similarity row) can rank another movie
    # ahead of it.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [i for i, _ in sim_scores[:10]]
    return df2['title'].iloc[movie_indices]

idx = indices["The Dark Knight Rises"]
df2['title'].iloc[[i[0] for i in (sorted(list(enumerate(cosine_sim[idx])), key=lambda x: x[1], reverse=True)[1:11])]]

using cosine_similarity with overview for the movie 'The Dark Knight Rises' we got this result

In [None]:
get_recommendations('The Dark Knight Rises')

In [None]:
get_recommendations('The Avengers')

In [None]:
# literal_eval safely parses a string containing a Python literal into the
# corresponding Python object; here it is applied to the stringified
# feature columns listed below.
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']

# Peek at the raw (still stringified) values of the first movie
df2['cast'][0]
df2['crew'][0]
df2['keywords'][0]
df2['genres'][0]

# Replace each stringified column by its parsed Python objects
for feature in features:
    df2[feature] = df2[feature].apply(literal_eval)

In [None]:
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    ``x`` is an iterable of dicts with ``job`` and ``name`` keys; ``np.nan``
    is returned when no director is listed.
    """
    directors = (member['name'] for member in x if member['job'] == 'Director')
    return next(directors, np.nan)

In [None]:
# return top 3
def get_list(x):
    """Return the names of the first three entries of ``x``.

    ``x`` is expected to be a list of dicts with a ``name`` key; any value
    that is not a list yields an empty list.
    """
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep at most three of them.
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

In [None]:
df2['director'] = df2['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(get_list)
    
df2[['title', 'cast', 'director', 'keywords', 'genres']].head(5)

In [None]:
#data cleaning and prepa

def clean_data(x):
    """Lower-case a name (or list of names) and strip its spaces.

    Lists are cleaned element by element, strings directly; any other value
    (e.g. NaN for a missing director) becomes an empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

In [None]:
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df2[feature] = df2[feature].apply(clean_data)

Joining the keywords, cast, director and genres

In [None]:
# create "soup" for the vectorization used to compute the cosine similarity matrix

def create_soup(x):
    """Join keywords, cast, director and genres of a row into one string.

    The four parts are separated by single spaces, in that order; list-valued
    parts are themselves space-joined.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)
df2['soup'] = df2.apply(create_soup, axis=1)

In [None]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
df2 = df2.reset_index()
indices = pd.Series(df2.index, index=df2['title'])

In [None]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

Using cosine_similarity with keywords, genres, director and cast, we get the movie recommendations for 'The Dark Knight Rises' shown above, which is a better result.

In [None]:
get_recommendations('The Godfather', cosine_sim2)

# 5.1 Collaborative Filtering

In collaborative filtering, two entities collaborate to deduce recommendations on the basis of certain similarities between them. These filtering techniques are broadly of two types:

1.User Based Collaborative Filtering: In user based collaborative filtering, we find out the similarity score between the two users. On the basis of similarity score, we recommend the items bought/liked by one user to other user assuming that he might like these items on the basis of similarity. This will be more clear when we go ahead and implement this. Major online streaming service, Netflix have their recommendation engine based on user based collaborative filtering.

2.Item Based Collaborative Filtering: In item based collaborative filtering, the similarity of an item is calculated with the existing item being consumed by the existing users. Then on the basis of amount of similarity, we can say that if user X likes item A and a new item P is most similar to item A then it highly makes sense for us to recommend item P to user X.

In [None]:
# User-User, Item-Item Collaborative filtering
from surprise import Reader, Dataset, SVD #, cross_validate #evaluate
from surprise.model_selection import cross_validate, KFold
reader=Reader(rating_scale=(1,5))

read user rating file

In [None]:
#read the user rating file (subset file to improve processing time)
ratings=pd.read_csv('../input/the-movies-dataset/ratings_small.csv')
ratings.head()

We used SVD (Singular Value Decomposition).
SVD is used here as a collaborative filtering technique. It works on a matrix in which each row represents a user and each column represents an item; the elements of this matrix are the ratings given to items by users.

Root-Mean-Square Error (RMSE) was used for evaluation and is
defined as follows:

$$\mathrm{RMSE} = \sqrt{\frac{1}{N}\sum_{i=1}^{N}\left(r_i - \hat{r}_i\right)^2}$$

where $r_i$ are the true ratings, $\hat{r}_i$ the predicted ratings, and $N$ the number of ratings.

RMSE was the metric used to judge entries in the Netflix challenge. The lower the RMSE was on Netflix’s quiz set between the submitted rating predictions and the actual ratings, the better the method was.
so we are using RMSE on our recommendation system

In [None]:
# Build a Surprise dataset from the (user, item, rating) columns
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd= SVD()

# Run 5-fold cross-validation and print results
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

From the above we got a mean RMSE of 0.89, which is good.

In [None]:
#create a training set for svd
trainset = data.build_full_trainset()
svd.fit(trainset)

In [None]:
#getting all userId =1 with the rattings
ratings[ratings['userId'] == 1]

In [None]:
str(svd.predict(1, 302).est)

For the movie with id 302, we get an estimated rating of 2.85 out of 5 for the user with id 1.

# 6.1 AutoMl

In [None]:
try:  # SciPy >= 0.19
    from scipy.special import comb, logsumexp
except ImportError:
    from scipy.misc import comb, logsumexp  # noqa 
!pip install auto-sklearn
!apt-get remove swig 
!apt-get install swig3.0 build-essential -y
!ln -s /usr/bin/swig3.0 /usr/bin/swig
!apt-get install build-essential
!pip install --upgrade setuptools
# !pip install sklearn

import autosklearn.classification
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
import os  
import autosklearn.regression
from sklearn.model_selection import train_test_split

In [None]:
# Predict the integer-truncated vote_average from budget and revenue.
# Rows missing any of the three columns are dropped: a NaN target cannot be
# cast to int below.
movies = df2.dropna(subset=['vote_average', 'budget', 'revenue'], how='any')
X = movies[['budget', 'revenue']].values
# Truncating the rating to an integer turns the task into classification
y = movies['vote_average'].values.astype(int)

X_train, X_test, y_train, y_test = \
    sklearn.model_selection.train_test_split(X, y, random_state=1)
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    tmp_folder='/tmp/autosklearn_cv_example_tto2',
    output_folder='/tmp/autosklearn_cv_example_oto22',
    delete_tmp_folder_after_terminate=False,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
)

# fit() changes the data in place, but refit needs the original data. We
# therefore copy the data. In practice, one should reload the data
automl.fit(X_train.copy(), y_train.copy(), dataset_name='movie_recommendation')
# During fit(), models are fit on individual cross-validation folds. To use
# all available data, we call refit() which trains all models in the
# final ensemble on the whole dataset.
automl.refit(X_train.copy(), y_train.copy())

print(automl.show_models())

In [None]:
predictions = automl.predict(X_test)
print("Accuracy as per AutoML: ", sklearn.metrics.accuracy_score(y_test, predictions))

# 7.1 References

* https://www.kaggle.com/sjj118/movie-visualization-recommendation-prediction
* https://www.kaggle.com/rounakbanik/movie-recommender-systems
* https://www.kaggle.com/ibtesama/getting-started-with-a-movie-recommendation-system
* https://surprise.readthedocs.io/en/stable/getting_started.html
* https://medium.com/@gracy.f/automl-for-python-on-windows-314ca8ea6955