<a href="https://colab.research.google.com/github/shivamchoudhury06/ML_Projects/blob/master/Copy_of_HACKANONS_COLAB_25GB_RAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

# Notebook-wide pandas display settings: no column/row truncation, and floats
# rendered with two decimal places.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

#from fuzzywuzzy import fuzz

# Colab-only step: mount Google Drive at /content/drive; the CSV loaded later
# in this notebook is read from a path under that mount point.
from google.colab import drive
drive.mount('/content/drive')


# Ignore DtypeWarnings from pandas' read_csv
#warnings.filterwarnings('ignore', message="^Columns.*")

Mounted at /content/drive


In [None]:
# Read the movies table from the mounted Drive folder.
movies_csv = "/content/drive/My Drive/home-credit-default-risk/movies.csv"
movie = pd.read_csv(movies_csv)

In [None]:
# Preview the first rows to check which columns were loaded.
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# the function to extract titles
def extract_title(title):
    """Return ``title`` with a trailing "(YYYY)" release-year suffix removed.

    Parameters
    ----------
    title : str
        Raw title, expected to look like "Toy Story (1995)". It should already
        be stripped of surrounding whitespace.

    Returns
    -------
    str
        The title without the year suffix (and without the whitespace that
        separated it from the name). Titles that do not end in a
        parenthesised four-digit year are returned unchanged.
    """
    year = title[-5:-1]
    # Require the full "(dddd)" shape. Looking only at the four characters
    # before the last one would also match titles such as "Se7en 19955" and
    # then chop real characters off the name.
    has_year_suffix = (
        len(title) >= 6
        and title[-6] == '('
        and title[-1] == ')'
        and year.isdecimal()
    )
    if has_year_suffix:
        # Drop "(YYYY)" and any whitespace in front of it, instead of assuming
        # exactly one space precedes the parenthesis.
        return title[:-6].rstrip()
    return title
# the function to extract years
def extract_year(title):
    """Return the release year from a title ending in "(YYYY)".

    Parameters
    ----------
    title : str
        Raw title, expected to look like "Toy Story (1995)". It should already
        be stripped of surrounding whitespace.

    Returns
    -------
    int or float
        The four-digit year as an ``int``, or ``np.nan`` when the title does
        not end in a parenthesised four-digit year.
    """
    year = title[-5:-1]
    # Require the full "(dddd)" shape; for short titles a bare slice can pick
    # up a stray digit (e.g. "A1B") and report it as a year.
    # isdecimal() only accepts characters that int() can parse, unlike
    # isnumeric(), which also accepts e.g. superscript digits.
    has_year_suffix = (
        len(title) >= 6
        and title[-6] == '('
        and title[-1] == ')'
        and year.isdecimal()
    )
    if has_year_suffix:
        return int(year)
    return np.nan
# Change the column name from title to title_year.
# The guard keeps this cell re-runnable: once the derived 'title' column exists,
# renaming it again would create a second 'title_year' column and
# movie['title_year'] would then select a DataFrame instead of a Series.
if 'title_year' not in movie.columns:
    movie.rename(columns={'title': 'title_year'}, inplace=True)
# remove leading and trailing whitespace in title_year
movie['title_year'] = movie['title_year'].apply(lambda x: x.strip())
# create the columns for title and year
movie['title'] = movie['title_year'].apply(extract_title)
movie['year'] = movie['title_year'].apply(extract_year)

In [None]:
# Count the rows whose genres field is the '(no genres listed)' placeholder.
no_genre_mask = movie['genres'] == '(no genres listed)'
r, c = movie.loc[no_genre_mask].shape
print('The number of movies which do not have info about genres:', r)

The number of movies which do not have info about genres: 7080


In [None]:
# remove the movies without genre information and reset the index
# Keep only rows that carry real genre information and renumber them from 0,
# so row labels can double as positions downstream.
has_genres = movie['genres'] != '(no genres listed)'
movies = movie.loc[has_genres].reset_index(drop=True)

In [None]:
# Work on the first 1000 rows of the filtered frame.
# NOTE(review): the variable name says 10000 but only 1000 rows are taken;
# confirm which of the two is intended.
top_10000_movies = movies.head(1000)

# Using the TF-IDF Vectorizer


In [None]:
# tf-idf vectorization of the pipe-delimited genre strings.
# Treat every '|'-separated genre as a single token: the default token pattern
# splits on any non-word character, so a hyphenated genre label (e.g. "Sci-Fi")
# would be broken into two separate terms and weighted twice.
vectorizer = TfidfVectorizer(token_pattern=r'[^|]+')
matrix = vectorizer.fit_transform(top_10000_movies["genres"])


In [None]:
# Content similarity: pairwise dot products between the tf-idf genre vectors.
cosine_similarities = linear_kernel(matrix, matrix)
movie_title = top_10000_movies['title']
# Lookup table: title -> row label in top_10000_movies.
indices = pd.Series(top_10000_movies.index, index=top_10000_movies['title'])
# Titles no longer include the year, so different releases can share a title.
# Keep the first occurrence so indices[title] always yields a single row label
# rather than a Series, which the recommender functions cannot use as a row index.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def content_recommender(title, top_n=9):
    """Return the titles most similar to ``title`` by genre similarity.

    Relies on the notebook-level ``indices`` (title -> row), ``cosine_similarities``
    (square similarity matrix) and ``movie_title`` (titles by row position).

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    top_n : int, default 9
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered by decreasing similarity; ties keep
        their original row order. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the query row explicitly. Dropping "the first sorted entry" is
    # wrong when other movies tie with it (identical genre vectors score the
    # same), because the stable sort can place one of them ahead of the query.
    sim_scores = [
        (row, score)
        for row, score in enumerate(cosine_similarities[idx])
        if row != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [row for row, _ in sim_scores[:top_n]]
    return movie_title.iloc[movie_indices]

In [None]:
# Example query against the tf-idf similarity matrix.
content_recommender("Across the Sea of Time")

Unnamed: 0,title
148,Apollo 13
32,Wings of Courage
76,Nico Icon
97,Heidi Fleiss: Hollywood Madam
106,Catwalk
114,Anne Frank Remembered
126,Jupiter's Wife
132,Sonic Outlaws
134,From the Journals of Jean Seberg


In [None]:
# Look the title up in the full `movie` frame (not the top_10000_movies subset).
movie[movie['title'] == 'Jubal']

Unnamed: 0,movieId,title_year,genres,title,year
9906,33191,Jubal (1956),Western,Jubal,1956.0


In [None]:
# Check whether this title is present in the subset the tf-idf matrix was built from.
top_10000_movies[top_10000_movies['title'] == 'Lion King, The']

Unnamed: 0,movieId,title_year,genres,title,year
359,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,"Lion King, The",1994.0


# Using KNN


In [None]:
# Nearest-neighbour index over the tf-idf genre vectors, using cosine distance.
knn = NearestNeighbors(metric='cosine', n_neighbors=5)
knn.fit(X=matrix)

In [None]:
def get_recommendations(title, indices, n_recommendations=5):
    """Return the titles of the nearest neighbours of ``title`` from the KNN model.

    Relies on the notebook-level ``knn`` (fitted neighbour model), ``matrix``
    (feature rows the model was fitted on) and ``movie_title`` (titles by row
    position).

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    indices : pandas.Series
        Lookup table mapping title -> row of ``matrix``.
    n_recommendations : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    list
        Up to ``n_recommendations`` titles, nearest first. The queried movie
        itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Ask for one extra neighbour because the query row is normally among the
    # results, but never for more rows than the model was fitted on.
    n_query = min(n_recommendations + 1, matrix.shape[0])
    _, neighbor_rows = knn.kneighbors(matrix[idx], n_neighbors=n_query)

    # Filter the query out by row rather than dropping the first hit: rows with
    # identical features are all at distance 0 and may be returned ahead of it.
    neighbors = [row for row in neighbor_rows.flatten() if row != idx]
    # Positional lookup, matching how content_recommender indexes movie_title.
    return [movie_title.iloc[row] for row in neighbors[:n_recommendations]]

In [None]:
# Same example title, answered by the KNN model.
get_recommendations('Across the Sea of Time', indices)

['Apollo 13',
 'Wings of Courage',
 'Hoop Dreams',
 'Catwalk',
 'Hippie Revolution, The']

In [None]:
# Same title through the tf-idf recommender again, presumably for comparison
# with the KNN result.
content_recommender("Across the Sea of Time")

Unnamed: 0,title
148,Apollo 13
32,Wings of Courage
76,Nico Icon
97,Heidi Fleiss: Hollywood Madam
106,Catwalk
114,Anne Frank Remembered
126,Jupiter's Wife
132,Sonic Outlaws
134,From the Journals of Jean Seberg
