In [1]:
#All the header files required for the code
import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import metrics
import random

In [2]:
#Importing both the file using pandas 
data1 = pd.read_csv('movies data.csv')
data2 = pd.read_csv('ratings data.csv')

In [3]:
#Deleting unnecessary columns
data1 = data1.drop('Unnamed: 0',axis = 1)
data2 = data2.drop(['Unnamed: 0','Timestamp'],axis = 1)

In [4]:
data1.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
data2.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [6]:
#Merging both the dataframes
data = pd.merge(data2 , data1 , how='outer', on='MovieID')

In [7]:
data.head()

Unnamed: 0,UserID,MovieID,Rating,Title,Genres
0,1.0,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama
1,2.0,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama
2,12.0,1193,4.0,One Flew Over the Cuckoo's Nest (1975),Drama
3,15.0,1193,4.0,One Flew Over the Cuckoo's Nest (1975),Drama
4,17.0,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama


In [8]:
# Data Processing
# Converting Genres into different columns 
# Here we just create columns and put there initial value as 0
x = data.Genres
a = list()
for i in x:
    abc = i
    a.append(abc.split('|'))
a = pd.DataFrame(a)   
b = a[0].unique()
for i in b:
    data[i] = 0
data.head(2)

Unnamed: 0,UserID,MovieID,Rating,Title,Genres,Drama,Animation,Musical,Action,Comedy,...,Thriller,Crime,Western,Documentary,Mystery,Horror,Sci-Fi,Film-Noir,War,Fantasy
0,1.0,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# we assign 1 to all the columns which are present in the Genres
for i in b:
    data.loc[data['Genres'].str.contains(i), i] = 1

In [10]:
data.head(2)

Unnamed: 0,UserID,MovieID,Rating,Title,Genres,Drama,Animation,Musical,Action,Comedy,...,Thriller,Crime,Western,Documentary,Mystery,Horror,Sci-Fi,Film-Noir,War,Fantasy
0,1.0,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Now there is no use of genre 
# Since we have movie id so there is no need for movie names as well
data = data.drop(['Genres','Title'],axis =1)
data.head()

Unnamed: 0,UserID,MovieID,Rating,Drama,Animation,Musical,Action,Comedy,Adventure,Romance,...,Thriller,Crime,Western,Documentary,Mystery,Horror,Sci-Fi,Film-Noir,War,Fantasy
0,1.0,1193,5.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,1193,5.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12.0,1193,4.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15.0,1193,4.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,1193,5.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
data.columns

Index(['UserID', 'MovieID', 'Rating', 'Drama', 'Animation', 'Musical',
       'Action', 'Comedy', 'Adventure', 'Romance', 'Children's', 'Thriller',
       'Crime', 'Western', 'Documentary', 'Mystery', 'Horror', 'Sci-Fi',
       'Film-Noir', 'War', 'Fantasy'],
      dtype='object')

In [13]:
# Because of merging some null values are created
data.isnull().sum()

UserID         177
MovieID          0
Rating         177
Drama            0
Animation        0
Musical          0
Action           0
Comedy           0
Adventure        0
Romance          0
Children's       0
Thriller         0
Crime            0
Western          0
Documentary      0
Mystery          0
Horror           0
Sci-Fi           0
Film-Noir        0
War              0
Fantasy          0
dtype: int64

In [14]:
#WE simply drop the null values coz the are not treatable
data.dropna(inplace= True )

In [15]:
data.isnull().sum()

UserID         0
MovieID        0
Rating         0
Drama          0
Animation      0
Musical        0
Action         0
Comedy         0
Adventure      0
Romance        0
Children's     0
Thriller       0
Crime          0
Western        0
Documentary    0
Mystery        0
Horror         0
Sci-Fi         0
Film-Noir      0
War            0
Fantasy        0
dtype: int64

In [16]:
#By different meathods I found 8 cluster are better 
kmeanModel = KMeans(n_clusters=8)
kmeanModel.fit(data)

KMeans()

In [17]:
# Creating an extra column in data for storing the cluster values
data['Cluster'] = kmeanModel.labels_
data['Cluster'].sample(n=10)

155090    0
715922    5
935949    1
603978    6
228063    3
870973    5
739729    5
350278    5
735622    2
553329    7
Name: Cluster, dtype: int32

In [18]:
data['Cluster'].value_counts()

7    140187
6    137499
0    125381
3    122606
4    121414
2    119936
5    119468
1    113718
Name: Cluster, dtype: int64

In [19]:
data.head()

Unnamed: 0,UserID,MovieID,Rating,Drama,Animation,Musical,Action,Comedy,Adventure,Romance,...,Crime,Western,Documentary,Mystery,Horror,Sci-Fi,Film-Noir,War,Fantasy,Cluster
0,1.0,1193,5.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,1193,5.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12.0,1193,4.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15.0,1193,4.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,1193,5.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# When we merge the dataframe for a single movie multiple rows were created so a single movie is allotted
# to many clusters so here we allot a single cluster to a movie 
# the Cluster which occurs maximum number of times is alloted to the movie  
e = []
def fi(group):
    a = pd.DataFrame(group)
    b = pd.DataFrame(a['Cluster'].value_counts())
    d = a.index 
    c = [a['MovieID'][d[0]],int(b.idxmax())]
    e.append(c)
    

In [21]:
data.groupby("MovieID").apply(lambda x: fi(x))

In [22]:
e = pd.DataFrame(e)

In [23]:
e.head()

Unnamed: 0,0,1
0,1,6
1,2,6
2,3,5
3,4,4
4,5,4


In [24]:
# I Dont know why always the column name shift according to its will :(
# Here just the column names are swapped
e.rename(columns = {0:'MovieID',1:'Cluster'},inplace=True)
e.drop_duplicates(inplace=True)

In [25]:
e.head(10)

Unnamed: 0,MovieID,Cluster
0,1,6
1,2,6
2,3,5
3,4,4
4,5,4
5,6,6
6,7,0
7,8,5
8,9,6
9,10,6


In [26]:
data1 = pd.read_csv('movies data.csv')
new_data = pd.merge(e , data1 , how='outer', on='MovieID')

In [27]:
# These were the movies we deleted while merging the file  
new_data.isnull().sum()

MovieID         0
Cluster       177
Unnamed: 0      0
Title           0
Genres          0
dtype: int64

In [28]:
# We can delete the movies but I just label them randomly :)
new_data.fillna(random.randint(0,8),inplace=True)

In [29]:
new_data.isnull().sum()

MovieID       0
Cluster       0
Unnamed: 0    0
Title         0
Genres        0
dtype: int64

In [30]:
#This function select the cluster for a user according the the user choice
def select_c():
    global l
    print('Select The Movies Id you would like to watch:')
    l=[]
    for i in range(15):
        l.append(random.randint(0,3883))
    for i in l:
        print(new_data['MovieID'][i] , new_data['Title'][i],sep='--->')
    print('--------------------------------------------------------------------')
    l = int(input())
    l = new_data['Cluster'][new_data.MovieID == l]

In [31]:
# This is the main function which recommend you movies
def main():
    ans = False
    while not ans:
        select_c()
        print(new_data['Title'][new_data.Cluster == int(l)].sample(n=10))
        print('--------------------------------------------------------------------')
        print('Do you like these movies(y/n)')
        abc = input()
        while ((abc =='y') or (abc == 'Y')):          
            print(new_data['Title'][new_data.Cluster == int(l)].sample(n=10))
            print('--------------------------------------------------------------------')
            print('Want more!!!!(y/n)')
            abc = input()
            if ((abc =='N') or (abc == 'n')):
                ans =True

In [None]:
main()


Select The Movies Id you would like to watch:
241--->Fluke (1995)
2244--->Allnighter, The (1987)
3730--->Conversation, The (1974)
497--->Much Ado About Nothing (1993)
3005--->Bone Collector, The (1999)
1301--->Forbidden Planet (1956)
122--->Boomerang (1992)
298--->Pushing Hands (1992)
1544--->Lost World: Jurassic Park, The (1997)
3496--->Madame Sousatzka (1988)
3781--->Shaft in Africa (1973)
2272--->One True Thing (1998)
2684--->Taxman (1999)
2745--->Mission, The (1986)
2307--->One Tough Cop (1998)
--------------------------------------------------------------------
