In [48]:
# Importing Depedendancies
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score,classification_report
from sklearn.cluster import KMeans 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Elbow Curve to find optimal K value

In [49]:
# Reading in file 
df = pd.read_csv(Path("../cleaned_data/movie_data.csv"))
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,director,cast,country,release_year,genre_types,rating,duration,description,popularity,production_companies,writers,combined,original_title
0,0,653574,dick johnson is dead,Kirsten Johnson,,United States,2020,Documentaries,PG-13,90 min,"As her father nears the end of his life, filmm...",12.0,Big Mouth Productions,"Kirsten Johnson, Nels Bangerter",kirsten johnsonunited states2020,Dick Johnson Is Dead
1,1,597316,my little pony: a new generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,2021,Children & Family Movies,PG,91 min,Equestria's divided. But a bright-eyed hero be...,25.85,"Boulder Media, Entertainment One","Gillian M. Berrow, Tim Sullivan","robert cullen, josé luis uchavanessa hudgens, ...",My Little Pony: A New Generation
2,2,68351,sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",1993,"Dramas, Independent Movies, International Movies",TV-MA,125 min,"On a photo shoot in Ghana, an American model s...",3.48,"Diproci, Ghana National Commission on Culture,...",Haile Gerima,"haile gerimakofi ghanaba, oyafunmike ogunlano,...",Sankofa
3,3,468225,the starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021,"Comedies, Dramas",PG-13,104 min,A woman adjusting to life after a loss contend...,15.47,"Entertainment One, Boies/Schiller Film Group, ...",Matt Harris,"theodore melfimelissa mccarthy, chris o'dowd, ...",The Starling
4,4,786705,confessions of an invisible girl,Bruno Garotti,"Klara Castanho, Lucca Picon, Júlia Gomes, Marc...",,2021,"Children & Family Movies, Comedies",TV-PG,91 min,When the clever but socially-awkward Tetê join...,17.89,,Thalita Rebouças,"bruno garottiklara castanho, lucca picon, júli...",Confessions of an Invisible Girl


In [50]:
# TF-IDF Vectorization to assess importance of each word in 'combined' column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined'])

# Measuring cosine similarity 
similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

In [51]:
sim_df = pd.DataFrame(similarity)
sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5133,5134,5135,5136,5137,5138,5139,5140,5141,5142
0,1.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,0.0,1.000000,0.0,0.000000,0.0,0.017302,0.0,0.0,0.0,0.014726,...,0.010539,0.026542,0.0,0.0,0.000000,0.0,0.017951,0.000000,0.011487,0.026805
2,0.0,0.000000,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.021701,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.0,1.000000,0.0,0.000000,0.0,0.0,0.0,0.072947,...,0.000000,0.000000,0.0,0.0,0.033603,0.0,0.000000,0.000000,0.018319,0.000000
4,0.0,0.000000,0.0,0.000000,1.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5138,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,1.0,0.000000,0.000000,0.000000,0.000000
5139,0.0,0.017951,0.0,0.000000,0.0,0.029128,0.0,0.0,0.0,0.012914,...,0.011897,0.000000,0.0,0.0,0.000000,0.0,1.000000,0.000000,0.041980,0.000000
5140,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,1.000000,0.068168,0.000000
5141,0.0,0.011487,0.0,0.018319,0.0,0.000000,0.0,0.0,0.0,0.020960,...,0.011629,0.026785,0.0,0.0,0.000000,0.0,0.041980,0.068168,1.000000,0.000000


In [52]:
# Creating similarity score threshold for classification
threshold = 0.5

# Classifying pairs as similar (1) or dissimilar (0) based on the threshold
classification = np.where(np.array(similarity) > threshold, 1, 0)

# Reshape classification array into a square matrix (corresponding to similarity score matrix)
classification_matrix = classification.reshape((len(df), len(df)))

# Mapping classification matrix back to DataFrame
df['similarity_class'] = classification_matrix.tolist()

In [53]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df['similarity_class'], test_size=0.2, random_state=42)

In [54]:
from sklearn.preprocessing import MaxAbsScaler

# features scaling
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [55]:
# Initializing and training KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Making predictions
y_pred = knn.predict(X_test_scaled)

# Evaluating model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
# Creating list to store intertia values and k values
inertia = []
k = list(range(1,11))

In [None]:
# Creating for-loop to evaluate each value of K using KMeans
# Appending computed inertia value back into the list

for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(similarity)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# Create a dataframe for the k-values and inertia
new_elbow_data = {"k": k, "inertia": inertia}
elbow_df = pd.DataFrame(new_elbow_data)

elbow_df

Unnamed: 0,k,inertia
0,1,15.0
1,2,14.0
2,3,13.0
3,4,12.0
4,5,11.0
5,6,10.0
6,7,9.0
7,8,8.0
8,9,7.0
9,10,6.0


In [None]:
# Plotting elbow curve
elbow_df.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)

Have to correct this above

# Attempting again with Nearest Neighbor

Due to encoded movie genre columns creating a large number of features, this can result the model to be prone to overfitting, and can reduce unneccessary features from taking up computational space. 

In [71]:
# Earlier findings revealed optimal k-value to be 3
k = 3  

# Initiating model
knn_model = NearestNeighbors(n_neighbors=k, metric='cosine')
knn_model.fit(tfidf_matrix)

# Building function to find nearest neighbors for a given  movie
def find_nearest_neighbors(query_movie_index):
    # Calculating the nearest neighbors for the query movie
    distances, indices = knn_model.kneighbors(tfidf_matrix[query_movie_index], n_neighbors=k+1)
    # Excluding the first nearest neighbor, as it is the given movie 
    distances = distances.flatten()[1:]
    indices = indices.flatten()[1:]
    return distances, indices

# Running a model test
query_movie_index = 0  # Index of the query movie in your dataset
distances, indices = find_nearest_neighbors(query_movie_index)
print("Nearest neighbors for query movie:")
for i, idx in enumerate(indices):
    print(f"Neighbor {i+1}: Index={idx}, Similarity={1 - distances[i]}")

Nearest neighbors for query movie:
Neighbor 1: Index=1163, Similarity=0.30736987869053
Neighbor 2: Index=1543, Similarity=0.30736987869053
Neighbor 3: Index=1307, Similarity=0.30736987869053


In [70]:
print(classification_report(tfidf_matrix,y_test))

ValueError: Found input variables with inconsistent numbers of samples: [5143, 1286]