In [25]:
import pandas as pd

# Step 1: Load the dataset from a CSV file.
df = pd.read_csv("luma1.csv")

# Step 2: Check and filter the dataset
# Keep only rows where the 'description' column is not NaN
df = df[df['description'].notna()]

# Step 3: Convert 'description' column to string
df['description'] = df['description'].apply(lambda x: str(x))

# Step 4: Strip leading/trailing whitespace
df['description'] = df['description'].apply(lambda x: x.strip() if isinstance(x, str) else x)

# Step 5: Convert all text to lowercase using .apply() witha lambda function to convert text to lowercase.
df['description'] = df['description'].apply(lambda x: x.lower() if isinstance(x, str) else x)

# Step 6: Handle unwanted characters using regular expressions
import re
df['description'] = df['description'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# Step 7: Check the processed dataset, by displaying the fistr few records.
print(df.head())


  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                                                                                                                                                                                                                                                                              cast  \
0                                                                                                                                                                                                                                                                                                              NaN   
1  Ama 

**TF-IDF vectorization**

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Initialize the TF-IDF Vectorizer
# I have used TfidfVectorizer to transform text data into a vector form.
# It takes the descriptions and converts them into a sparse matrix of TF-IDF features.
# Removing English stop words for better quality
vectorizer = TfidfVectorizer(stop_words='english')

# Step 2: Fit the TF-IDF Vectorizer to the description column
# This learns the vocabulary and transforms the text data into vectors (TF-IDF values).
tfidf_matrix = vectorizer.fit_transform(df['description'])

# Step 3: Convert the result into a DataFrame, which results in good visualization
# Convert the sparse matrix into a DataFrame with words as columns.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Step 4: View the transformed TF-IDF matrix, to understanf the structure of the vectors
print(tfidf_df.head())


   abducted  abilities  aboard  abroad  abstinence  abusive  academy  accept  \
0  0.000000        0.0     0.0     0.0         0.0      0.0      0.0     0.0   
1  0.293018        0.0     0.0     0.0         0.0      0.0      0.0     0.0   
2  0.000000        0.0     0.0     0.0         0.0      0.0      0.0     0.0   
3  0.000000        0.0     0.0     0.0         0.0      0.0      0.0     0.0   
4  0.000000        0.0     0.0     0.0         0.0      0.0      0.0     0.0   

   accident  accused  ...  wyoming  year  yearold  years  york  young  \
0       0.0      0.0  ...      0.0   0.0      0.0    0.0   0.0    0.0   
1       0.0      0.0  ...      0.0   0.0      0.0    0.0   0.0    0.0   
2       0.0      0.0  ...      0.0   0.0      0.0    0.0   0.0    0.0   
3       0.0      0.0  ...      0.0   0.0      0.0    0.0   0.0    0.0   
4       0.0      0.0  ...      0.0   0.0      0.0    0.0   0.0    0.0   

   youtube  zack  zant  zoo  
0      0.0   0.0   0.0  0.0  
1      0.0   0.0   0

In [27]:
# Function to recommend top N similar items based on user query
def recommend_items(query, df, tfidf_matrix, vectorizer, top_n=5):
    # Compute similarity for each description in the dataset
    similarities = compute_similarity(query, tfidf_matrix, vectorizer)

    # Add similarity scores to the DataFrame
    df['similarity'] = similarities

    # Sort by similarity in descending order and return top N recommendations
    recommendations = df.sort_values(by='similarity', ascending=False).head(top_n)

    # Print recommendations
    print("\nTop Recommendations:")

    return recommendations[['title', 'description', 'similarity']]


In [28]:
query = "I love thrilling action movies set in space, with a comedic twist."
recommend_items(query, df, tfidf_matrix, vectorizer, top_n=5)



Top Recommendations:


Unnamed: 0,title,description,similarity
72,A StoryBots Space Adventure,join the storybots and the space travelers of the historic inspiration mission as they search for answers to kids questions about space,0.224434
7,Guardians of the Galaxy,an action packed epic space adventure marvels guardians of the galaxy expands the marvel cinematic universe into the cosmos where brash adventurer peter quill finds himself the object of an unrelenting bounty hunt after stealing a mysterious orb,0.170441
189,The Ingenuity of the Househusband,a tough guy with a knack for housework tackles household tasks with meticulous care in these comedic liveaction vignettes,0.147954
199,I Heart Arlo,its a whole new world for arlo and his oneofa kind pals when they set out to restore a rundown new york city neighborhood and make it their own,0.128738
70,Stories by Rabindranath Tagore,the writings of nobel prize winner rabindranath tagore come to life in this collection of tales set in earlythcentury bengal,0.127045


I am expecting $25-30/hr as mentioned in the job description and I am open to negotiation as well.
Thanks