In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sqlalchemy import create_engine

df = pd.read_csv('movies_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,duration,description,...,Language TV Shows,Musicals,Nature TV,Reality TV,Spirituality,TV Action,TV Comedies,TV Dramas,Talk Shows TV Comedies,Thrillers
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,"Michael Hilow, Ana Hoffman, Dick Johnson, Kirs...",United States,2020,PG-13,90 min,As her father nears the end of his life filmma...,...,0,0,0,0,0,0,0,0,0,0
1,s2,TV Show,Blood & Water,,Ama Qamata Khosi Ngema Gail Mabalane Thabang M...,South Africa,2021,TV-MA,2 Seasons,After crossing paths at a party a Cape Town te...,...,0,0,0,0,0,0,0,1,0,0
2,s3,TV Show,Ganglands,Julien Leclercq,Sami Bouajila Tracy Gotoas Samuel Jouy Nabiha ...,,2021,TV-MA,1 Season,To protect his family from a powerful drug lor...,...,0,0,0,0,0,1,0,0,0,0
3,s4,TV Show,Jailbirds New Orleans,,,,2021,TV-MA,1 Season,Feuds flirtations and toilet talk go down amon...,...,0,0,0,1,0,0,0,0,0,0
4,s5,TV Show,Kota Factory,,Mayur More Jitendra Kumar Ranjan Raj Alam Khan...,India,2021,TV-MA,2 Seasons,In a city of coaching centers known to train I...,...,0,0,0,0,0,0,1,0,0,0


In [2]:
df.isna().sum()
df.shape

(8508, 43)

### Create the TFID vecoritzer and take out the stopwords

In [4]:
# Fill missing values with empty strings to prevent errors
df['cast'] = df['cast'].fillna('')
df['director'] = df['director'].fillna('')
df['description'] = df['description'].fillna('')
df['rating'] = df['rating'].fillna('')
df['genre'] = df.apply(lambda row: ' '.join([col for col in df.columns if df.columns.get_loc(col) >= 11 and row[col] == 1]), axis=1)

# Create a "soup" of combined text features
df['soup'] = df['cast'] + ' ' + df['director'] + ' ' + df['description'] + ' ' + df['genre'] + ' ' + df['rating']

# Create a TfidfVectorizer and Remove stopwords
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the data to a tfidf matrix
tfidf_matrix = tfidf.fit_transform(df['soup'])

# Print the shape of the tfidf_matrix
print(tfidf_matrix.shape)

# Preview the matrix by placing it into a DataFrame (which we won't need later)
df_tfidf = pd.DataFrame(tfidf_matrix.T.todense(), index=tfidf.get_feature_names_out(), columns=df['soup'])
df_tfidf

(8508, 50542)


soup,"Michael Hilow, Ana Hoffman, Dick Johnson, Kirsten Johnson, Chad Knorr Kirsten Johnson As her father nears the end of his life filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable. Documentaries PG-13",Ama Qamata Khosi Ngema Gail Mabalane Thabang Molaba Dillon Windvogel Natasha Thahane Arno Greeff Xolile Tshabalala Getmore Sithole Cindy Mahlangu Ryle De Morny Greteli Fincham Sello Maake Ka-Ncube Odwa Gwanya Mekaila Mathys Sandi Schultz Duane Williams Shamilla Miller Patrick Mofokeng After crossing paths at a party a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth. Dramas TV Dramas TV-MA,Sami Bouajila Tracy Gotoas Samuel Jouy Nabiha Akkari Sofia Lesaffre Salim Kechiouche Noureddine Farihi Geert Van Rampelberg Bakary Diombera Julien Leclercq To protect his family from a powerful drug lord skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war. Action Adventure TV Action TV-MA,Feuds flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series. Docuseries Reality TV TV-MA,Mayur More Jitendra Kumar Ranjan Raj Alam Khan Ahsaas Channa Revathi Pillai Urvi Singh Arun Kumar In a city of coaching centers known to train India’s finest collegiate minds an earnest but unexceptional student and his friends navigate campus life. Comedies TV Comedies TV-MA,Kate Siegel Zach Gilford Hamish Linklater Henry Thomas Kristin Lehman Samantha Sloyan Igby Rigney Rahul Kohli Annarah Cymone Annabeth Gish Alex Essoe Rahul Abburi Matt Biedel Michael Trucco Crystal Balint Louis Oliver Mike Flanagan The arrival of a charismatic young priest brings glorious miracles ominous mysteries and renewed religious fervor to a dying town desperate to believe. Dramas TV Dramas TV-MA,Vanessa Hudgens Kimiko Glenn James Marsden Sofia Carson Liza Koshy Ken Jeong Elizabeth Perkins Jane Krakowski Michael McKean Phil LaMarr Robert Cullen José Luis Ucha Equestria's divided. But a bright-eyed hero believes Earth Ponies Pegasi and Unicorns should be pals — and hoof to heart she’s determined to prove it. Children Family Movies PG,Kofi Ghanaba Oyafunmike Ogunlano Alexandra Duah Nick Medley Mutabaruka Afemo Omilami Reggie Carter Mzuri Haile Gerima On a photo shoot in Ghana an American model slips back in time becomes enslaved on a plantation and bears witness to the agony of her ancestral past. Dramas TV-MA,Mel Giedroyc Sue Perkins Mary Berry Paul Hollywood Andy Devonshire A talented batch of amateur bakers face off in a 10-week competition whipping up their best dishes in the hopes of being named the U.K.'s best. Reality TV TV-14,Melissa McCarthy Chris O'Dowd Kevin Kline Timothy Olyphant Daveed Diggs Skyler Gisondo Laura Harrier Rosalind Chao Kimberly Quinn Loretta Devine Ravi Kapoor Theodore Melfi A woman adjusting to life after a loss contends with a feisty bird that's taken over her garden — and a husband who's struggling to find a way forward. Comedies Dramas PG-13,...,Michael Johnston Jessica Gee-George Christine Marie Cabanos Christopher Smith Max Mittelman Reba Buhr Kyle Hebert Teen surfer Zak Storm is mysteriously transported to the Bermuda Triangle where he becomes the captain of a magical ship full of misfits. Kids' TV TV-Y7,Adil Hussain Mona Singh K.K. Raina Sanjay Mishra Anil Rastogi Ravi Jhankal Kulbhushan Kharbanda Ekavali Khanna Mukesh Tiwari Vinod Acharya Chandra Prakash Dwivedi A philandering small-town mechanic's political ambitions are sparked when the visiting prime minister mistakenly grants him special security clearance. Comedies Comedies Dramas International Movies Dramas Dramas International Movies TV-MA,Santosh Juvekar Siddharth Chandekar Sachit Patil Chinmay Mandlekar Rajesh Shringarpure Pushkar Shrotri Tejashree Pradhan Neha Joshi Avadhoot Gupte A change in the leadership of a political party sparks bitter conflict and the party's division into two rival factions. Dramas Dramas International Movies TV-14,Sanam Saeed Fawad Khan Ayesha Omer Mehreen Raheel Sheheryar Munawar Samina Peerzada Waseem Abbas Javed Sheikh Hina Khawaja Bayat Strong-willed middle-class Kashaf and carefree wealthy Zaroon meet in college but before love can take root they each have some growing up to do. Dramas International TV Shows Romantic TV Shows TV Dramas TV Dramas TV-PG,Ali Suliman Saleh Bakri Yasa Ali Al-Jabri Mansoor Alfeeli Ahd Majid Al Ansari Recovering alcoholic Talal wakes up inside a small-town police station cell where he's subject to the mind games of a psychotic sadist. Dramas Dramas International Movies International Movies Thrillers Thrillers TV-MA,Mark Ruffalo Jake Gyllenhaal Robert Downey Jr. Anthony Edwards Brian Cox Elias Koteas Donal Logue John Carroll Lynch Dermot Mulroney Chloë Sevigny David Fincher A political cartoonist a crime reporter and a pair of cops investigate San Francisco's infamous Zodiac Killer in this thriller based on a true story. Dramas Thrillers R,While living alone in a spooky town a young girl befriends a motley crew of zombie children with diverse personalities. Comedies Kids' TV TV Comedies TV-Y7,Jesse Eisenberg Woody Harrelson Emma Stone Abigail Breslin Amber Heard Bill Murray Derek Graf Ruben Fleischer Looking to survive in a world taken over by zombies a dorky college student teams with an urban roughneck and a pair of grifter sisters. Comedies Horror Movies R,Tim Allen Courteney Cox Chevy Chase Kate Mara Ryan Newman Michael Cassidy Spencer Breslin Rip Torn Kevin Zegers Peter Hewitt Dragged from civilian life a former superhero must train a new crop of youthful saviors when the military preps for an attack by a familiar villain. Children Comedies Family Movies PG,Vicky Kaushal Sarah-Jane Dias Raaghav Chanana Manish Chaudhary Meghna Malik Malkeet Rauni Anita Shabdish Chittaranjan Tripathy Mozez Singh A scrappy but poor boy worms his way into a tycoon's dysfunctional family while facing his fear of music and the truth about his past. Dramas Dramas International Movies Musicals TV-14
007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183768,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
źak,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
żulewska,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
żurawski,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
żygadło,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Find the cosine similarity scores and put them in a df

In [5]:
# Compute the cosine similarity between each movie description
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# For easier viewing, put it in a dataframe
df_results = pd.DataFrame(cosine_sim, columns=df['show_id'], index = df['show_id'])
# df_results.to_csv("content_filtering_recommendations.csv")
# df_results.sort_values(by=['s8'], ascending=False)
# df_results

### Build the csv file that holds all the recommendations

In [6]:
# Create a DataFrame to hold the top 20 recommendations for each show
top_n = 20
top_recommendations = {}

for show_id in df['show_id']:
    similar_scores = df_results[show_id].drop(show_id)  # Drop self-match
    top_similar = similar_scores.sort_values(ascending=False).head(top_n).index.tolist()
    top_recommendations[show_id] = top_similar

# Convert to a DataFrame
recommendations_df = pd.DataFrame.from_dict(top_recommendations, orient='index')
recommendations_df.index.name = 'show_id'
recommendations_df = recommendations_df.reset_index()

In [7]:
recommendations_df

Unnamed: 0,show_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,s1,s7016,s3336,s7455,s3718,s7623,s3928,s4115,s8043,s2351,...,s7242,s7482,s6661,s6896,s8068,s6554,s4623,s8691,s1027,s130
1,s2,s1515,s1594,s1885,s4476,s4210,s5345,s109,s7092,s5850,...,s5944,s1906,s7263,s5874,s4949,s7545,s2905,s7490,s3841,s6258
2,s3,s2669,s3298,s3426,s425,s6593,s5481,s4004,s5114,s3977,...,s6809,s6741,s4111,s5489,s1906,s750,s4164,s2922,s3843,s7193
3,s4,s6361,s2346,s3838,s7868,s1466,s2395,s1693,s4808,s4710,...,s5055,s8465,s6764,s2923,s2814,s7436,s8756,s185,s1392,s2133
4,s5,s2354,s8776,s3467,s8125,s752,s2473,s2722,s4734,s3465,...,s2287,s5501,s7525,s4439,s7941,s4608,s7168,s2739,s2757,s210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8503,s8803,s1359,s3967,s6887,s3008,s6499,s4617,s6736,s8512,s6033,...,s5953,s8072,s7999,s2792,s3478,s8744,s6201,s8161,s6848,s8004
8504,s8804,s6250,s1865,s1953,s2972,s3147,s2882,s752,s7287,s3226,...,s270,s2642,s407,s1309,s4810,s3954,s5423,s6933,s3602,s847
8505,s8805,s6013,s7211,s8495,s7585,s7593,s3344,s8289,s1842,s3325,...,s1114,s1961,s3858,s1265,s8429,s6457,s354,s1486,s1614,s7547
8506,s8806,s4246,s6986,s6807,s6953,s1637,s6640,s4206,s4245,s4657,...,s6272,s1405,s2719,s7593,s1856,s7019,s7205,s1470,s5166,s7971


### Put it in the Azure database

In [8]:
# Define your Azure SQL connection details
server = '1-10intex.database.windows.net'
database = 'Movies'
username = 'pigadmin'
password = 'Superduperstrongpassword!'
driver = 'ODBC Driver 18 for SQL Server'  # Or another valid installed driver

# Encode the driver string for URL
driver_encoded = driver.replace(' ', '+')

# Create the connection URL
connection_string = f'mssql+pyodbc://{username}:{password}@{server}:1433/{database}?driver={driver_encoded}&Encrypt=yes&TrustServerCertificate=no&Connection Timeout=30'

# Create SQLAlchemy engine
engine = create_engine(connection_string)

# Upload DataFrame to Azure
recommendations_df.to_sql(
    'content_filtering_recommendations',
    con=engine,
    if_exists='replace',  # or 'append' if you want to add new data without replacing
    index=False,
    chunksize=1000
)

93