In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9"
}

def fetch_soup(url, retries=3, wait=3):
    """Fetch and parse a URL with retry logic"""
    for i in range(retries):
        try:
            res = requests.get(url, headers=headers, timeout=10)
            res.raise_for_status()
            return BeautifulSoup(res.text, "html.parser")
        except Exception as e:
            print(f"⚠️ Attempt {i+1}/{retries} failed: {e}")
            time.sleep(wait)
    return None

def scrape_movies(start):
    
    url = f"https://www.imdb.com/search/title/?title_type=feature&countries=IN&languages=hi&sort=moviemeter,asc&count=25&start={start}"

    soup = fetch_soup(url)
    if soup is None:
        print(f"❌ Failed to fetch page starting at {start}")
        return []

    try:
        movie_list = soup.find("ul", class_="ipc-metadata-list").find_all("li", class_="ipc-metadata-list-summary-item")
    except:
        print("❌ Could not find movie list on page.")
        return []

    data = []
    for movie in movie_list:
        try:
            title_link = movie.find("a", class_="ipc-title-link-wrapper")
            movie_name = title_link.text.strip()
            rank = movie_name.split(".")[0]
            movie_name = ".".join(movie_name.split(".")[1:])
            movie_id = title_link["href"].split("/")[2]
            rel_year = movie.find("div", class_="sc-dc48a950-7").span.text
            duration = movie.find_all("span", class_="sc-dc48a950-8")[1].text
            rating_tag = movie.find("span", class_="ipc-rating-star--rating")
            rating = rating_tag.text.strip() if rating_tag else "N/A"

            # Details page
            movie_url = "https://www.imdb.com" + title_link["href"]
            detail_soup = fetch_soup(movie_url)
            if detail_soup is None:
                continue

            genre_div = detail_soup.find("div", class_="ipc-chip-list--baseAlt")
            if genre_div:
                genre_tags = genre_div.find_all("a", class_="ipc-chip")
                genres = [tag.text.strip() for tag in genre_tags]
                genre = ", ".join(genres)
            else:
                genre = "N/A"

            roles = detail_soup.find_all("li", class_="ipc-metadata-list__item")
            director = roles[0].find("a").text.strip()
            stars = roles[2].find_all("a")
            stars_list = [star.text.strip() for star in stars[1:]]  # skip the first one
            stars_str = ", ".join(stars_list)

            data.append({
                "Movie ID": movie_id,
                "Rank": rank,
                "Name": movie_name,
                "Year": rel_year,
                "Duration": duration,
                "Rating": rating,
                "Genres": genre,
                "Director": director,
                "Stars": stars_str
            })

        except Exception as e:
            print(f"⚠️ Error in movie block: {e}")
            continue

    return data

# 🚀 Loop over all 1 to 1000
all_movies = []
for start in range(1, 25, 25):  # 1, 26, 51, 76
    print(f"🔄 Scraping from {start}")
    movies = scrape_movies(start)
    all_movies.extend(movies)
    time.sleep(3)


# ✅ Save to CSV
df = pd.DataFrame(all_movies)
df.to_csv("imdb_hindi_1000.csv", index=False)
print("🎉 Done! Data saved to imdb_hindi_1000.csv")


🔄 Scraping from 1
⚠️ Attempt 1/3 failed: HTTPSConnectionPool(host='www.imdb.com', port=443): Read timed out.
🎉 Done! Data saved to imdb_hindi_1000.csv


In [4]:
import pandas as pd

# Load your main dataset
df_main = pd.read_excel("Nlp with images.xlsx")

# Load the image dataset
df_images = pd.read_csv("images_with_titleid.csv")

# Clean and build full img URL
df_images['img_url'] = "https://m.media-amazon.com/images/M/" + df_images['img_url']

# Merge on IMDB id
df_full = pd.merge(df_main, df_images, how="left", left_on="IMDB id", right_on="IMDB id")

# Optional: Fill missing images with default
default_img = "https://via.placeholder.com/150x220?text=No+Image"
df_full['img_url'] = df_full['img_url'].fillna(default_img)


KeyError: 'IMDB id'

In [None]:
import pandas as pd

In [None]:
df.isna().sum()

In [None]:
df = pd.read_excel("Nlp with images.xlsx")

In [None]:
df

In [None]:
df['img'] = df['img'].fillna('https://via.placeholder.com/150x220?text=No+Image')


In [None]:
print(df.columns.tolist())


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

In [None]:
tfid=TfidfVectorizer()
vector=tfid.fit_transform(df["genre"])

In [None]:
vector[3].toarray()

In [None]:
vector[3].toarray()

In [None]:
df["Movie Title"] = df["Movie Title"].str.strip()

In [None]:
series=pd.Series(index=df["Movie Title"],data=df.index)

In [None]:
print(series.index)

In [None]:
series

In [None]:
series["Dangal"]

In [None]:
dis=linear_kernel(vector[0],vector)

In [None]:
dis

In [None]:
scores=pd.DataFrame(dis).T

In [None]:
scores.columns=["score"]

In [None]:
scores

In [None]:
x=scores.sort_values("score",ascending=False)

In [None]:
x

In [None]:
series[series==10]

In [None]:
df

In [None]:
def movie_dec(name,n):
    dis=linear_kernel(vector[series[name]],vector)
    scores=pd.DataFrame(dis).T
    scores.columns=["score"]
    scores=scores.sort_values("score",ascending=False)

    lst=[]

    for i in range(0,n+1):
        s={"Movie Title":df["Movie Title"][scores.index[i]],
           "Ratinng":df["Ratinng"][scores.index[i]],
           "Genre":df["genre"][scores.index[i]],
           "act":df["act"][scores.index[i]],
           "img":df["img"][scores.index[i]]
          }
        lst.append(s)
        
    return lst
name=input()
n=int(input())
output=movie_dec(name,n)
print(output)

In [None]:
pd.DataFrame(output)

In [None]:
import joblib
joblib.dump(output,"model.pkl")