In [1]:
from collections import Counter
import time

import pandas as pd
import requests
from IPython.display import display
from threading import Thread
from flask import Flask, request, jsonify
from sklearn.preprocessing import MultiLabelBinarizer
from fastapi import FastAPI, Query, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import uvicorn

In [2]:
# Task 1
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists;
# consider moving it to a configurable data directory.
df = pd.read_csv(
    r"C:\Users\Normc\OneDrive\Рабочий стол\Python\Datasets\imdb_movies.csv",
    encoding='utf-8',
)

# Columns to discard are picked by position in the raw file, then dropped by name.
cols_to_drop = df.columns[[4, 6, 7, 8, 9, 10, 11]]
df = df.drop(columns=cols_to_drop)

display(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only added a stray "None" to the cell output.
df.info()
df.isna().sum()

Unnamed: 0,names,date_x,score,genre,crew
0,Creed III,03/02/2023,73.0,"Drama, Action","Michael B. Jordan, Adonis Creed, Tessa Thompso..."
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action","Sam Worthington, Jake Sully, Zoe Saldaña, Neyt..."
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","Chris Pratt, Mario (voice), Anya Taylor-Joy, P..."
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Óscar Barberán, Thut (voice), Ana Esther Albor..."
4,Supercell,03/17/2023,61.0,Action,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10178 entries, 0 to 10177
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   names   10178 non-null  object 
 1   date_x  10178 non-null  object 
 2   score   10178 non-null  float64
 3   genre   10093 non-null  object 
 4   crew    10122 non-null  object 
dtypes: float64(1), object(4)
memory usage: 397.7+ KB


None

names      0
date_x     0
score      0
genre     85
crew      56
dtype: int64

Столбцы Title, Genre и Language будем кодировать с помощью OHE (one-hot encoding), т.к. для нас нет разницы в важности между названиями фильмов, их жанрами или языками, на которых вышел фильм.

In [3]:
# Task 2
# Reduce the release date to its year; values that fail to parse become missing.
df['date_x'] = pd.to_datetime(df['date_x'], errors='coerce').dt.year
df['score'] = df['score'].astype(int)
# Ids are assigned before rows without a genre are dropped, so they follow the raw
# row order and may have gaps afterwards.
df['film_id'] = range(1, len(df)+1)
# .copy() detaches the filtered frame, guarding the column assignments below
# against SettingWithCopyWarning.
df = df.dropna(subset=['genre']).copy()

# "A, B, C" -> ['A', 'B', 'C']
df['genre_list'] = df['genre'].apply(lambda x: [g.strip() for g in str(x).split(',')])
# Split the crew string on ", " and keep only entries made of two or more words;
# a missing crew value becomes an empty list.
df['crew_list'] = df['crew'].apply(
    lambda x: [g.strip() for g in str(x).split(', ') if len(g.strip().split()) > 1] if pd.notna(x) else []
)

# One indicator column per genre. max() (rather than sum()) keeps every cell at 0/1
# even if a value is repeated within one film; film_to_dict tests these cells with `== 1`.
genre_dummies = df['genre_list'].explode().str.get_dummies().groupby(level=0).max()
genre_columns = genre_dummies.columns.tolist()

crew_flat = [actor for sublist in df['crew_list'] for actor in sublist]
# NOTE: despite the "100" in the name, the 500 most frequent names are kept.
top_100_crew = [actor for actor, _ in Counter(crew_flat).most_common(500)]
# Set lookup avoids scanning the 500-element list for every crew entry of every film.
top_crew_lookup = set(top_100_crew)

df_top_crew = df['crew_list'].apply(lambda x: [a for a in x if a in top_crew_lookup])
# Same 0/1 guarantee as for genres: a name listed twice for one film must not become 2.
crew_dummies = df_top_crew.explode().str.get_dummies().groupby(level=0).max()
actor_columns = crew_dummies.columns.tolist()

# Attach the indicator columns, drop the raw/intermediate ones and use API-friendly names.
df = pd.concat([df, genre_dummies, crew_dummies], axis=1)
df = df.drop(columns=['genre', 'genre_list', 'crew', 'crew_list'])
df = df.rename(columns={'names': 'title', 'date_x': 'year', 'score': 'rating'})
df.head()

Unnamed: 0,title,year,rating,film_id,Action,Adventure,Animation,Comedy,Crime,Documentary,...,William Fichtner,William H. Macy,William Hurt,Winona Ryder,Woody Harrelson,Yuki Kaji,Yuriko Yamaguchi,Zac Efron,Zach Galifianakis,Zoe Saldaña
0,Creed III,2023,73,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Avatar: The Way of Water,2022,78,2,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,The Super Mario Bros. Movie,2023,76,3,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Mummies,2023,70,4,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Supercell,2023,61,5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Task 3
app = Flask(__name__)
# Emit non-ASCII characters (e.g. accented names) verbatim instead of \uXXXX escapes.
# Flask >= 2.3 ignores the JSON_AS_ASCII config key and reads this setting from the
# app's JSON provider, so prefer the provider when it exists and fall back to the
# config key on older releases.
if hasattr(app, 'json'):
    app.json.ensure_ascii = False
else:
    app.config['JSON_AS_ASCII'] = False

def film_to_dict(row):
    """Convert one film row into a plain dict ready for JSON serialisation.

    Args:
        row: Mapping-like film record (e.g. a DataFrame row) with the keys
            'film_id', 'title', 'year', 'rating' plus one indicator entry per
            name in `genre_columns` and `actor_columns`.

    Returns:
        dict with 'film_id', 'title', 'year' (None when missing), 'rating',
        'genres', and — only when at least one actor indicator equals 1 — 'actors'.
    """
    release_year = row['year']

    result = {
        "film_id": int(row['film_id']),
        "title": row['title'],
        "year": None if pd.isna(release_year) else int(release_year),
        "rating": float(row['rating']),
        "genres": [name for name in genre_columns if row[name] == 1],
    }

    cast = []
    for name in actor_columns:
        if row[name] == 1:
            cast.append(name)

    # The key is omitted entirely (not set to None) when no tracked actor matches.
    if len(cast) > 0:
        result["actors"] = cast
    return result

@app.route('/films', methods=['GET'])
def get_films():
    """List films, optionally filtered and truncated.

    Query parameters (all optional):
        limit: maximum number of films to return; must be >= 1.
        genre: keep only films whose indicator column for this genre equals 1.
        year_from / year_to: inclusive bounds on the release year.

    Returns:
        JSON list of film dicts, or a JSON error with status 400 when `genre`
        is unknown or `limit` is smaller than 1.
    """
    limit = request.args.get('limit', type=int)
    genre = request.args.get('genre')
    year_from = request.args.get('year_from', type=int)
    year_to = request.args.get('year_to', type=int)

    # limit=0 used to be treated as "no limit" and negative values made head()
    # drop rows from the end; reject both explicitly.
    if limit is not None and limit < 1:
        return jsonify({"error": "limit must be a positive integer"}), 400

    # No copy needed: every filter below builds a new frame and df is never mutated.
    data = df

    if genre:
        if genre not in genre_columns:
            return jsonify({"error": "Invalid genre"}), 400
        data = data[data[genre] == 1]

    if year_from is not None:
        data = data[data['year'] >= year_from]

    if year_to is not None:
        data = data[data['year'] <= year_to]

    if limit is not None:
        data = data.head(limit)

    # DataFrame.apply(axis=1) on an empty frame returns a DataFrame (which has no
    # .tolist()), so an empty match would fail; iterating rows yields [] instead.
    films = [film_to_dict(row) for _, row in data.iterrows()]
    return jsonify(films)

@app.route('/films/search', methods=['GET'])
def search_films():
    """Find films whose title contains the `query` substring (case-insensitive).

    Returns:
        JSON list of matching film dicts (empty when nothing matches), or a JSON
        error with status 400 when `query` is missing or empty.
    """
    query = request.args.get('query')
    if not query:
        return jsonify({"error": "query parameter is required"}), 400

    # regex=False: the user-supplied text is matched literally. As a regex, input
    # such as "(" raised an error and arbitrary patterns could be expensive to evaluate.
    mask = df['title'].str.contains(query, case=False, na=False, regex=False)
    # DataFrame.apply(axis=1) on an empty frame returns a DataFrame (no .tolist()),
    # so a search without matches used to fail; row iteration yields [] instead.
    films = [film_to_dict(row) for _, row in df[mask].iterrows()]
    return jsonify(films)

@app.route('/films/avg_rating', methods=['GET'])
def avg_rating():
    """Return the mean of the 'rating' column, rounded to two decimals, as JSON."""
    mean_rating = round(df['rating'].mean(), 2)
    payload = {"average_rating": mean_rating}
    return jsonify(payload)

@app.route('/films/top_rated', methods=['GET'])
def top_rated():
    """Return the `limit` films with the highest rating (default 10).

    Returns:
        JSON list of film dicts ordered by descending rating, or a JSON error
        with status 400 when `limit` is smaller than 1.
    """
    limit = request.args.get('limit', default=10, type=int)
    # A non-positive limit selects no rows, and apply(axis=1).tolist() fails on an
    # empty frame; reject such values up front.
    if limit < 1:
        return jsonify({"error": "limit must be a positive integer"}), 400

    top = df.nlargest(limit, 'rating')
    # Row iteration also stays correct if the frame itself is empty.
    films = [film_to_dict(row) for _, row in top.iterrows()]
    return jsonify(films)

@app.route('/films/<int:film_id>', methods=['GET'])
def get_film_by_id(film_id):
    """Return the film whose 'film_id' equals `film_id`, or a 404 JSON error."""
    matches = df[df['film_id'] == film_id]
    if not matches.empty:
        # Only the first matching row is returned.
        first_match = matches.iloc[0]
        return jsonify(film_to_dict(first_match))

    return jsonify({"error": "Film not found"}), 404

def run_app():
    """Start the Flask server on 127.0.0.1 with debug mode and the reloader disabled."""
    server_options = {"host": '127.0.0.1', "debug": False, "use_reloader": False}
    app.run(**server_options)

# Launch the server in a background daemon thread.
thread = Thread(target=run_app, daemon=True)
thread.start()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [22/Dec/2025 17:30:10] "GET /films?limit=5 HTTP/1.1" 200 -
127.0.0.1 - - [22/Dec/2025 17:30:13] "GET /films?genre=Action HTTP/1.1" 200 -
127.0.0.1 - - [22/Dec/2025 17:30:14] "GET /films?year_from=2022&year_to=2023 HTTP/1.1" 200 -
127.0.0.1 - - [22/Dec/2025 17:30:14] "GET /films/search?query=Avatar HTTP/1.1" 200 -
127.0.0.1 - - [22/Dec/2025 17:30:14] "GET /films/avg_rating HTTP/1.1" 200 -
127.0.0.1 - - [22/Dec/2025 17:30:14] "GET /films/top_rated?limit=5 HTTP/1.1" 200 -
127.0.0.1 - - [22/Dec/2025 17:30:14] "GET /films/19 HTTP/1.1" 200 -


In [5]:
# Checks for the Flask API
URL = "http://127.0.0.1:5000"

# Requested in this order; each path doubles as the label printed above its response.
check_paths = [
    "/films?limit=5",
    "/films?genre=Action",
    "/films?year_from=2022&year_to=2023",
    "/films/search?query=Avatar",
    "/films/avg_rating",
    "/films/top_rated?limit=5",
    "/films/19",
]

for position, path in enumerate(check_paths):
    # The timeout keeps the cell from hanging indefinitely if the server does not answer.
    resp = requests.get(f"{URL}{path}", timeout=30)
    # Blank line between sections, but not before the first one.
    print(path if position == 0 else f"\n{path}")
    print(resp.json())

/films?limit=5
[{'film_id': 1, 'genres': ['Action', 'Drama'], 'rating': 73.0, 'title': 'Creed III', 'year': 2023}, {'actors': ['Cliff Curtis', 'Kate Winslet', 'Sam Worthington', 'Sigourney Weaver', 'Stephen Lang', 'Zoe Saldaña'], 'film_id': 2, 'genres': ['Action', 'Adventure', 'Science Fiction'], 'rating': 78.0, 'title': 'Avatar: The Way of Water', 'year': 2022}, {'actors': ['Chris Pratt', 'Jack Black', 'Keegan-Michael Key', 'Kevin Michael Richardson', 'Seth Rogen'], 'film_id': 3, 'genres': ['Adventure', 'Animation', 'Comedy', 'Family', 'Fantasy'], 'rating': 76.0, 'title': 'The Super Mario Bros. Movie', 'year': 2023}, {'film_id': 4, 'genres': ['Adventure', 'Animation', 'Comedy', 'Family', 'Fantasy'], 'rating': 70.0, 'title': 'Mummies', 'year': 2023}, {'actors': ['Alec Baldwin'], 'film_id': 5, 'genres': ['Action'], 'rating': 61.0, 'title': 'Supercell', 'year': 2023}]

/films?genre=Action
[{'film_id': 1, 'genres': ['Action', 'Drama'], 'rating': 73.0, 'title': 'Creed III', 'year': 2023}, 

In [None]:
# Task 4
# Response schema for a single film; used as `response_model` by the film endpoints below.
class Film(BaseModel):
    film_id: int
    genres: List[str]
    rating: float
    title: str
    # Optional: film_to_dict returns None when the year is missing.
    year: Optional[int] = None
    # Optional: film_to_dict returns None when no tracked actor is flagged for the film.
    actors: Optional[List[str]] = None

# Response schema for the /films/avg_rating endpoint.
class AvgRating(BaseModel):
    average_rating: float

# FastAPI application object; the title, description and version are passed as API metadata.
# NOTE: this rebinds the module-level name `app`, which previously referred to the
# Flask application created earlier in this notebook.
app = FastAPI(
    title="IMDB Films API",
    description="API для работы с фильмами из IMDB датасета. Поддерживает фильтры по жанру, году и поиск",
    version="1.0.0"
)

def film_to_dict(row) -> dict:
    """Convert one film row into a dict with the same keys as the `Film` model.

    Args:
        row: Mapping-like film record (e.g. a DataFrame row) with the keys
            'film_id', 'title', 'year', 'rating' plus one indicator entry per
            name in `genre_columns` and `actor_columns`.

    Returns:
        dict with 'film_id', 'title', 'year' (None when missing), 'rating',
        'genres' and 'actors' (None when no actor indicator equals 1).
    """
    release_year = row['year']
    matched_genres = [name for name in genre_columns if row[name] == 1]
    matched_actors = [name for name in actor_columns if row[name] == 1]

    film = {}
    film["film_id"] = int(row['film_id'])
    film["title"] = row['title']
    film["year"] = None if pd.isna(release_year) else int(release_year)
    film["rating"] = float(row['rating'])
    film["genres"] = matched_genres
    # An empty actor list is reported as None rather than [].
    film["actors"] = matched_actors if matched_actors else None
    return film

@app.get("/films", response_model=List[Film])
async def get_films(
    limit: Optional[int] = Query(None, ge=1, description="Ограничение количества фильмов"),
    genre: Optional[str] = Query(None, description="Фильтр по жанру"),
    year_from: Optional[int] = Query(None, ge=1900, description="Минимальный год выпуска"),
    year_to: Optional[int] = Query(None, le=2025, description="Максимальный год выпуска")
):
    """
    Return films, optionally filtered by genre and release-year range and capped by `limit`.

    Responds with status 400 when `genre` is not one of the known genre columns.
    """
    # No copy needed: every filter below builds a new frame and df is never mutated.
    data = df

    if genre:
        if genre not in genre_columns:
            raise HTTPException(status_code=400, detail="Неверный жанр")
        data = data[data[genre] == 1]

    # Explicit None checks so that a supplied bound is always applied.
    if year_from is not None:
        data = data[data['year'] >= year_from]
    if year_to is not None:
        data = data[data['year'] <= year_to]

    if limit is not None:
        data = data.head(limit)

    # DataFrame.apply(axis=1) on an empty frame returns a DataFrame (which has no
    # .tolist()), so an empty match would fail; iterating rows yields [] instead.
    films = [film_to_dict(row) for _, row in data.iterrows()]
    return films

@app.get("/films/search", response_model=List[Film])
async def search_films(
    query: str = Query(..., min_length=1, description="Строка для поиска в названии фильма")
):
    """
    Search films by a case-insensitive substring of the title.

    Returns an empty list when no title matches.
    """
    # regex=False: the user-supplied text is matched literally. As a regex, input
    # such as "(" raised an error and arbitrary patterns could be expensive to evaluate.
    mask = df['title'].str.contains(query, case=False, na=False, regex=False)
    # DataFrame.apply(axis=1) on an empty frame returns a DataFrame (no .tolist()),
    # so a search without matches used to fail; row iteration yields [] instead.
    films = [film_to_dict(row) for _, row in df[mask].iterrows()]
    return films

@app.get("/films/avg_rating", response_model=AvgRating)
async def avg_rating():
    """
    Return the mean rating over all films, rounded to two decimals.
    """
    mean_rating = round(df['rating'].mean(), 2)
    return {"average_rating": mean_rating}

@app.get("/films/top_rated", response_model=List[Film])
async def top_rated(
    limit: int = Query(10, ge=1, le=100, description="Количество топ фильмов")
):
    """
    Return the `limit` films with the highest rating.
    """
    best_rows = df.nlargest(limit, 'rating')
    as_dicts = best_rows.apply(film_to_dict, axis=1)
    return as_dicts.tolist()

@app.get("/films/{film_id}", response_model=Film)
async def get_film_by_id(film_id: int):
    """
    Return the details of the film with the given ID; responds with 404 when there is none.
    """
    matches = df[df['film_id'] == film_id]
    if not matches.empty:
        # Only the first matching row is returned.
        first_match = matches.iloc[0]
        return film_to_dict(first_match)

    raise HTTPException(status_code=404, detail="Фильм не найден")


def run_app():
    """Serve `app` with uvicorn on 127.0.0.1, port 7001."""
    server_options = {"host": "127.0.0.1", "port": 7001}
    uvicorn.run(app, **server_options)

# Launch the server in a background daemon thread.
thread = Thread(target=run_app, daemon=True)
thread.start()


INFO:     Started server process [11372]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:7001 (Press CTRL+C to quit)


INFO:     127.0.0.1:61889 - "GET /films?limit=5 HTTP/1.1" 200 OK
INFO:     127.0.0.1:61892 - "GET /films?genre=Action HTTP/1.1" 200 OK
INFO:     127.0.0.1:61908 - "GET /films?year_from=2022&year_to=2023 HTTP/1.1" 200 OK
INFO:     127.0.0.1:61909 - "GET /films/search?query=Avatar HTTP/1.1" 200 OK
INFO:     127.0.0.1:61910 - "GET /films/avg_rating HTTP/1.1" 200 OK
INFO:     127.0.0.1:61911 - "GET /films/top_rated?limit=5 HTTP/1.1" 200 OK
INFO:     127.0.0.1:61912 - "GET /films/19 HTTP/1.1" 200 OK


In [7]:
# Checks for the FastAPI API
URL = "http://127.0.0.1:7001"

# Requested in this order; each path doubles as the label printed above its response.
check_paths = [
    "/films?limit=5",
    "/films?genre=Action",
    "/films?year_from=2022&year_to=2023",
    "/films/search?query=Avatar",
    "/films/avg_rating",
    "/films/top_rated?limit=5",
    "/films/19",
]

for position, path in enumerate(check_paths):
    # The timeout keeps the cell from hanging indefinitely if the server does not answer.
    resp = requests.get(f"{URL}{path}", timeout=30)
    # Blank line between sections, but not before the first one.
    print(path if position == 0 else f"\n{path}")
    print(resp.json())

/films?limit=5
[{'film_id': 1, 'genres': ['Action', 'Drama'], 'rating': 73.0, 'title': 'Creed III', 'year': 2023, 'actors': None}, {'film_id': 2, 'genres': ['Action', 'Adventure', 'Science Fiction'], 'rating': 78.0, 'title': 'Avatar: The Way of Water', 'year': 2022, 'actors': ['Cliff Curtis', 'Kate Winslet', 'Sam Worthington', 'Sigourney Weaver', 'Stephen Lang', 'Zoe Saldaña']}, {'film_id': 3, 'genres': ['Adventure', 'Animation', 'Comedy', 'Family', 'Fantasy'], 'rating': 76.0, 'title': 'The Super Mario Bros. Movie', 'year': 2023, 'actors': ['Chris Pratt', 'Jack Black', 'Keegan-Michael Key', 'Kevin Michael Richardson', 'Seth Rogen']}, {'film_id': 4, 'genres': ['Adventure', 'Animation', 'Comedy', 'Family', 'Fantasy'], 'rating': 70.0, 'title': 'Mummies', 'year': 2023, 'actors': None}, {'film_id': 5, 'genres': ['Action'], 'rating': 61.0, 'title': 'Supercell', 'year': 2023, 'actors': ['Alec Baldwin']}]

/films?genre=Action
[{'film_id': 1, 'genres': ['Action', 'Drama'], 'rating': 73.0, 'titl