In [98]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from tqdm import tqdm # 터미널에서 진행상황을 알려주는 라이브러리 
import time 



In [99]:
# imdb 영화 상세화면 url 추가 함수: 예) https://www.imdb.com/title/tt0114709/

def add_url(row):
    return f'https://www.imdb.com/title/tt{row}/'

# add rating 함수 작성 
def add_rating(df):
    ratings_df = pd.read_csv('movie_data/ratings.csv')
    ratings_df['movieId']= ratings_df['movieId'].astype(str)
    
# imdb 영화 상세화면 url 추가 함수: 예) https://www.imdb.com/title/tt0114709/
# groupby 참조: https://velog.io/@euisuk-chung/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EB%A7%88%EC%8A%A4%ED%84%B0%ED%95%98%EA%B8%B0-GROUP-BY
# agg 참조: https://ysyblog.tistory.com/301
    agg_df = ratings_df.groupby('movieId').agg(
        rating_count=('rating', 'count'),
        rating_avg=('rating', 'mean')
    ).reset_index()

    agg_df['rating_avg'] = agg_df['rating_avg'].round(2)

    rating_added_df = df.merge(agg_df, on='movieId' , how='left')
    return rating_added_df

def add_poster(df):
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        tmdb_id=row['tmdbId']
        tmdb_url=f"https://api.themoviedb.org/3/movie/{tmdb_id}?api_key=c67745c88102e9fe38714f12534e29cd&language=en-US"
        result=requests.get(tmdb_url)
        
        try:
            df.at[i, 'poster_path'] = "https://image.tmdb.org/t/p/original" + result.json()['poster_path']
            time.sleep(0.1) # 각각 데이터가 0.1초마다 생성 
        except (TypeError, KeyError) as e:
            df.at[i, "poster_path"] = "https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg"
    return df


if __name__ == '__main__': # 파이선 파일이 실행되쓸떄 호출 
    movies_df = pd.read_csv('movie_data/movies.csv')
    links_df = pd.read_csv('movie_data/links.csv', dtype=str)
        
    # print (movies_df)
    # astype() 안에 dictionary 형태로 컬럼 명과 바꿀 데이터 테입을 입력 
    movies_df['movieId'] = movies_df['movieId'].astype(str)

    # 2개의 데이터 셋을 조인으로 합치기 on 은 어떤 컬럼 merge 할지 joinkey 설정 how 는 왼쪽으로 movie_df 기준 머지 실행 

    merged_df = movies_df.merge(links_df, on='movieId', how='left')


    # url 추가 적용 

    merged_df['url'] = merged_df['imdbId'].apply(lambda x: add_url(x))

    # result_df = add_rating(merged_df)
    # result_df['poster_path'] = None
    # result_df = add_poster(result_df)
    
    # result_df.to_csv('movie_data/movie_final.csv', index=None)
    

In [100]:
# print(merged_df.iloc[0,:])
print(merged_df.shape)

(9742, 6)


In [101]:
ratings_df = pd.read_csv('movie_data/ratings.csv')
ratings_df['movieId']= ratings_df['movieId'].astype(str)

agg_df = ratings_df.groupby('movieId').agg(
    rating_count=('rating', 'count'),
    rating_avg=('rating', 'mean')
).reset_index()

print(ratings_df)

        userId movieId  rating   timestamp
0            1       1     4.0   964982703
1            1       3     4.0   964981247
2            1       6     4.0   964982224
3            1      47     5.0   964983815
4            1      50     5.0   964982931
...        ...     ...     ...         ...
100831     610  166534     4.0  1493848402
100832     610  168248     5.0  1493850091
100833     610  168250     5.0  1494273047
100834     610  168252     5.0  1493846352
100835     610  170875     3.0  1493846415

[100836 rows x 4 columns]


In [102]:
# TMDB API: c67745c88102e9fe38714f12534e29cd
# https://api.themoviedb.org/3/movie/{tmdb_id}?api_key=c67745c88102e9fe38714f12534e29cd&language=en-US


In [103]:
final_df = pd.read_csv('movie_data/movie_final.csv')


In [104]:
item_frame = 'movie_data/movie_final.csv'

def random_items(n):
    movies_df = pd.read_csv(item_frame)
    movies_df = movies_df.fillna('') #결측치 공백처리 
    result_items = movies_df.sample(n=n).to_dict('records')
    return result_items

random_items(3)

[{'movieId': 32139,
  'title': 'Agony and the Ecstasy, The (1965)',
  'genres': 'Drama',
  'imdbId': 58886,
  'tmdbId': 36815.0,
  'url': 'https://www.imdb.com/title/tt0058886',
  'rating_count': 2.0,
  'rating_avg': 4.0,
  'poster_path': 'https://image.tmdb.org/t/p/original/4ZiwyXtyxYDmMnNiVemCPgxY8X6.jpg'},
 {'movieId': 7745,
  'title': "Scent of Green Papaya, The (Mùi du du xhan - L'odeur de la papaye verte) (1993)",
  'genres': 'Drama',
  'imdbId': 107617,
  'tmdbId': 19552.0,
  'url': 'https://www.imdb.com/title/tt0107617',
  'rating_count': 2.0,
  'rating_avg': 4.0,
  'poster_path': 'https://image.tmdb.org/t/p/original/wh1JFM7myCSJvtaPX30sWYX2q66.jpg'},
 {'movieId': 2751,
  'title': 'From the Hip (1987)',
  'genres': 'Comedy|Drama',
  'imdbId': 93051,
  'tmdbId': 24081.0,
  'url': 'https://www.imdb.com/title/tt0093051',
  'rating_count': 2.0,
  'rating_avg': 4.0,
  'poster_path': 'https://image.tmdb.org/t/p/original/jkwzakMmaasylJc2Uj9LvL8bSOk.jpg'}]

In [105]:
import re # 정규 표현식 libraries

def latest_items(n):
    movies_df = pd.read_csv(item_frame)
    movies_df = movies_df.fillna('') #결측치 공백 처리 
    
    # 영화 제목에서 연도를 추출하는 함수 (예: "squid game(2021)" -> 2021)
    def extract_year(title):
        match = re.search(r'\((\d{4})\)', title)
        if match:
            return int(match.group(1))  # return match.group(1) # 정규 표현식에서 만든 객체중 첫 값만 출력 
        return None # 연도가 없으면 none 반환 
    
    # 연도 정보를 추출해서 새로운 'year' 컬럼에 추가 
    movies_df['year'] = movies_df['title'].apply(extract_year)
    
    # 연도를 기준으로 내림차순으로 정렬하고 최신 n개 항목 선택 
    latest_movie_df = movies_df.sort_values(by='year', ascending=False).head(n) # year 칼럼 기준으로 내림차순 (ascending=False)
    
    # 5개의 latest movie return
    result_items = latest_movie_df.to_dict('records')
    return result_items
    
    # return extract_year('squidgame (2021)')
    
# print(latest_items())
latest_items(5)

[{'movieId': 183295,
  'title': 'Insidious: The Last Key (2018)',
  'genres': 'Horror|Mystery|Thriller',
  'imdbId': 5726086,
  'tmdbId': 406563.0,
  'url': 'https://www.imdb.com/title/tt5726086',
  'rating_count': 2.0,
  'rating_avg': 3.5,
  'poster_path': 'https://image.tmdb.org/t/p/original/nb9fc9INMg8kQ8L7sE7XTNsZnUX.jpg',
  'year': 2018.0},
 {'movieId': 184987,
  'title': 'A Wrinkle in Time (2018)',
  'genres': 'Adventure|Children|Fantasy|Sci-Fi',
  'imdbId': 1620680,
  'tmdbId': 407451.0,
  'url': 'https://www.imdb.com/title/tt1620680',
  'rating_count': 2.0,
  'rating_avg': 3.0,
  'poster_path': 'https://image.tmdb.org/t/p/original/yAcb58vipewa1BfNit2RjE6boXA.jpg',
  'year': 2018.0},
 {'movieId': 187717,
  'title': "Won't You Be My Neighbor? (2018)",
  'genres': 'Documentary',
  'imdbId': 7681902,
  'tmdbId': 490003.0,
  'url': 'https://www.imdb.com/title/tt7681902',
  'rating_count': 2.0,
  'rating_avg': 5.0,
  'poster_path': 'https://image.tmdb.org/t/p/original/8qE8NZjiP2M884b

In [106]:
def genres_items(genre, n):
    movies_df = pd.read_csv(item_frame)
    genre_df = movies_df.fillna('') #결측치 공백 처리 
    genre_df = genre_df[genre_df['genres'].str.contains(genre, case=False, na=False)] # case = false 대소문자 구분안함  na false 결측치 상관안함 
    
    result_items = genre_df.sample(n=n).to_dict('records')
    return result_items

genres_items('Drama', 5)

[{'movieId': 2710,
  'title': 'Blair Witch Project, The (1999)',
  'genres': 'Drama|Horror|Thriller',
  'imdbId': 185937,
  'tmdbId': 2667.0,
  'url': 'https://www.imdb.com/title/tt0185937',
  'rating_count': 98.0,
  'rating_avg': 2.76,
  'poster_path': 'https://image.tmdb.org/t/p/original/9050VGrYjYrEjpOvDZVAngLbg1f.jpg'},
 {'movieId': 55067,
  'title': 'Requiem (2006)',
  'genres': 'Drama|Thriller',
  'imdbId': 454931,
  'tmdbId': 523.0,
  'url': 'https://www.imdb.com/title/tt0454931',
  'rating_count': 2.0,
  'rating_avg': 3.5,
  'poster_path': 'https://image.tmdb.org/t/p/original/aJ34ByMpzlXFKKdL8JYNhdWH5R5.jpg'},
 {'movieId': 2455,
  'title': 'Fly, The (1986)',
  'genres': 'Drama|Horror|Sci-Fi|Thriller',
  'imdbId': 91064,
  'tmdbId': 9426.0,
  'url': 'https://www.imdb.com/title/tt0091064',
  'rating_count': 71.0,
  'rating_avg': 3.38,
  'poster_path': 'https://image.tmdb.org/t/p/original/8gZWMhJHRvaXdXsNhERtqNHYpH3.jpg'},
 {'movieId': 4055,
  'title': 'Panic (2000)',
  'genres': 