In [None]:
import subprocess
import sys
from pathlib import Path
import importlib

def _strip_requirement_comment(line: str) -> str:
    """Return `line` without its comment part and without surrounding whitespace.

    A `#` starts a comment when it is the first character of the line or is
    preceded by whitespace. A `#` attached to other text (for example a URL
    fragment such as `#egg=name`) is kept as part of the requirement.
    """
    for idx, ch in enumerate(line):
        if ch == "#" and (idx == 0 or line[idx - 1].isspace()):
            return line[:idx].strip()
    return line.strip()


def install_from_requirements(req_file: str = "requirements.txt"):
    """
    Install every requirement listed in a requirements file, one pip call per entry.

    Blank lines, full-line comments and trailing inline comments
    (whitespace followed by `#`) are ignored. Each remaining entry is passed
    as a single argument to `<sys.executable> -m pip install`, so the packages
    land in the environment of the interpreter running this code.

    Args:
        req_file: Path to the requirements file. Defaults to "requirements.txt"
            in the current working directory.

    Returns:
        None. Progress is reported on stdout.

    Raises:
        SystemExit: With exit code 1 when `req_file` is not an existing file.
        subprocess.CalledProcessError: When pip exits with a non-zero status
            for an entry; remaining entries are not attempted.
    """
    path = Path(req_file)
    if not path.is_file():
        print(f"ERROR: `{req_file}` 파일을 찾을 수 없습니다.")
        sys.exit(1)

    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading BOM, which
    # would otherwise be glued to the first requirement and make pip reject it.
    with path.open(encoding="utf-8-sig") as f:
        pkgs = [spec for spec in map(_strip_requirement_comment, f) if spec]

    if not pkgs:
        print("설치할 패키지가 없습니다.")
        return

    for spec in pkgs:
        print(f"Installing {spec} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", spec])


In [2]:
# Install the packages listed in the default "requirements.txt" (looked up in the
# current working directory) before the analysis imports below are executed.
install_from_requirements()

Installing numpy==1.26.0 ...
Installing pandas>=2.2.3 ...
Installing matplotlib>=3.10.1 ...
Installing notebook ...
Installing seaborn==0.13.2 ...


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
# Load the movie dataset; header=0 takes the column names from the first CSV row.
df = pd.read_csv('./Data Files/Movie_regression.csv', header=0)

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Marketing expense    506 non-null    float64
 1   Production expense   506 non-null    float64
 2   Multiplex coverage   506 non-null    float64
 3   Budget               506 non-null    float64
 4   Movie_length         506 non-null    float64
 5   Lead_ Actor_Rating   506 non-null    float64
 6   Lead_Actress_rating  506 non-null    float64
 7   Director_rating      506 non-null    float64
 8   Producer_rating      506 non-null    float64
 9   Critic_rating        506 non-null    float64
 10  Trailer_views        506 non-null    int64  
 11  3D_available         506 non-null    object 
 12  Time_taken           494 non-null    float64
 13  Twitter_hastags      506 non-null    float64
 14  Genre                506 non-null    object 
 15  Avg_age_actors       506 non-null    int

In [12]:
# Missing-value imputation: compute the replacement value for Time_taken
avg = df['Time_taken'].mean()             # mean of the Time_taken column
print(avg)

157.3914979757085


In [19]:
# Fill the missing Time_taken values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Time_taken']: that form is chained assignment
# through an inplace method, which pandas 2.2 flags with a FutureWarning and
# which no longer updates df once Copy-on-Write is active.
df['Time_taken'] = df['Time_taken'].fillna(value=avg)

In [None]:
# Try saving the frame as an Excel workbook (row index is not written)
# NOTE(review): openpyxl is not referenced by name below — presumably imported
# only to fail early if the Excel writer dependency is missing; confirm.
import openpyxl
df.to_excel("data.xlsx", index=False)

In [None]:
# Re-check the frame summary to confirm the values were filled (non-null row counts)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Marketing expense    506 non-null    float64
 1   Production expense   506 non-null    float64
 2   Multiplex coverage   506 non-null    float64
 3   Budget               506 non-null    float64
 4   Movie_length         506 non-null    float64
 5   Lead_ Actor_Rating   506 non-null    float64
 6   Lead_Actress_rating  506 non-null    float64
 7   Director_rating      506 non-null    float64
 8   Producer_rating      506 non-null    float64
 9   Critic_rating        506 non-null    float64
 10  Trailer_views        506 non-null    int64  
 11  3D_available         506 non-null    object 
 12  Time_taken           506 non-null    float64
 13  Twitter_hastags      506 non-null    float64
 14  Genre                506 non-null    object 
 15  Avg_age_actors       506 non-null    int