Step 1: Extract Data

In [1]:
import os
import zipfile

import kaggle
import pandas as pd
import sqlalchemy as sa

In [2]:
# Configure the path of the Kaggle API key
kaggle_config_dir = os.path.expanduser('~/.kaggle')
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_config_dir
# NOTE(review): this variable is set after `import kaggle` has already run; if the
# client reads its configuration at import time this assignment has no effect - confirm.

# Download the dataset archive into the current directory. It is kept zipped
# (unzip=False) and extracted in a separate step.
dataset = 'shivamb/netflix-shows'
kaggle.api.dataset_download_files(dataset, path='.', unzip=False)

Dataset URL: https://www.kaggle.com/datasets/shivamb/netflix-shows


In [3]:
# Extract every member of the downloaded archive into the current working directory.
# The context manager closes the archive even if extraction raises, so the file
# handle is never leaked.
with zipfile.ZipFile('netflix-shows.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [2]:
# Load the extracted CSV into a DataFrame and preview its first 20 rows
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
df.head(n=20)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...


In [3]:
# Number of rows loaded from the CSV
df.shape[0]

8807

In [24]:
# Show the single row whose show_id is 's5023'
# (presumably a spot check that non-ASCII titles were read correctly - confirm)
df.loc[df['show_id'] == 's5023']

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
5022,s5023,Movie,반드시 잡는다,Hong-seon Kim,Baek Yoon-sik,South Korea,"February 28, 2018",2017,TV-MA,110 min,"Dramas, International Movies, Thrillers",After people in his town start turning up dead...


In [21]:
# Create the SQLAlchemy engine used to load the dataset into the SQL Server database.
#
# The password is read from the MSSQL_PASSWORD environment variable so that no secret
# is stored in the notebook; a KeyError is raised if it is not set. The password that
# was previously committed here should be rotated.
# URL.create takes the raw password and escapes special characters itself, so the
# value must NOT be percent-encoded by hand.
connection_url = sa.engine.URL.create(
    'mssql+pyodbc',
    username=os.environ.get('MSSQL_USER', 'sa'),
    password=os.environ['MSSQL_PASSWORD'],
    host='localhost',
    port=1433,
    database='SQL_projects',
    query={'driver': 'ODBC Driver 17 for SQL Server'},
)
engine = sa.create_engine(connection_url)
# No connection is opened here: the cell that writes the data opens (and closes)
# its own connection, so an eagerly opened one would only be leaked.

In [8]:
# Measure the longest string representation found in each column.
# Every value is cast to str first, so missing values are measured as their
# string form rather than being skipped.
string_lengths = df.astype(str).apply(lambda column: column.str.len())
max_lengths = string_lengths.max()

print(max_lengths)

show_id           5
type              7
title           104
director        208
cast            771
country         123
date_added       19
release_year      4
rating            8
duration         10
listed_in        79
description     248
dtype: int64


In [22]:
# Declare the target table with explicit column types before inserting any data,
# to avoid data type errors and wasted space.
metadata = sa.MetaData()

# (column name, NVARCHAR width) for every nullable text column, in table order.
# The widths appear to be sized from the maximum string lengths measured above.
text_column_widths = [
    ('type', 10),
    ('title', 110),
    ('director', 210),
    ('cast', 780),
    ('country', 130),
    ('date_added', 20),
    ('release_year', 5),
    ('rating', 10),
    ('duration', 80),
    ('listed_in', 100),
    ('description', 250),
]

# NOTE(review): the variable is called `orders` although it describes the
# `netflix_raw` table; the name is kept so existing references keep working.
orders = sa.Table(
    'netflix_raw',
    metadata,
    # show_id is the primary key of the table
    sa.Column('show_id', sa.Unicode(10), primary_key=True),
    *(
        sa.Column(column_name, sa.Unicode(width), nullable=True)
        for column_name, width in text_column_widths
    ),
)

# Emit CREATE TABLE for every table registered on `metadata`
metadata.create_all(engine)

# Alternatively, the same table can be created directly in SQL Server with a CREATE TABLE statement:
# CREATE TABLE netflix_raw (
#     show_id NVARCHAR(10) PRIMARY KEY,
#     type NVARCHAR(10),
#     title NVARCHAR(110),
#     director NVARCHAR(210),
#     cast NVARCHAR(780),
#     country NVARCHAR(130),
#     date_added NVARCHAR(20),
#     release_year NVARCHAR(5),
#     rating NVARCHAR(10),
#     duration NVARCHAR(80),
#     listed_in NVARCHAR(100),
#     description NVARCHAR(250)
# );

In [25]:
# Append the DataFrame rows to the existing `netflix_raw` table.
# engine.begin() runs the insert inside one transaction: it is committed when the
# block exits normally and rolled back if an error is raised, and the connection is
# closed automatically - no explicit conn.close() is needed afterwards.
# NOTE(review): with if_exists='append', running this cell a second time will try
# to insert the same show_id values again into a table keyed on show_id.
with engine.begin() as conn:
    df.to_sql(name='netflix_raw', con=conn, index=False, if_exists='append')