In [None]:
{
 "cells": [
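  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pipeline ETL - IMDB TV Shows\n",
    "\n",
    "Este notebook executa a extração, transformação e carga (ETL) dos dados do arquivo CSV `imdb_top_5000_tv_shows.csv`: lê o arquivo, converte avaliação e votos para valores numéricos, remove registros sem avaliação e registros duplicados, e salva o resultado em um novo arquivo CSV."
   ]
  },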
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Extract: load the raw CSV file into a DataFrame.\n",
    "file_path = '/mnt/data/imdb_top_5000_tv_shows.csv'\n",
    "df = pd.read_csv(filepath_or_buffer=file_path)\n",
    "\n",
    "# Report the (rows, columns) shape, then preview the first five rows.\n",
    "print('Linhas e Colunas:', df.shape)\n",
    "print(df.head(n=5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transform: coerce the numeric columns; unparseable values become NaN.\n",
    "df['imdbRating'] = pd.to_numeric(df['imdbRating'], errors='coerce')\n",
    "df['votes'] = pd.to_numeric(df['votes'], errors='coerce')\n",
    "\n",
    "# Drop records that have no rating.\n",
    "df = df.dropna(subset=['imdbRating'])\n",
    "\n",
    "# Remove duplicates BEFORE building the list column: drop_duplicates() hashes\n",
    "# every compared cell and Python lists are unhashable (TypeError). Excluding\n",
    "# 'genre_list' from the comparison also keeps this cell safe to re-run.\n",
    "# The explicit copy makes the column assignment below write to an independent\n",
    "# frame (avoids SettingWithCopyWarning on pandas versions without copy-on-write).\n",
    "dedup_cols = [c for c in df.columns if c != 'genre_list']\n",
    "df = df.drop_duplicates(subset=dedup_cols).copy()\n",
    "\n",
    "# Split the comma-separated genres string into a list of stripped names;\n",
    "# non-string values (e.g. NaN) become an empty list.\n",
    "if 'genres' in df.columns:\n",
    "    df['genre_list'] = df['genres'].apply(lambda x: [g.strip() for g in x.split(',')] if isinstance(x, str) else [])\n",
    "\n",
    "# info() prints its report itself and returns None, so it is not wrapped in print().\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load: persist the transformed DataFrame as CSV, leaving out the index column.\n",
    "output_path = '/mnt/data/imdb_top_5000_tv_shows_transformado.csv'\n",
    "df.to_csv(path_or_buf=output_path, index=False)\n",
    "print('Dados transformados salvos em:', output_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.x"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
