# Machine Learning Pipeline - Sarcasm Detection Pipeline V1 (Base)

## Import Required Libraries

In [1]:
import tensorflow as tf
import tensorflow_transform as tft

import json

import pandas as pd
import zipfile as zf
import os

from tfx.components import CsvExampleGen, StatisticsGen, SchemaGen, ExampleValidator, Transform, Trainer, Tuner
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

from tfx.dsl.input_resolution.strategies.latest_blessed_model_strategy import LatestBlessedModelStrategy 
from tfx.dsl.components.common.resolver import Resolver 


from tfx.types.standard_artifacts import Model, ModelBlessing
from tfx.types import Channel 

from tfx.components import Tuner
from tfx.components import Evaluator

from tfx.proto import trainer_pb2
from tfx.proto import example_gen_pb2

## Data Configuration

### Download Dataset

In [2]:
!kaggle datasets download -d rmisra/news-headlines-dataset-for-sarcasm-detection

!mkdir raw




  0%|          | 0.00/3.30M [00:00<?, ?B/s]
 30%|███       | 1.00M/3.30M [00:25<00:57, 41.6kB/s]
 61%|██████    | 2.00M/3.30M [00:25<00:13, 99.0kB/s]
 91%|█████████ | 3.00M/3.30M [00:25<00:01, 177kB/s] 
100%|██████████| 3.30M/3.30M [00:26<00:00, 133kB/s]



Dataset URL: https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading news-headlines-dataset-for-sarcasm-detection.zip to c:\Users\Rahfi\MLOps-Pipeline-Projects\SarcasmDetectionV1-Pipeline



In [4]:
# Move the downloaded archive into raw/ using the standard library instead of
# the Windows-only `move` shell builtin, so the cell also runs on Linux/macOS.
# os.replace overwrites an existing destination, which keeps re-runs working.
os.replace(
    "news-headlines-dataset-for-sarcasm-detection.zip",
    os.path.join("raw", "news-headlines-dataset-for-sarcasm-detection.zip"),
)

        1 file(s) moved.


### Extract File

In [5]:
# Location of the downloaded Kaggle archive.
files = "raw/news-headlines-dataset-for-sarcasm-detection.zip"

# The context manager closes the archive even if extraction raises, and the
# handle is named `archive` so the built-in `zip` is not shadowed for the
# rest of the notebook.
with zf.ZipFile(files, 'r') as archive:
    archive.extractall('raw/')

### Data Conversion

In [6]:
# Create the "data" directory; exist_ok=True keeps this cell re-runnable.
os.makedirs("data", exist_ok=True)

# JSON-lines files to load. List each file exactly once: a repeated path
# would load every record twice and duplicate all rows in `df`.
# NOTE(review): if Sarcasm_Headlines_Dataset.json (v1) was also meant to be
# loaded, add it to this list — confirm against the intended dataset.
# os.path.join keeps the paths valid on non-Windows systems as well.
file_paths = [os.path.join("raw", "Sarcasm_Headlines_Dataset_v2.json")]

# Each non-blank line of a file is parsed as one standalone JSON object.
data_list = []
for file_path in file_paths:
    # Explicit UTF-8 avoids depending on the platform's default encoding.
    with open(file_path, 'r', encoding="utf-8") as file:
        # Blank lines (e.g. a trailing newline) are skipped instead of
        # crashing json.loads.
        data_list.extend(json.loads(line) for line in file if line.strip())

# Flatten the list of record dicts into a tabular DataFrame.
df = pd.json_normalize(data_list)

### Export Data

In [7]:
# Persist the DataFrame as CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="raw/data.csv", index=False)

### Data Loading

In [8]:
# Read the exported CSV back into a DataFrame named `dataset`.
dataset = pd.read_csv(filepath_or_buffer="raw/data.csv")

In [9]:
# Show the first five rows as a quick sanity check of the loaded data.
dataset.head(n=5)

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
