In [1]:
!pip install great_expectations praw kagglehub pandas yfinance

Collecting great_expectations
  Downloading great_expectations-1.3.7-py3-none-any.whl.metadata (8.5 kB)
Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting altair<5.0.0,>=4.2.1 (from great_expectations)
  Downloading altair-4.2.2-py3-none-any.whl.metadata (13 kB)
Collecting marshmallow<4.0.0,>=3.7.1 (from great_expectations)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting posthog<4,>3 (from great_expectations)
  Downloading posthog-3.15.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting ruamel.yaml>=0.16 (from great_expectations)
  Downloading ruamel.yaml-0.18.10-py3-none-any.whl.metadata (23 kB)
Collecting pandas
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadat

In [2]:
import csv
import praw
import kagglehub
import pandas as pd
import yfinance as yf
import great_expectations as gx
from kagglehub import KaggleDatasetAdapter

In [3]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv"
)

In [15]:
df.tail()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
9995,2,2019-01-02 07:48:44,2019-01-02 08:00:13,6,1.07,1,N,50,161,2,8.5,0.0,0.5,0.0,0.0,0.3,9.3,
9996,2,2019-01-16 19:06:45,2019-01-16 19:10:05,6,0.35,1,N,234,234,1,4.0,1.0,0.5,1.16,0.0,0.3,6.96,
9997,2,2019-01-02 09:10:44,2019-01-02 09:36:46,6,4.12,1,N,50,236,1,20.0,0.0,0.5,6.24,0.0,0.3,27.04,
9998,2,2019-01-03 13:28:36,2019-01-03 13:36:42,6,1.17,1,N,137,234,1,7.0,0.0,0.5,0.9,0.0,0.3,8.7,
9999,2,2019-01-26 17:48:59,2019-01-26 18:03:01,6,2.44,1,N,161,236,2,11.0,0.0,0.5,0.0,0.0,0.3,11.8,0.0


# Using Great Expectations

In [11]:
## Example Setup

context = gx.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})

expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="passenger_count", min_value=1, max_value=6
)

validation_result = batch.validate(expectation)

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpllksh2pq' for ephemeral docs site


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

In [12]:
validation_result.describe_dict()

{'expectation_type': 'expect_column_values_to_be_between',
 'success': True,
 'kwargs': {'batch_id': 'pandas-pd dataframe asset',
  'column': 'passenger_count',
  'min_value': 1.0,
  'max_value': 6.0},
 'result': {'element_count': 10000,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'partial_unexpected_list': [],
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_percent_total': 0.0,
  'unexpected_percent_nonmissing': 0.0,
  'partial_unexpected_counts': [],
  'partial_unexpected_index_list': []}}

#**To-do:**

1.   Fetch any dataset from online source. I'll recommend using reddit API used in the assignment #1.
2.   Run 5 expectations of your choice to validate the dataset. They should cover row, column, multi-column, table, distribution operation.
3.   Submit the notebook file on LMS before Monday 11:55 PM.
4.   Mention your group number in the name of the file.



In [None]:
def get_reddit_data(client, secret, agent):
    reddit = praw.Reddit(client_id =client,
                         client_secret =secret,
                         user_agent = agent)
    subreddits = reddit.subreddits.search_by_name('crypto')
    titles=[]
    texts=[]
    authors=[]
    dates=[]
    upvotes=[]
    subreddit_names=[]
    for subreddit in subreddits:
        for submission in reddit.subreddit(subreddit.display_name).hot(limit=50):
            titles.append(submission.title)
            texts.append(submission.selftext)
            authors.append(submission.author)
            dates.append(submission.created_utc)
            upvotes.append(submission.score)
            subreddit_names.append(subreddit.display_name)


    reddit_df = pd.DataFrame({
        'Subreddit': subreddit_names,
        'Title': titles,
        'Author': authors,
        'Upvotes': upvotes,
        'Datetime': dates,
        'Text': texts
    })
    return reddit_df

def preprocess_reddit(reddit_df):
    reddit_df.dropna(inplace=True)
    reddit_df = reddit_df[reddit_df['Text'].str.strip() != '']
    reddit_df.reset_index(drop=True)
    return reddit_df

def summarize(df):
    print(df.head())

def save_to_csv(df, filename):
    header = df.columns.tolist()
    rows = df.values.tolist()

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(rows)

df = get_reddit_data(client ='ic70bWPRnEJzRQnZBAGdCQ',
                    secret ='IgdtzNFf1It0ddCaPVRlqzWqBNv2lQ',
                    agent ='DataEng')
df = preprocess_reddit(df)
summarize(df)
#save_to_csv(df, 'reddit.csv')

In [17]:
context = gx.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})

expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="passenger_count", min_value=1, max_value=6
)

validation_result = batch.validate(expectation)

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpi3wnqicz' for ephemeral docs site


In [19]:
df.head()

Unnamed: 0,Subreddit,Title,Author,Upvotes,Datetime,Text
0,crypto,[Meta] Regarding the future of the subreddit,Natanael_L,105,1686522000.0,A bit late notice compared to a lot of the oth...
3,crypto,Weekly cryptography community and meta thread,AutoModerator,2,1740395000.0,Welcome to /r/crypto's weekly community thread...
6,crypto,Seeking References on Constraint Optimization ...,Accomplished-One-289,4,1740327000.0,"Hello everyone,\n\nI am a university student c..."
9,crypto,How far can i push close-source code towards b...,Accurate-Screen8774,4,1740130000.0,im familiar with Kerckhoffs principle and the ...
13,crypto,How much of Coding Theory needs to be learnt f...,HenryDaHorse,7,1739963000.0,I don't know Coding Theory at all - not even H...


In [36]:
df.Upvotes.mean()

57.50809061488673

In [25]:
type_expectation = gx.expectations.ExpectColumnValuesToBeOfType(
    column="Upvotes", type_="int"
)

type_validation_result = batch.validate(type_expectation)
print(type_validation_result.describe_dict())

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{'expectation_type': 'expect_column_values_to_be_of_type', 'success': True, 'kwargs': {'batch_id': 'pandas-pd dataframe asset', 'column': 'Upvotes', 'type_': 'int'}, 'result': {'observed_value': 'int64'}}


In [29]:
regex_expectation = gx.expectations.ExpectColumnValuesToMatchRegex(
    column="Text",
    regex=r"\S"
)

regex_result = batch.validate(regex_expectation)
print(regex_result.describe_dict())

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{'expectation_type': 'expect_column_values_to_match_regex', 'success': True, 'kwargs': {'batch_id': 'pandas-pd dataframe asset', 'column': 'Text', 'regex': '\\S'}, 'result': {'element_count': 309, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': [], 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_percent_total': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_counts': [], 'partial_unexpected_index_list': []}}


In [None]:
multi_col_expectation = gx.expectations.ExpectMulticolumnValuesToBeUnique(
    column_list=["Author", "Title", "Text"], catch_exceptions=False
)

multicol_validation_result = batch.validate(multi_col_expectation)

In [34]:
row_expectation = gx.expectations.ExpectTableRowCountToBeBetween(min_value=100, max_value=1000)

row_validation_result = batch.validate(row_expectation)
print(row_validation_result.describe_dict())

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{'expectation_type': 'expect_table_row_count_to_be_between', 'success': True, 'kwargs': {'batch_id': 'pandas-pd dataframe asset', 'min_value': 100, 'max_value': 1000}, 'result': {'observed_value': 309}}


In [35]:
table_col_expectation = gx.expectations.ExpectTableColumnCountToBeBetween(min_value=100, max_value=1000)

table_col_validation = batch.validate(table_col_expectation)
print(table_col_validation.describe_dict())

Calculating Metrics:   0%|          | 0/3 [00:00<?, ?it/s]

{'expectation_type': 'expect_table_column_count_to_be_between', 'success': False, 'kwargs': {'batch_id': 'pandas-pd dataframe asset', 'min_value': 100.0, 'max_value': 1000.0}, 'result': {'observed_value': 6}}


In [37]:
distribution_expectation = gx.expectations.ExpectColumnMeanToBeBetween(column = 'Upvotes', min_value=10, max_value=100)

distribution_validation = batch.validate(distribution_expectation)
print(distribution_validation.describe_dict())

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{'expectation_type': 'expect_column_mean_to_be_between', 'success': True, 'kwargs': {'batch_id': 'pandas-pd dataframe asset', 'column': 'Upvotes', 'min_value': 10.0, 'max_value': 100.0}, 'result': {'observed_value': 57.50809061488673}}
