## For this section we'll only validate column names and datatypes

In [1]:
import os
import pandas as pd

In [8]:
%pwd

'/home/shekhar/ml_projects/recSys/research'

In [9]:
%cd ..

/home/shekhar/ml_projects/recSys


In [10]:
%pwd

'/home/shekhar/ml_projects/recSys'

In [11]:
%ls

app.py      Dockerfile  main.py      requirements.txt  setup.py     templates/
artifacts/  LICENSE     params.yaml  research/         src/         test.py
config/     logs/       README.md    schema.yaml       template.py  venv/


In [12]:
# All ingested CSVs live in the same artifacts sub-directory; the paths are
# relative to the project root (the working directory after `%cd ..`).
DATA_DIR = "artifacts/data_ingestion"

path_books = f"{DATA_DIR}/books.csv"
path_ratings = f"{DATA_DIR}/ratings.csv"
genre = f"{DATA_DIR}/book_tags.csv"    # book -> tag assignments
genre_meta = f"{DATA_DIR}/tags.csv"    # tag id -> tag name lookup

## Reading the CSV files

In [14]:
# Load each ingested CSV into its own DataFrame. The path variables are the
# string constants assigned in the previous code cell.
df_books = pd.read_csv(path_books)
df_ratings = pd.read_csv(path_ratings)
df_genre = pd.read_csv(genre)  # `genre` holds the book_tags.csv path
df_genreId = pd.read_csv(genre_meta)  # `genre_meta` holds the tags.csv path

In [8]:
df_books.head(3).T

Unnamed: 0,0,1,2
id,1,2,3
book_id,2767052,3,41865
best_book_id,2767052,3,41865
work_id,2792775,4640799,3212258
books_count,272,491,226
isbn,439023483,439554934,316015849
isbn13,9780439023480.0,9780439554930.0,9780316015840.0
authors,Suzanne Collins,"J.K. Rowling, Mary GrandPré",Stephenie Meyer
original_publication_year,2008.0,1997.0,2005.0
original_title,The Hunger Games,Harry Potter and the Philosopher's Stone,Twilight


### Checking the data types of the books CSV

In [9]:
df_books.columns

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url'],
      dtype='object')

In [10]:
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         10000 non-null  int64  
 1   book_id                    10000 non-null  int64  
 2   best_book_id               10000 non-null  int64  
 3   work_id                    10000 non-null  int64  
 4   books_count                10000 non-null  int64  
 5   isbn                       9300 non-null   object 
 6   isbn13                     9415 non-null   float64
 7   authors                    10000 non-null  object 
 8   original_publication_year  9979 non-null   float64
 9   original_title             9415 non-null   object 
 10  title                      10000 non-null  object 
 11  language_code              8916 non-null   object 
 12  average_rating             10000 non-null  float64
 13  ratings_count              10000 non-null  int6

In [11]:
df_ratings.head(5)

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


### Checking the data types of the ratings CSV

In [12]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981756 entries, 0 to 981755
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   book_id  981756 non-null  int64
 1   user_id  981756 non-null  int64
 2   rating   981756 non-null  int64
dtypes: int64(3)
memory usage: 22.5 MB


In [13]:
df_genre.head(3)

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173


In [14]:
df_genre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999912 entries, 0 to 999911
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   goodreads_book_id  999912 non-null  int64
 1   tag_id             999912 non-null  int64
 2   count              999912 non-null  int64
dtypes: int64(3)
memory usage: 22.9 MB


In [15]:
df_genreId.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [16]:
df_genreId.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34252 entries, 0 to 34251
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tag_id    34252 non-null  int64 
 1   tag_name  34252 non-null  object
dtypes: int64(1), object(1)
memory usage: 535.3+ KB


In [43]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings bundle for the data-validation step.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    and consumed by ``DataValidation``. Being frozen, fields cannot be
    reassigned after construction.
    """

    # Output directory of the validation step; ConfigurationManager creates it
    # before building this object.
    root_dir: Path
    # Path of the text file the per-file validation results are written to.
    STATUS_FILE: str
    # NOTE(review): annotated as Path, but DataValidation.validate_data calls
    # .keys() on this value and indexes it by key, so a mapping of
    # name -> file path appears to be what is actually passed - TODO confirm
    # against config.yaml.
    unzip_data_dir: Path
    # Parsed content of the schema YAML file, indexed by the same keys as
    # unzip_data_dir inside DataValidation.validate_data.
    schema: dict

In [44]:
list(df_genre.columns)

['goodreads_book_id', 'tag_id', 'count']

In [49]:
from recSys.constants import *
from recSys.utils.common import read_yaml, create_directories, validate_data

In [55]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_path=CONFIG_FILE_PATH,
        params_path=PARAMS_FILE_PATH,
        schema_path=SCHEMA_FILE_PATH,
    ):
        """Load the config, params and schema YAML files.

        Args:
            config_path: Location of the main config YAML file.
            params_path: Location of the params YAML file.
            schema_path: Location of the schema YAML file.

        Side effects:
            Creates the directory named by ``artifacts_root`` in the config.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)
        self.schema = read_yaml(schema_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Creates the stage's ``root_dir`` and returns a DataValidationConfig
        filled from the ``data_validation`` section of the config together
        with the full loaded schema.
        """
        section = self.config.data_validation
        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=section.root_dir,
            STATUS_FILE=section.STATUS_FILE,
            unzip_data_dir=section.unzip_data_dir,
            schema=self.schema,
        )


In [37]:
c = ConfigurationManager()

[2023-08-16 21:15:35,301: INFO: common: yaml file: config/config.yaml loaded successfully]
[2023-08-16 21:15:35,309: INFO: common: yaml file: params.yaml loaded successfully]
[2023-08-16 21:15:35,362: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-08-16 21:15:35,371: INFO: common: created directory at: artifacts]


In [68]:
# `logger` is imported here as well so this cell runs on a fresh kernel when
# the notebook is executed top to bottom.
from recSys import logger

cols = list(df_books.columns)[:-2]  # the two trailing image-URL columns are excluded
schema = c.schema.books
file_path = path_books


def check_columns_and_dtypes(df, cols, schema, file_path) -> bool:
    """Check that every schema column exists in the data with the expected dtype.

    Args:
        df: DataFrame holding the loaded file.
        cols: Column names that count as present in the data.
        schema: Mapping of column name -> expected dtype.
        file_path: Path of the source file; only its base name is used in
            log messages.

    Returns:
        True when every schema column is present and its dtype matches,
        False otherwise. All problems are logged, not just the first one.
    """
    file_name = os.path.basename(file_path)
    status_cols = True
    status_dtype = True

    for col in schema.keys():
        if col not in cols:
            status_cols = False
            logger.warning(f"!!! for file {file_name} : {col} not present in data !!!")
            # A missing column has no dtype to compare; indexing df[col]
            # would raise KeyError, so move on to the next schema entry.
            continue
        if df[col].dtype != schema[col]:
            status_dtype = False
            logger.warning(f"!!! for file {file_name} : data mismatch for column {col} !!!")

    return status_cols and status_dtype


check_columns_and_dtypes(df_books, cols, schema, file_path)

False


In [11]:
c.config.data_validation.STATUS_FILE

'artifacts/data_validation/status.txt'

In [56]:
import os
from recSys import logger

In [61]:
class DataValidation:
    """Runs schema validation over every configured data file."""

    def __init__(self, config: DataValidationConfig):
        """Store the validation settings.

        Args:
            config: Settings for this stage (status file, data files, schema).
        """
        self.config = config

    def validate_data(self) -> bool:
        """Validate each configured file and record the results.

        For every key in ``config.unzip_data_dir`` the module-level
        ``validate_data`` helper (imported from ``recSys.utils.common``, not
        this method) is called with the file path and the schema entry stored
        under the same key. One line per file is written to
        ``config.STATUS_FILE``, which is overwritten on each run.

        Returns:
            True only if every file validated successfully.

        Raises:
            Exception: Any error raised while opening the status file or
                validating a file is logged and re-raised unchanged.
        """
        path = self.config.unzip_data_dir
        status_file = self.config.STATUS_FILE
        # Read the schema from this instance's config rather than from a
        # module-level `config` variable that may or may not exist.
        schema = self.config.schema

        status = True

        try:
            # Opening in "w" mode truncates results from any previous run; the
            # same handle is then reused for all per-file lines.
            with open(status_file, "w") as file:
                logger.info(f"Accesed file {status_file} for making validation status")
                for f in path.keys():
                    curr_status = validate_data(path[f], schema[f])
                    file.write(f"Validation status for {f}: {curr_status}\n")
                    status = status and bool(curr_status)

            print(status)
            return status

        except Exception as e:
            logger.exception(e)
            raise
            

In [62]:
# Run the data-validation stage end to end: load the configuration, build the
# stage settings, then validate the data files.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()

    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_data()
except Exception:
    # Nothing is handled here; the original exception propagates unchanged.
    raise

[2023-08-16 21:40:28,541: INFO: common: yaml file: config/config.yaml loaded successfully]
[2023-08-16 21:40:28,548: INFO: common: yaml file: params.yaml loaded successfully]
[2023-08-16 21:40:28,566: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-08-16 21:40:28,569: INFO: common: created directory at: artifacts]
[2023-08-16 21:40:28,572: INFO: common: created directory at: artifacts/data_validation]
[2023-08-16 21:40:28,574: INFO: 1303728464: Accesed file artifacts/data_validation/status.txt for making validation status]
True
