In [82]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from datetime import datetime


## Load Dataset

In [71]:
FILE_NAME = 'cleaned_dataset.csv'
df = pd.read_csv(FILE_NAME)

## Null Values
Checking for null values in each column to evaluate Completeness and Count of Null


In [72]:
null_counts = df.isnull().sum()

number_of_records = len(df)
number_of_records

10840

In [73]:
results_df = pd.DataFrame(df.columns, columns=['Feature Name'])
results_df['Count of Null'] = results_df['Feature Name'].apply(lambda x: null_counts[x])
results_df['Number of Records'] = number_of_records

In [74]:
# Placeholder values for qualitative evaluations
for factor in ['Consistency', 'Currentness', 'Validity', 'Completeness', 'Accuracy']:
    results_df[factor] = "To be evaluated"

## Validation

### Logic

In [79]:
def validate_app(app):
    return isinstance(app, str)


def validate_category(category):
    return isinstance(category, str)


def validate_rating(rating):
    return 0.0 <= rating <= 5.0


def validate_reviews(reviews):
    try:
        return int(reviews) >= 0
    except ValueError:
        return False


def validate_size(size):
    pattern = r'^\d+(\.\d+)?[MG]$'
    if re.match(pattern, size):
        return True
    else:
        return False


def validate_installs(installs):
    return isinstance(installs, float) and installs > 0


def validate_type(dtype):
    return isinstance(dtype, str)



def validate_price(price):
    try:
        price_value = float(price)
        return price_value >= 0
    except ValueError:
        return False


def validate_content_rating(content_rating):
    return isinstance(content_rating, str)


def validate_genres(genres):
    return isinstance(genres, str)


def validate_last_updated(date_str):
    try:
        # Try to parse the date string using the specified format
        datetime.strptime(date_str, '%d-%b-%y')
        return True  # Parsing succeeded
    except ValueError:
        return False  # Parsing failed

### Apply Logic and Percentage Calulation

In [83]:
df['App_Valid']           = df['App'].apply(validate_app)
df['Category_Valid']      = df['Category'].apply(validate_category)
df['Rating_Valid']        = df['Rating'].apply(validate_rating)
df['Reviews_Valid']       = df['Reviews'].apply(validate_reviews)
df['Size_Valid']          = df['Size'].apply(validate_size)
df['Installs_Valid']      = df['Installs'].apply(validate_installs)
df['Type_Valid']          = df['Type'].apply(validate_type)
df['Price_Valid']         = df['Price'].apply(validate_price)
df['ContentRating_Valid'] = df['Content Rating'].apply(validate_content_rating)
df['Genres_Valid']        = df['Genres'].apply(validate_genres)
df['LastUpdated_Valid']    = df['Genres'].apply(validate_last_updated)

In [85]:
validity_percentages = {
    'App': df['App_Valid'].mean() * 100,
    'Category': df['Category_Valid'].mean() * 100,
    'Rating': df['Rating_Valid'].mean() * 100,
    'Reviews': df['Reviews_Valid'].mean() * 100,
    'Size': df['Size_Valid'].mean() * 100,
    'Installs': df['Installs_Valid'].mean() * 100,
    'Type': df['Type_Valid'].mean() * 100,
    'Price': df['Price_Valid'].mean() * 100,
    'Content_Rating': df['ContentRating_Valid'].mean() * 100,
    'Genres': df['Genres_Valid'].mean() * 100,
    'Last_Updated': df['LastUpdated_Valid'].mean() * 100,
}
validity_percentages

{'App': 100.0,
 'Category': 100.0,
 'Rating': 86.40221402214023,
 'Reviews': 100.0,
 'Size': 81.44833948339483,
 'Installs': 99.86162361623616,
 'Type': 99.99077490774908,
 'Price': 92.619926199262,
 'Content_Rating': 100.0,
 'Genres': 100.0,
 'Last_Updated': 0.0}

In [86]:
results_df.at[1, 'Validity'] = validity_percentages['App']
results_df.at[2, 'Validity'] = validity_percentages['Category']
results_df.at[3, 'Validity'] = validity_percentages['Rating']
results_df.at[4, 'Validity'] = validity_percentages['Reviews']
results_df.at[5, 'Validity'] = validity_percentages['Size']
results_df.at[6, 'Validity'] = validity_percentages['Installs']
results_df.at[7, 'Validity'] = validity_percentages['Type']
results_df.at[8, 'Validity'] = validity_percentages['Price']
results_df.at[9, 'Validity'] = validity_percentages['Content_Rating']
results_df.at[10, 'Validity'] = validity_percentages['Genres']
results_df.at[11, 'Validity'] = validity_percentages['Last_Updated']

In [87]:
results_df

Unnamed: 0,Feature Name,Count of Null,Number of Records,Consistency,Currentness,Validity,Completeness,Accuracy
0,Unnamed: 0,0,10840,To be evaluated,To be evaluated,To be evaluated,To be evaluated,To be evaluated
1,App,0,10840,To be evaluated,To be evaluated,100.0,To be evaluated,To be evaluated
2,Category,0,10840,To be evaluated,To be evaluated,100.0,To be evaluated,To be evaluated
3,Rating,1474,10840,To be evaluated,To be evaluated,86.402214,To be evaluated,To be evaluated
4,Reviews,0,10840,To be evaluated,To be evaluated,100.0,To be evaluated,To be evaluated
5,Size,0,10840,To be evaluated,To be evaluated,81.448339,To be evaluated,To be evaluated
6,Installs,0,10840,To be evaluated,To be evaluated,99.861624,To be evaluated,To be evaluated
7,Type,1,10840,To be evaluated,To be evaluated,99.990775,To be evaluated,To be evaluated
8,Price,0,10840,To be evaluated,To be evaluated,92.619926,To be evaluated,To be evaluated
9,Content Rating,0,10840,To be evaluated,To be evaluated,100.0,To be evaluated,To be evaluated
