In [1]:
import pandas as pd
import numpy as np

In [2]:
# Create sample dataset
data = {
    "Name": ["Saee_Mane", "Kishori_IT", "Charlie_Finance", "David_HR", "Shravani_HR"],
    "Age": [21, 20, 25, 26, np.nan],
    "Department": ["HR", "IT", "Finance", np.nan, "HR"],
    "Salary": [100000, 50000, 30000, 10000, 45000],
    "Status": ["Adult", "Minor", "Adult", "Adult", "Adult"],
    "Promoted Salary": [1000000, 500000, 300000, 100000, 450000]
}

df = pd.DataFrame(data)

print(df)

              Name   Age Department  Salary Status  Promoted Salary
0        Saee_Mane  21.0         HR  100000  Adult          1000000
1       Kishori_IT  20.0         IT   50000  Minor           500000
2  Charlie_Finance  25.0    Finance   30000  Adult           300000
3         David_HR  26.0        NaN   10000  Adult           100000
4      Shravani_HR   NaN         HR   45000  Adult           450000


# Dataset Health Check

In [3]:
def data_health1(df):
    report = {}
    report['shape'] = df.shape
    report['Columns'] = df.columns
    report['missing_values']  = df.isnull().sum()
    report['data_types'] = df.dtypes
    report['memeory_usage_MB'] = round(df.memory_usage(deep = True).sum()/1024**2 , 2)

    return report

In [4]:
d = data_health1(df)
d

{'shape': (5, 6),
 'Columns': Index(['Name', 'Age', 'Department', 'Salary', 'Status', 'Promoted Salary'], dtype='object'),
 'missing_values': Name               0
 Age                1
 Department         1
 Salary             0
 Status             0
 Promoted Salary    0
 dtype: int64,
 'data_types': Name                object
 Age                float64
 Department          object
 Salary               int64
 Status              object
 Promoted Salary      int64
 dtype: object,
 'memeory_usage_MB': np.float64(0.0)}

In [5]:
def data_health2(df):
    return {
        "rows": df.shape[0],
        "columns": df.shape[1],
        "missing_values": df.isnull().sum().to_dict(),
        "duplicates": int(df.duplicated().sum())
    }
res = data_health2(df)
res

{'rows': 5,
 'columns': 6,
 'missing_values': {'Name': 0,
  'Age': 1,
  'Department': 1,
  'Salary': 0,
  'Status': 0,
  'Promoted Salary': 0},
 'duplicates': 0}

In [6]:
df.dtypes

Name                object
Age                float64
Department          object
Salary               int64
Status              object
Promoted Salary      int64
dtype: object

In [7]:
df["Age"] = df["Age"].fillna(0).astype(int)


def column_summary(df):
    return{
        "Age": df['Age'],
        "Salary": df['Salary'],
        "Department":df["Department"],
        "Status":df["Status"] ,
        "Mean Salary": df["Salary"].mean(),
        "Max Salary": df["Salary"].max(),
        "Min Salary": df["Salary"].min(),
        "Most Common Department": df["Department"].mode()[0]

    }
c = column_summary(df)
c

{'Age': 0    21
 1    20
 2    25
 3    26
 4     0
 Name: Age, dtype: int64,
 'Salary': 0    100000
 1     50000
 2     30000
 3     10000
 4     45000
 Name: Salary, dtype: int64,
 'Department': 0         HR
 1         IT
 2    Finance
 3        NaN
 4         HR
 Name: Department, dtype: object,
 'Status': 0    Adult
 1    Minor
 2    Adult
 3    Adult
 4    Adult
 Name: Status, dtype: object,
 'Mean Salary': np.float64(47000.0),
 'Max Salary': np.int64(100000),
 'Min Salary': np.int64(10000),
 'Most Common Department': 'HR'}

# Memory Usage

In [8]:
print(df.memory_usage(deep = True).sum()/1024 ** 2/2)

0.0005059242248535156


# Speed Test

In [9]:
import time

start = time.time()

df['Department'].fillna(df['Department'].mode()[0] , inplace = True) 

end = time.time()

print("Time taken to replace missing values in department column with the mode " , end - start , "seconds")

Time taken to replace missing values in department column with the mode  0.0010066032409667969 seconds


# Accuracy Check - To verify promoted salary is always 10Ã— salary

In [10]:
y_true = df['Salary'] * 10
y_pred = df['Promoted Salary']

accuracy = (y_pred == y_true).mean()
print("Accuracy:" , accuracy)

Accuracy: 1.0


# Exporting any dataset

In [11]:
def export_csv(df, filename):
    df.to_csv(filename, index=False)

export_csv(df , "Titanic-Dataset.csv")

In [13]:
import openpyxl
print(openpyxl.__version__)


3.1.5


In [14]:
def export_excel(df , filename):
    df.to_excel(filename , index = False)

export_excel(df , "CaseStudy.xlsx")

In [16]:
def export_json(df , filename):
    df.to_json(filename , index = False)

export_json(df , "ahirani_to_marathi.json")

In [17]:
def export_report(df, filename):
    f = open(filename, "w")

    f.write("DATA REPORT\n")
    f.write("Rows: " + str(len(df)) + "\n")
    f.write("Columns: " + str(len(df.columns)) + "\n\n")

    f.write("Missing Values:\n")
    for col in df.columns:
        f.write(col + " " + str(df[col].isnull().sum()) + "\n")

    f.close()

export_report(df , "dataset_report.txt")