In [None]:
# ==============================================
# ADVANCED DEMO: Data Cleaning, Normalization, Tokenization, Labeling, Augmentation
# ==============================================

import pandas as pd
import json
import re
from collections import Counter
import matplotlib.pyplot as plt
import random

In [None]:
# =======================================
# STRUCTURED DATA — Cleaning & Normalization
# =======================================
print("### STRUCTURED DATA CLEANING & NORMALIZATION ###\n")

# Toy employee table with deliberately messy values: padded / oddly-cased
# names, a whitespace-only name, a missing department and a missing salary.
data = {
    'EmployeeID': [101, 102, 103, 104],
    'Name': ['Ravi ', ' PRIYA', 'Amit', ' '],
    'Department': ['Finance', 'HR', 'IT', None],
    'Salary': [65000, 55000, None, 72000]
}

df = pd.DataFrame(data)
print("Raw Data:\n", df, "\n")

# Cleaning — remove extra spaces, normalise casing, fill missing values.
# Each result is assigned back to its column instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on a temporary
# selection, warns on recent pandas and leaves `df` unchanged once
# Copy-on-Write is active.
df['Name'] = df['Name'].str.strip().str.title()
# A whitespace-only name is empty after stripping; treat it as missing too.
df.loc[df['Name'] == '', 'Name'] = 'Unknown'
df['Department'] = df['Department'].fillna('Unknown')
# The missing salary is imputed with the mean of the salaries that are present.
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

# Normalization — min-max scaling of salary into [0, 1]:
#   X_norm = (X - X_min) / (X_max - X_min)
salary_min = df['Salary'].min()
salary_range = df['Salary'].max() - salary_min
if salary_range == 0:
    # All salaries identical: avoid 0/0 and map every row to 0.0.
    df['Salary_Normalized'] = 0.0
else:
    df['Salary_Normalized'] = (df['Salary'] - salary_min) / salary_range

print("Cleaned & Normalized Data:\n", df, "\n")

In [None]:
# =======================================
#  SEMI-STRUCTURED DATA — Flatten & Label
# =======================================
print("### SEMI-STRUCTURED DATA NORMALIZATION & LABELING ###\n")

# Inline JSON sample: one object per employee, each carrying a list of
# skills and a nested "details" object.
json_data = '''
[
  {"name": "Ravi", "skills": ["Excel", "SAP"], "details": {"city": "Delhi", "age": 29}},
  {"name": "Priya", "skills": ["Recruitment", "Communication"], "details": {"city": "Mumbai", "age": 31}},
  {"name": "Amit", "skills": ["Python", "AWS"], "details": {"city": "Bangalore", "age": 28}}
]
'''

# Step 1: decode the JSON text into plain Python objects (a list of dicts).
parsed = json.loads(json_data)

# Step 2: flatten into a table; the nested "details" keys become the
# dotted columns "details.city" and "details.age", while the "skills"
# lists stay as list-valued cells.
df_json = pd.json_normalize(data=parsed)

print(df_json)
# Add a label based on skills (simple rule-based labeling)
def label_role(skills):
    """Map one record's skills to a coarse role label using fixed keyword rules.

    Args:
        skills: Collection of skill names for a single record (a list in
            the JSON sample). ``None`` or a float such as NaN — the
            placeholder pandas typically uses for an absent field — is
            treated as "no skills recorded".

    Returns:
        str: "Tech" if "Python" or "AWS" is present; otherwise "HR" if
        "Recruitment" is present; "Unknown" when skills are missing;
        "Finance" for every other case (including an empty collection).
    """
    # A missing value is not a container, so a membership test on it would
    # raise TypeError; report it explicitly instead of guessing a role.
    if skills is None or isinstance(skills, float):
        return "Unknown"

    if "Python" in skills or "AWS" in skills:
        return "Tech"
    if "Recruitment" in skills:
        return "HR"
    # Fallback rule: anything not matched above is labeled Finance.
    return "Finance"

# Run the rule-based labeler over each record's skills list, attach the
# result as a new "Role_Label" column, then show the labeled table.
role_labels = df_json["skills"].apply(label_role)
df_json["Role_Label"] = role_labels
print("Labeled JSON Data:\n", df_json, "\n")


In [None]:
# =======================================
#  UNSTRUCTURED DATA — Cleaning, Tokenization & Augmentation
# =======================================
print("### UNSTRUCTURED TEXT PROCESSING ###\n")

text_data = """
Ravi joined Finance department last year.
Priya from HR got promoted recently!
Amit and Ravi is working on cloud migration in IT.
"""

print("Raw Text:\n", text_data)

# Cleaning — keep only ASCII letters and whitespace, then lowercase.
# Note: this removes digits and non-ASCII letters as well as punctuation.
cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text_data).lower()
print("\nCleaned Text:\n", cleaned_text)

# Tokenization — split on runs of whitespace
tokens = cleaned_text.split()
print("\nTokens:\n", tokens)

# Labeling — identify entities (simple keyword-based).
# One token -> tag lookup table instead of scanning two lists per token;
# tokens without an entry are simply left out of `labels`.
# Caveat: the text is already lowercased, so the department "IT" and the
# pronoun "it" cannot be told apart by this rule.
entity_tags = {
    'ravi': 'PERSON',
    'priya': 'PERSON',
    'amit': 'PERSON',
    'finance': 'DEPARTMENT',
    'hr': 'DEPARTMENT',
    'it': 'DEPARTMENT',
}
labels = [(word, entity_tags[word]) for word in tokens if word in entity_tags]
print("\nLabeled Tokens (NER-style):\n", labels)

# Augmentation — simulate adding new data (synonym or phrase variation)
augment_phrases = [
    "Ravi started working remotely.",
    "Priya manages recruitment operations.",
    "Amit built an AWS cloud project."
]
# Use a dedicated, seeded generator so the appended phrase is the same on
# every Restart & Run All and the global `random` state is left untouched.
AUGMENT_SEED = 42
augment_rng = random.Random(AUGMENT_SEED)
augmented_text = text_data + "\n" + augment_rng.choice(augment_phrases)
print("\nAugmented Text:\n", augmented_text, "\n")

In [None]:
# =======================================
#  VISUAL INSIGHT
# =======================================
print("### FREQUENCY DISTRIBUTION ###\n")

# Count every word-character run in the cleaned (lowercased) text.
words = re.findall(r'\b\w+\b', cleaned_text)
word_freq = Counter(words)

# Rank once and reuse the result for both the printout and the chart,
# rather than recomputing most_common(5) for each use.
top_words = word_freq.most_common(5)
print("Top 5 frequent words:", top_words, "\n")

bar_labels = [w for w, _ in top_words]
bar_heights = [c for _, c in top_words]

plt.bar(bar_labels, bar_heights)
plt.title("Top 5 Frequent Words in Text")
# Label both axes so the figure is readable on its own.
plt.xlabel("Word")
plt.ylabel("Occurrences")
plt.show()

print(" DEMO COMPLETED SUCCESSFULLY!\n")