<a href="https://colab.research.google.com/github/subhasishsahugit/AI_ML/blob/main/UCLynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# --- LOAD DATASET (UCI Auto MPG, as mentioned in the notes) ---
# The raw UCI file has no header row, so the column names are supplied here
# (this list doubles as a minimal data dictionary).
cols = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
        'Acceleration', 'Model Year', 'Origin', 'Car Name']
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Fixed seed for the random "body" preview so that re-running the cell
# (Restart & Run All) shows the same rows every time.
SAMPLE_SEED = 42

# The file is whitespace-delimited (not comma-separated) and uses '?' to mark
# missing values. The separator is a regular expression, so it is written as a
# raw string: a plain '\s+' contains an invalid escape sequence and makes
# Python emit a warning when the cell is compiled.
df = pd.read_csv(url, names=cols, sep=r'\s+', na_values='?')

# --- Section 1 of the printed report: HOW MANY (shape) ---
n_rows, n_cols = df.shape
print("=== 1. HOW MANY (Dimensions) ===")
print(f"Total Rows (Observations): {n_rows}")
print(f"Total Columns (Attributes): {n_cols}")
print("-" * 50)

# --- Section 2: WHAT IS IT (column names, non-null counts, dtypes) ---
print("\n=== 2. WHAT IS IT (Info & Types) ===")
# df.info() prints directly to stdout; it stands in for a data dictionary
# when no separate one is available.
df.info()
print("-" * 50)

# --- Section 3: VIEW DATA (head, random sample, tail) ---
# `display` is the rich-output helper the notebook kernel provides.
print("\n=== 3. VIEWING THE DATA ===")
print("--- HEAD (First 5) ---")
display(df.head())

print("\n--- BODY (Random Sample of 5) ---")
display(df.sample(5, random_state=SAMPLE_SEED))

print("\n--- TAIL (Last 5) ---")
display(df.tail())
print("-" * 50)

# --- Section 4: WHAT IT HAS (count, mean, std, quartiles, min/max) ---
print("\n=== 4. WHAT IT HAS (Statistical Summary) ===")
# include='all' adds the text column as well (unique / top / freq rows).
# The 'std' row is the sample standard deviation (n - 1 denominator),
# i.e. the S = sqrt(...) formula from the handwritten notes.
display(df.describe(include='all'))
print("-" * 50)

# --- Section 5: DATA HEALTH (missing values and distinct values per column) ---
print("\n=== 5. HOW MUCH / DATA HEALTH ===")
print("Missing Values (Null Count):\n", df.isnull().sum())
print("\nUnique Values (Cardinality):\n", df.nunique())

  df = pd.read_csv(url, names=cols, sep='\s+', na_values='?')


=== 1. HOW MANY (Dimensions) ===
Total Rows (Observations): 398
Total Columns (Attributes): 9
--------------------------------------------------

=== 2. WHAT IS IT (Info & Types) ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6   Model Year    398 non-null    int64  
 7   Origin        398 non-null    int64  
 8   Car Name      398 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 28.1+ KB
--------------------------------------------------

=== 3. VIEWING THE DATA ===
--- HEAD (First 5) ---


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Car Name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino



--- BODY (Random Sample of 5) ---


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Car Name
280,21.5,6,231.0,115.0,3245.0,15.4,79,1,pontiac lemans v6
62,13.0,8,350.0,165.0,4274.0,12.0,72,1,chevrolet impala
350,34.7,4,105.0,63.0,2215.0,14.9,81,1,plymouth horizon 4
21,24.0,4,107.0,90.0,2430.0,14.5,70,2,audi 100 ls
243,21.5,3,80.0,110.0,2720.0,13.5,77,3,mazda rx-4



--- TAIL (Last 5) ---


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Car Name
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger
397,31.0,4,119.0,82.0,2720.0,19.4,82,1,chevy s-10


--------------------------------------------------

=== 4. WHAT IT HAS (Statistical Summary) ===


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Car Name
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398.0,398
unique,,,,,,,,,305
top,,,,,,,,,ford pinto
freq,,,,,,,,,6
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864,
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627,0.802055,
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0,
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,1.0,
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0,
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0,2.0,


--------------------------------------------------

=== 5. HOW MUCH / DATA HEALTH ===
Missing Values (Null Count):
 MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
Car Name        0
dtype: int64

Unique Values (Cardinality):
 MPG             129
Cylinders         5
Displacement     82
Horsepower       93
Weight          351
Acceleration     95
Model Year       13
Origin            3
Car Name        305
dtype: int64
