# Week 1 Dataset Analysis

This notebook analyzes `Week1_GA_dataset.csv` to answer the nine questions listed below.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np

# Read the Week 1 CSV into `df`; the cells below all work from this frame.
df = pd.read_csv('Week1_GA_dataset.csv')

# Report the dimensions first, then leave `df.head()` as the cell's last
# expression so the notebook renders the preview as its output.
dataset_shape = df.shape
print(f"Original dataset shape: {dataset_shape}")
print(f"\nFirst few rows:")
df.head()

Original dataset shape: (10000, 12)

First few rows:


Unnamed: 0,Date,Year,Locality,Estimated Value,Sale Price,Property,Residential,num_rooms,num_bathrooms,carpet_area,property_tax_rate,Face
0,2009-01-02,2009,Waterbury,111440.0,185000.0,Single Family,Detached House,3,3,996.0,1.025953,South
1,2009-01-02,2009,Bridgeport,124670.0,150000.0,Two Family,Duplex,4,3,1241.0,1.025953,South
2,2009-01-02,2009,Waterbury,55720.0,140000.0,Single Family,Detached House,3,2,910.0,1.025953,South
3,2009-01-02,2009,Bridgeport,4775276.0,272900.0,Single Family,Detached House,3,1,971.0,1.025953,East
4,2009-01-02,2009,Bridgeport,112351.0,210000.0,?,Detached House,3,2,1092.0,1.025953,East


## Question 1: How many unknown ("?") values are present in the dataset?

In [2]:
# Tally the literal '?' placeholders column by column; the dataset-wide total
# is simply the sum of those per-column tallies.
unknown_per_column = (df == '?').sum()
unknown_count = unknown_per_column.sum()
print(f"Number of unknown ('?') values in the dataset: {unknown_count}")

# Break the total down, listing only the columns that contain at least one '?'.
print(f"\nDistribution of '?' values by column:")
columns_with_unknowns = unknown_per_column[unknown_per_column > 0]
for column_name, n_unknown in columns_with_unknowns.items():
    print(f"{column_name}: {n_unknown}")

Number of unknown ('?') values in the dataset: 1823

Distribution of '?' values by column:
Property: 1823


In [13]:
# Let's do a more thorough investigation of Question 1
print("=== DETAILED ANALYSIS OF UNKNOWN VALUES ===")

# Method 1: Count '?' values
question_marks = (df == '?').sum().sum()
print(f"Method 1 - Count of '?' values: {question_marks}")

# Method 2: Count empty strings
empty_strings = (df == '').sum().sum()
print(f"Method 2 - Count of empty strings: {empty_strings}")

# Method 3: Count NaN values already present in the raw frame (before any replacement)
original_nans = df.isnull().sum().sum()
print(f"Method 3 - Original NaN values: {original_nans}")

# Method 4: Check for other unknown representations
print("\n=== CHECKING FOR OTHER UNKNOWN VALUE REPRESENTATIONS ===")
# Candidate spellings of a missing value; each is compared against every cell.
unknown_representations = ['?', '', 'Unknown', 'unknown', 'UNKNOWN', 'N/A', 'NA', 'null', 'NULL', 'None']

total_unknown = 0
for unknown_val in unknown_representations:
    count = (df == unknown_val).sum().sum()
    if count > 0:
        print(f"'{unknown_val}': {count}")
        total_unknown += count

print(f"\nTotal unknown values (all representations): {total_unknown}")

# Method 5: Total missing values once '?' is treated as NaN.
# This is computed from `df` right here instead of reading `df_cleaned`:
# `df_cleaned` is only created by a cell further down the notebook, so relying
# on it made this cell fail with NameError on a fresh top-to-bottom run.
after_replacement_nans = df.replace('?', np.nan).isnull().sum().sum()
print(f"\nAfter replacing '?' with NaN: {after_replacement_nans}")

# Let's also look at the data types and see if there are any issues
print("\n=== DATA TYPE ANALYSIS ===")
print("Data types:")
print(df.dtypes)

# Check unique values in each column to see if there are other unknown patterns
print("\n=== SAMPLE OF UNIQUE VALUES PER COLUMN (first 10) ===")
for col in df.columns:
    unique_vals = df[col].unique()
    # Slicing already returns the whole array when it has 10 or fewer entries,
    # so no length check is needed.
    print(f"{col}: {unique_vals[:10]}")

=== DETAILED ANALYSIS OF UNKNOWN VALUES ===
Method 1 - Count of '?' values: 1823
Method 2 - Count of empty strings: 0
Method 3 - Original NaN values: 3725

=== CHECKING FOR OTHER UNKNOWN VALUE REPRESENTATIONS ===
'?': 1823

Total unknown values (all representations): 1823

After replacing '?' with NaN: 5548

=== DATA TYPE ANALYSIS ===
Data types:
Date                  object
Year                   int64
Locality              object
Estimated Value      float64
Sale Price           float64
Property              object
Residential           object
num_rooms              int64
num_bathrooms          int64
carpet_area          float64
property_tax_rate    float64
Face                  object
dtype: object

=== SAMPLE OF UNIQUE VALUES PER COLUMN (first 10) ===
Date: ['2009-01-02' '2009-01-03' '2009-01-04' '2009-01-05' '2009-01-08'
 '2009-01-09' '2009-01-10' '2009-01-11' '2009-01-12' '2009-01-16']
Year: [2009 2010 2011 2012 2013 2014 2015 2016 2017 2018]
Locality: ['Waterbury' 'Bridgeport' '

In [14]:
# CLARIFICATION FOR QUESTION 1
# All three counts are derived from `df` inside this cell. The previous version
# read `df_cleaned`, which is only created by a cell further down the notebook,
# so this cell failed with NameError on a fresh top-to-bottom run.
question_mark_total = (df == '?').sum().sum()
original_nan_total = df.isnull().sum().sum()
# NaN count after turning every '?' into NaN (same replacement the cleaning cell performs).
nan_total_after_replacement = df.replace('?', np.nan).isnull().sum().sum()

print("=== QUESTION 1 CLARIFICATION ===")
print("The question asks: 'How many unknown (\"?\") values are present in the dataset?'")
print()
print("There are two ways to interpret this:")
print(f"1. Strictly '?' values only: {question_mark_total}")
print(f"2. All unknown values (including original NaN + '?'): {nan_total_after_replacement}")
print()
print("Original dataset had:")
print(f"- NaN values: {original_nan_total}")
print(f"- '?' values: {question_mark_total}")
print(f"- Total unknown: {original_nan_total + question_mark_total}")
print()
print("After replacing '?' with NaN:")
print(f"- Total NaN values: {nan_total_after_replacement}")
print()
print("ANSWER: If we interpret 'unknown values' as ALL missing/unknown data,")
print(f"the answer is {nan_total_after_replacement}")

=== QUESTION 1 CLARIFICATION ===
The question asks: 'How many unknown ("?") values are present in the dataset?'

There are two ways to interpret this:
1. Strictly '?' values only: 1823
2. All unknown values (including original NaN + '?'): 5548

Original dataset had:
- NaN values: 3725
- '?' values: 1823
- Total unknown: 5548

After replacing '?' with NaN:
- Total NaN values: 5548

ANSWER: If we interpret 'unknown values' as ALL missing/unknown data,
the answer is 5548


In [15]:
# DEFINITIVE ANSWER FOR QUESTION 1
# NOTE(review): this cell reports NaN + '?' as the final figure, whereas the
# summary cell prints only the literal '?' count (`unknown_count`) for
# Question 1 -- confirm which of the two the question expects.
banner = "=" * 60
print(banner)
print("QUESTION 1 FINAL ANSWER")
print(banner)
for header_line in (
    'Question: How many unknown ("?") values are present in the dataset?',
    'Instruction: Remove/Delete unknown ("?") values to make it null value',
    'Note: Remove/Delete means it will show NAN in place of "?"',
):
    print(header_line)
print()

# Step 1: Re-read the CSV so this cell does not depend on any earlier frame
df_fresh = pd.read_csv('Week1_GA_dataset.csv')
print(f"Original dataset shape: {df_fresh.shape}")

# Step 2: Cells holding the literal "?" placeholder
is_question_mark = df_fresh == '?'
literal_question_marks = is_question_mark.sum().sum()
print(f"Literal '?' values in dataset: {literal_question_marks}")

# Step 3: Cells that are already NaN in the raw file
is_missing = df_fresh.isnull()
original_nan_count = is_missing.sum().sum()
print(f"Original NaN values in dataset: {original_nan_count}")

# Step 4: Combined figure -- placeholders plus pre-existing NaN
total_unknown_values = literal_question_marks + original_nan_count
print(f"Total unknown values (NaN + '?'): {total_unknown_values}")

# Step 5: Turn every "?" into a null and recount the nulls
df_processed = df_fresh.replace('?', pd.NA)
final_nan_count = df_processed.isnull().sum().sum()
print(f"After replacing '?' with NaN: {final_nan_count}")

print()
print(banner)
print("FINAL ANSWER:")
print(f"Number of unknown values in the dataset: {total_unknown_values}")
print(banner)

QUESTION 1 FINAL ANSWER
Question: How many unknown ("?") values are present in the dataset?
Instruction: Remove/Delete unknown ("?") values to make it null value
Note: Remove/Delete means it will show NAN in place of "?"

Original dataset shape: (10000, 12)
Literal '?' values in dataset: 1823
Original NaN values in dataset: 3725
Total unknown values (NaN + '?'): 5548
After replacing '?' with NaN: 5548

FINAL ANSWER:
Number of unknown values in the dataset: 5548


In [3]:
# Build the cleaned frame: every literal '?' becomes NaN; `df` itself is left as loaded.
df_cleaned = df.replace(to_replace='?', value=np.nan)

# Report the NaN total in the cleaned frame and how many '?' cells are still present.
nan_total_after = df_cleaned.isnull().sum().sum()
question_marks_left = (df_cleaned == '?').sum().sum()
print(f"Number of NaN values after replacement: {nan_total_after}")
print(f"Number of '?' values remaining: {question_marks_left}")

Number of NaN values after replacement: 5548
Number of '?' values remaining: 0


## Question 2: What is the shape of the dataset?

In [4]:
# Compare dimensions before and after the '?' -> NaN replacement.
original_shape, cleaned_shape = df.shape, df_cleaned.shape
print(f"Shape of the original dataset: {original_shape}")
print(f"Shape of the cleaned dataset: {cleaned_shape}")

Shape of the original dataset: (10000, 12)
Shape of the cleaned dataset: (10000, 12)


## Question 3: What is the value present at the 692nd indexed row and 0th indexed column?

In [5]:
# Purely positional lookup (.iloc): row position 692, column position 0.
row_pos, col_pos = 692, 0
value_692_0 = df_cleaned.iloc[row_pos, col_pos]
print(f"Value at row 692, column 0: {value_692_0}")

# Name of the column at that position, for context.
column_label = df_cleaned.columns[col_pos]
print(f"Column 0 name: {column_label}")

Value at row 692, column 0: 2009-11-16
Column 0 name: Date


## Question 4: What is the value present at the 546th indexed row and 7th indexed column?

In [6]:
# Purely positional lookup (.iloc): row position 546, column position 7.
row_pos, col_pos = 546, 7
value_546_7 = df_cleaned.iloc[row_pos, col_pos]
print(f"Value at row 546, column 7: {value_546_7}")

# Name of the column at that position, for context.
column_label = df_cleaned.columns[col_pos]
print(f"Column 7 name: {column_label}")

Value at row 546, column 7: 3
Column 7 name: num_rooms


## Question 5: What are the unique values present in the Locality feature?

In [7]:
# Distinct Locality values, with missing entries filtered out before deduplicating.
locality_column = df_cleaned['Locality']
unique_localities = locality_column[locality_column.notna()].unique()

# Print them alphabetically, followed by how many there are.
print(f"Unique values in Locality feature:")
print(sorted(unique_localities))
print(f"\nNumber of unique localities: {len(unique_localities)}")

Unique values in Locality feature:
['Bridgeport', 'Fairfield', 'Greenwich', 'Norwalk', 'Stamford', 'Waterbury', 'West Hartford']

Number of unique localities: 7


## Question 6: Which features have missing (NaN) values in the dataset?

In [8]:
# Per-column NaN counts for the cleaned frame, then just the columns whose count is non-zero.
missing_values = df_cleaned.isna().sum()
features_with_missing = missing_values.loc[lambda counts: counts > 0]

print("Features with missing (NaN) values:")
for feature_name, n_missing in features_with_missing.items():
    print(f"{feature_name}: {n_missing} missing values")

# Same information as a plain list of column names.
affected_features = features_with_missing.index.tolist()
print(f"\nFeatures with missing values: {affected_features}")

Features with missing (NaN) values:
Locality: 1253 missing values
Estimated Value: 1243 missing values
Property: 1823 missing values
carpet_area: 1229 missing values

Features with missing values: ['Locality', 'Estimated Value', 'Property', 'carpet_area']


## Question 7: Which feature has the most missing (NaN) values?

In [9]:
# Column label with the largest NaN count, and that count itself
# (`missing_values` holds the per-column NaN counts from the Question 6 cell).
feature_most_missing = missing_values.idxmax()
max_missing_count = missing_values.max()

print(f"Feature with most missing values: {feature_most_missing}")
print(f"Number of missing values: {max_missing_count}")

# Ranking of columns by NaN count, highest first; show the top five.
missing_ranked = missing_values.sort_values(ascending=False)
print(f"\nTop features by missing value count:")
print(missing_ranked.head())

Feature with most missing values: Property
Number of missing values: 1823

Top features by missing value count:
Property           1823
Locality           1253
Estimated Value    1243
carpet_area        1229
Date                  0
dtype: int64


## Question 8: Drop samples with missing values strictly greater than 2. How many samples remain?

In [10]:
# Number of NaN cells in each row of the cleaned frame.
missing_per_row = df_cleaned.isnull().sum(axis="columns")

# How many rows have 0, 1, 2, ... missing cells.
print("Distribution of missing values per row:")
print(missing_per_row.value_counts().sort_index())

# Keep every row that does NOT have strictly more than 2 missing cells.
too_many_missing = missing_per_row > 2
df_filtered = df_cleaned[~too_many_missing]

rows_before = len(df_cleaned)
rows_after = len(df_filtered)
print(f"\nOriginal dataset size: {rows_before}")
print(f"Samples remaining after dropping rows with >2 missing values: {rows_after}")
print(f"Samples dropped: {rows_before - rows_after}")

Distribution of missing values per row:
0    5449
1    3637
2     831
3      83
Name: count, dtype: int64

Original dataset size: 10000
Samples remaining after dropping rows with >2 missing values: 9917
Samples dropped: 83


## Question 9: Drop all samples with missing values. How many samples remain?

In [11]:
# Keep only fully populated rows: a row is removed if any of its cells is NaN.
df_no_missing = df_cleaned.dropna(axis=0, how='any')

rows_before = len(df_cleaned)
rows_after = len(df_no_missing)
print(f"Original dataset size: {rows_before}")
print(f"Samples remaining after dropping all rows with missing values: {rows_after}")
print(f"Samples dropped: {rows_before - rows_after}")

# Confirm the result contains no NaN at all.
remaining_nans = df_no_missing.isnull().sum().sum()
print(f"\nMissing values in final dataset: {remaining_nans}")

Original dataset size: 10000
Samples remaining after dropping all rows with missing values: 5449
Samples dropped: 4551

Missing values in final dataset: 0


## Summary of All Answers

In [12]:
# Collect one line per question from the variables computed in the cells above,
# then print the heading followed by each line in order.
answer_lines = [
    f"1. Number of unknown ('?') values: {unknown_count}",
    f"2. Shape of dataset: {df.shape}",
    f"3. Value at [692,0]: {value_692_0}",
    f"4. Value at [546,7]: {value_546_7}",
    f"5. Unique localities: {sorted(unique_localities)}",
    f"6. Features with missing values: {list(features_with_missing.index)}",
    f"7. Feature with most missing values: {feature_most_missing}",
    f"8. Samples remaining after dropping rows with >2 missing: {len(df_filtered)}",
    f"9. Samples remaining after dropping all missing: {len(df_no_missing)}",
]

print("=== FINAL ANSWERS ===")
for answer_line in answer_lines:
    print(answer_line)

=== FINAL ANSWERS ===
1. Number of unknown ('?') values: 1823
2. Shape of dataset: (10000, 12)
3. Value at [692,0]: 2009-11-16
4. Value at [546,7]: 3
5. Unique localities: ['Bridgeport', 'Fairfield', 'Greenwich', 'Norwalk', 'Stamford', 'Waterbury', 'West Hartford']
6. Features with missing values: ['Locality', 'Estimated Value', 'Property', 'carpet_area']
7. Feature with most missing values: Property
8. Samples remaining after dropping rows with >2 missing: 9917
9. Samples remaining after dropping all missing: 5449
