# Week 2 Dataset Analysis

This notebook analyzes the GA_2_dataset.csv to answer questions about data types, preprocessing, and machine learning preparation.

In [3]:
# Install required packages first
%pip install scikit-learn

# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv('GA_2_dataset.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nColumn names and data types:")
print(df.dtypes)
print(f"\nFirst few rows:")
df.head()

Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl (8.6 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached scipy-1.16.2-cp313-cp313-macosx_14_0_arm64.whl (20.9 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed joblib-1.5.2 scikit-learn-1.7.2 scipy-1.16.2 threadpoolctl-3.6.0
Note: you m

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,35900,37.0,Male,Other,Strategy,23.929404,,Hard,3,124,99,18,Medium
1,27085,25.0,Male,,Action,22.755168,1.0,Easy,14,84,84,12,Medium
2,39595,24.0,Female,Europe,Simulation,19.505292,0.0,Hard,3,172,9,18,Medium
3,37440,26.0,Female,Europe,RPG,11.009645,,,3,83,36,43,Low
4,22882,17.0,Female,USA,RPG,0.581039,1.0,Medium,5,163,9,24,Medium


## Question 1: Which columns have object datatype?

In [4]:
# Check data types for each column
print("Data types for each column:")
for col in df.columns:
    print(f"{col}: {df[col].dtype}")

# Find columns with object datatype
object_columns = df.select_dtypes(include=['object']).columns.tolist()
print(f"\nColumns with object datatype: {object_columns}")

# Check specific columns mentioned in the question
columns_to_check = ['Age', 'Gender', 'Location', 'GameGenre', 'PlayTimeHours']
print(f"\nData types for specific columns:")
for col in columns_to_check:
    if col in df.columns:
        print(f"{col}: {df[col].dtype} - {'Object' if df[col].dtype == 'object' else 'Not Object'}")

Data types for each column:
PlayerID: int64
Age: float64
Gender: object
Location: object
GameGenre: object
PlayTimeHours: float64
InGamePurchases: float64
GameDifficulty: object
SessionsPerWeek: int64
AvgSessionDurationMinutes: int64
PlayerLevel: int64
AchievementsUnlocked: int64
EngagementLevel: object

Columns with object datatype: ['Gender', 'Location', 'GameGenre', 'GameDifficulty', 'EngagementLevel']

Data types for specific columns:
Age: float64 - Not Object
Gender: object - Object
Location: object - Object
GameGenre: object - Object
PlayTimeHours: float64 - Not Object


## Question 2: How many "Males" from "Europe" have made "InGamePurchases"?

In [5]:
# Filter for Males from Europe who made InGamePurchases
# InGamePurchases > 0 or == 1 means they made purchases
condition = (
    (df['Gender'] == 'Male') & 
    (df['Location'] == 'Europe') & 
    (df['InGamePurchases'] > 0)
)

males_europe_purchases = df[condition].shape[0]
print(f"Number of Males from Europe who made InGamePurchases: {males_europe_purchases}")

# Let's also check the breakdown
print(f"\nBreakdown:")
print(f"Total Males: {(df['Gender'] == 'Male').sum()}")
print(f"Total from Europe: {(df['Location'] == 'Europe').sum()}")
print(f"Males from Europe: {((df['Gender'] == 'Male') & (df['Location'] == 'Europe')).sum()}")
print(f"Males from Europe with purchases: {males_europe_purchases}")

# Check InGamePurchases values
print(f"\nUnique InGamePurchases values: {sorted(df['InGamePurchases'].dropna().unique())}")

Number of Males from Europe who made InGamePurchases: 299

Breakdown:
Total Males: 6074
Total from Europe: 2754
Males from Europe: 1687
Males from Europe with purchases: 299

Unique InGamePurchases values: [np.float64(0.0), np.float64(1.0)]


## Question 3: How many players under age 18 have strictly greater than 10 PlayTimeHours?

In [6]:
# Filter for players under 18 with PlayTimeHours > 10
condition_age = (
    (df['Age'] < 18) & 
    (df['PlayTimeHours'] > 10)
)

under_18_high_playtime = df[condition_age].shape[0]
print(f"Players under age 18 with PlayTimeHours > 10: {under_18_high_playtime}")

# Additional breakdown
print(f"\nBreakdown:")
print(f"Total players under 18: {(df['Age'] < 18).sum()}")
print(f"Players with PlayTimeHours > 10: {(df['PlayTimeHours'] > 10).sum()}")
print(f"Players under 18 with PlayTimeHours > 10: {under_18_high_playtime}")

Players under age 18 with PlayTimeHours > 10: 453

Breakdown:
Total players under 18: 792
Players with PlayTimeHours > 10: 5828
Players under 18 with PlayTimeHours > 10: 453


## Question 4: Create feature matrix (X) and label vector (y), count null values

In [7]:
# Create feature matrix X and target vector y
y = df['EngagementLevel']
X = df.drop('EngagementLevel', axis=1)

print(f"Feature matrix X shape: {X.shape}")
print(f"Target vector y shape: {y.shape}")

# Count total null values in the whole dataset
total_null_values = df.isnull().sum().sum()
print(f"\nTotal null values in the whole dataset: {total_null_values}")

# Show null values per column
print(f"\nNull values per column:")
null_counts = df.isnull().sum()
for col, count in null_counts.items():
    if count > 0:
        print(f"{col}: {count}")

# Check for empty strings that might be considered as unknown
print(f"\nChecking for empty strings:")
for col in df.select_dtypes(include=['object']).columns:
    empty_count = (df[col] == '').sum()
    if empty_count > 0:
        print(f"{col}: {empty_count} empty strings")

Feature matrix X shape: (10000, 12)
Target vector y shape: (10000,)

Total null values in the whole dataset: 3337

Null values per column:
Age: 800
Location: 798
InGamePurchases: 893
GameDifficulty: 846

Checking for empty strings:


## Question 5: Train-test split and category with least value counts in y_train

In [8]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set shape: X_train {X_train.shape}, y_train {y_train.shape}")
print(f"Test set shape: X_test {X_test.shape}, y_test {y_test.shape}")

# Check value counts in y_train
y_train_counts = y_train.value_counts().sort_values()
print(f"\nValue counts in y_train (sorted):")
print(y_train_counts)

least_category = y_train_counts.index[0]
least_count = y_train_counts.iloc[0]

print(f"\nCategory with least value counts in y_train: {least_category} ({least_count} samples)")

Training set shape: X_train (8000, 12), y_train (8000,)
Test set shape: X_test (2000, 12), y_test (2000,)

Value counts in y_train (sorted):
EngagementLevel
High      1996
Low       2021
Medium    3983
Name: count, dtype: int64

Category with least value counts in y_train: High (1996 samples)


## Question 6: Data Imputation

In [9]:
# Create copies for imputation
X_train_imputed = X_train.copy()
X_test_imputed = X_test.copy()

# Check what we need to impute
print("Missing values in training set:")
print(X_train.isnull().sum()[X_train.isnull().sum() > 0])

print("\nEmpty strings in training set:")
for col in X_train.select_dtypes(include=['object']).columns:
    empty_count = (X_train[col] == '').sum()
    if empty_count > 0:
        print(f"{col}: {empty_count} empty strings")

# Imputation rules:
# 1. Age: Replace NaN with mean
age_mean = X_train['Age'].mean()
print(f"\nAge mean for imputation: {age_mean}")

X_train_imputed['Age'].fillna(age_mean, inplace=True)
X_test_imputed['Age'].fillna(age_mean, inplace=True)

# 2. Location: Replace empty strings with "Other"
X_train_imputed['Location'].replace('', 'Other', inplace=True)
X_test_imputed['Location'].replace('', 'Other', inplace=True)

# 3. GameDifficulty: Replace NaN with most frequent value
game_difficulty_mode = X_train['GameDifficulty'].mode()[0]
print(f"GameDifficulty mode for imputation: {game_difficulty_mode}")

X_train_imputed['GameDifficulty'].fillna(game_difficulty_mode, inplace=True)
X_test_imputed['GameDifficulty'].fillna(game_difficulty_mode, inplace=True)

# 4. InGamePurchases: Replace NaN with 0
X_train_imputed['InGamePurchases'].fillna(0, inplace=True)
X_test_imputed['InGamePurchases'].fillna(0, inplace=True)

# Verify no missing values remain
print(f"\nMissing values after imputation:")
print(f"X_train: {X_train_imputed.isnull().sum().sum()}")
print(f"X_test: {X_test_imputed.isnull().sum().sum()}")

Missing values in training set:
Age                640
Location           639
InGamePurchases    716
GameDifficulty     675
dtype: int64

Empty strings in training set:

Age mean for imputation: 31.71399456521739
GameDifficulty mode for imputation: Easy

Missing values after imputation:
X_train: 639
X_test: 159


In [10]:
# Question 6 answer: Sum of transformed Age column in test dataset
age_sum_test = X_test_imputed['Age'].sum()
print(f"Sum of transformed Age column in test dataset: {age_sum_test:.2f}")

# Show some stats for verification
print(f"\nAge column statistics in test set:")
print(f"Count: {X_test_imputed['Age'].count()}")
print(f"Mean: {X_test_imputed['Age'].mean():.2f}")
print(f"Sum: {X_test_imputed['Age'].sum():.2f}")

Sum of transformed Age column in test dataset: 63585.24

Age column statistics in test set:
Count: 2000
Mean: 31.79
Sum: 63585.24


In [11]:
# Fix the imputation - handle Location NaN values
X_train_imputed['Location'].fillna('Other', inplace=True)
X_test_imputed['Location'].fillna('Other', inplace=True)

# Verify no missing values remain
print(f"Missing values after complete imputation:")
print(f"X_train: {X_train_imputed.isnull().sum().sum()}")
print(f"X_test: {X_test_imputed.isnull().sum().sum()}")

# Show missing values per column
print(f"\nMissing values per column in X_train:")
print(X_train_imputed.isnull().sum()[X_train_imputed.isnull().sum() > 0])
print(f"\nMissing values per column in X_test:")
print(X_test_imputed.isnull().sum()[X_test_imputed.isnull().sum() > 0])

Missing values after complete imputation:
X_train: 0
X_test: 0

Missing values per column in X_train:
Series([], dtype: int64)

Missing values per column in X_test:
Series([], dtype: int64)


In [12]:
# Recalculate Question 6 answer after proper imputation
age_sum_test_corrected = X_test_imputed['Age'].sum()
print(f"Sum of transformed Age column in test dataset (corrected): {age_sum_test_corrected:.2f}")

# Show original test Age values vs imputed
original_test_age_sum = X_test['Age'].sum()  # This will include NaN
print(f"Original test Age sum (with NaN): {original_test_age_sum:.2f}")
print(f"Imputed test Age sum: {age_sum_test_corrected:.2f}")
print(f"Difference: {age_sum_test_corrected - original_test_age_sum:.2f}")

Sum of transformed Age column in test dataset (corrected): 63585.24
Original test Age sum (with NaN): 58511.00
Imputed test Age sum: 63585.24
Difference: 5074.24


## Question 7: Preprocessing and Feature Transformation

In [13]:
# Drop PlayerID column
X_train_processed = X_train_imputed.drop('PlayerID', axis=1)
X_test_processed = X_test_imputed.drop('PlayerID', axis=1)

print(f"Shape after dropping PlayerID:")
print(f"X_train: {X_train_processed.shape}")
print(f"X_test: {X_test_processed.shape}")

# Verify no missing values
print(f"\nMissing values check:")
print(f"X_train: {X_train_processed.isnull().sum().sum()}")
print(f"X_test: {X_test_processed.isnull().sum().sum()}")

print(f"\nColumn names: {list(X_train_processed.columns)}")
print(f"\nData types:")
print(X_train_processed.dtypes)

Shape after dropping PlayerID:
X_train: (8000, 11)
X_test: (2000, 11)

Missing values check:
X_train: 0
X_test: 0

Column names: ['Age', 'Gender', 'Location', 'GameGenre', 'PlayTimeHours', 'InGamePurchases', 'GameDifficulty', 'SessionsPerWeek', 'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked']

Data types:
Age                          float64
Gender                        object
Location                      object
GameGenre                     object
PlayTimeHours                float64
InGamePurchases              float64
GameDifficulty                object
SessionsPerWeek                int64
AvgSessionDurationMinutes      int64
PlayerLevel                    int64
AchievementsUnlocked           int64
dtype: object


In [14]:
# Define preprocessing steps
# Ordinal encoding for GameDifficulty
ordinal_features = ['GameDifficulty']
ordinal_categories = [['Easy', 'Medium', 'Hard']]

# One-hot encoding for nominal categorical features
nominal_features = ['Gender', 'Location', 'GameGenre']

# Numerical features (will be scaled)
numerical_features = ['Age', 'PlayTimeHours', 'InGamePurchases', 'SessionsPerWeek', 
                      'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked']

print(f"Ordinal features: {ordinal_features}")
print(f"Nominal features: {nominal_features}")
print(f"Numerical features: {numerical_features}")

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(categories=ordinal_categories), ordinal_features),
        ('nominal', OneHotEncoder(drop='first'), nominal_features),
        ('numerical', StandardScaler(), numerical_features)
    ],
    remainder='passthrough'
)

# Fit on training data and transform both sets
X_train_transformed = preprocessor.fit_transform(X_train_processed)
X_test_transformed = preprocessor.transform(X_test_processed)

print(f"\nTransformed shapes:")
print(f"X_train_transformed: {X_train_transformed.shape}")
print(f"X_test_transformed: {X_test_transformed.shape}")

Ordinal features: ['GameDifficulty']
Nominal features: ['Gender', 'Location', 'GameGenre']
Numerical features: ['Age', 'PlayTimeHours', 'InGamePurchases', 'SessionsPerWeek', 'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked']

Transformed shapes:
X_train_transformed: (8000, 16)
X_test_transformed: (2000, 16)


In [15]:
# Question 7 answer: Sum of first 5 rows of transformed test feature matrix
first_five_rows_sum = X_test_transformed[:5].sum()
print(f"Sum of all values in first 5 rows of transformed test feature matrix: {first_five_rows_sum:.2f}")

# Show the first 5 rows for verification
print(f"\nFirst 5 rows of transformed test set:")
print(X_test_transformed[:5])

# Show individual row sums
print(f"\nSum of each of the first 5 rows:")
for i in range(5):
    row_sum = X_test_transformed[i].sum()
    print(f"Row {i}: {row_sum:.2f}")

print(f"\nTotal sum of first 5 rows: {X_test_transformed[:5].sum():.2f}")

Sum of all values in first 5 rows of transformed test feature matrix: 6.84

First 5 rows of transformed test set:
[[ 0.          1.          0.          1.          0.          0.
   0.          0.          1.          0.65971181  0.22554454 -0.47129649
   1.48103012  0.94188837  1.62633083 -0.39815262]
 [ 0.          0.          1.          0.          0.          0.
   1.          0.          0.          0.55476252 -1.21860217 -0.47129649
   1.30737863 -1.71286901 -0.96408562 -1.57912481]
 [ 1.          0.          0.          0.          0.          0.
   0.          1.          0.         -0.07493322 -0.06159055 -0.47129649
  -1.64469665 -0.5604162   1.48630832  0.29653691]
 [ 0.          1.          0.          0.          0.          0.
   1.          0.          0.          0.          1.3344083  -0.47129649
  -1.12374219 -1.52765338 -0.71904622 -1.23178005]
 [ 0.          1.          0.          1.          0.          0.
   0.          0.          1.         -1.22937543 -1.714

## Summary of All Answers

In [16]:
print("=== WEEK 2 FINAL ANSWERS ===")
print(f"1. Object datatype columns: {object_columns}")
print(f"2. Males from Europe with InGamePurchases: {males_europe_purchases}")
print(f"3. Players under 18 with PlayTimeHours > 10: {under_18_high_playtime}")
print(f"4. Total null values in dataset: {total_null_values}")
print(f"5. Category with least value counts in y_train: {least_category}")
print(f"6. Sum of transformed Age column in test dataset: {age_sum_test:.2f}")
print(f"7. Sum of first 5 rows of transformed test matrix: {first_five_rows_sum:.2f}")

=== WEEK 2 FINAL ANSWERS ===
1. Object datatype columns: ['Gender', 'Location', 'GameGenre', 'GameDifficulty', 'EngagementLevel']
2. Males from Europe with InGamePurchases: 299
3. Players under 18 with PlayTimeHours > 10: 453
4. Total null values in dataset: 3337
5. Category with least value counts in y_train: High
6. Sum of transformed Age column in test dataset: 63585.24
7. Sum of first 5 rows of transformed test matrix: 6.84


In [17]:
# COMPREHENSIVE VERIFICATION OF ALL ANSWERS
print("="*60)
print("COMPREHENSIVE VERIFICATION - WEEK 2 ANSWERS")
print("="*60)

# Re-load fresh data to ensure accuracy
df_fresh = pd.read_csv('GA_2_dataset.csv')
print(f"Fresh dataset loaded: {df_fresh.shape}")

print("\n1. OBJECT DATATYPE COLUMNS:")
print("-" * 30)
object_cols_fresh = df_fresh.select_dtypes(include=['object']).columns.tolist()
print(f"Object columns: {object_cols_fresh}")
specific_cols = ['Age', 'Gender', 'Location', 'GameGenre', 'PlayTimeHours']
for col in specific_cols:
    dtype = df_fresh[col].dtype
    print(f"{col}: {dtype} ({'Object' if dtype == 'object' else 'Not Object'})")

print("\n2. MALES FROM EUROPE WITH PURCHASES:")
print("-" * 35)
males_europe_purchases_fresh = df_fresh[
    (df_fresh['Gender'] == 'Male') & 
    (df_fresh['Location'] == 'Europe') & 
    (df_fresh['InGamePurchases'] > 0)
].shape[0]
print(f"Males from Europe with InGamePurchases > 0: {males_europe_purchases_fresh}")

# Double check with different approach
males_europe = df_fresh[(df_fresh['Gender'] == 'Male') & (df_fresh['Location'] == 'Europe')]
males_europe_with_purchases = males_europe[males_europe['InGamePurchases'] > 0]
print(f"Verification: {len(males_europe_with_purchases)}")

print("\n3. PLAYERS UNDER 18 WITH PLAYTIME > 10:")
print("-" * 40)
under_18_high_playtime_fresh = df_fresh[
    (df_fresh['Age'] < 18) & (df_fresh['PlayTimeHours'] > 10)
].shape[0]
print(f"Players under 18 with PlayTimeHours > 10: {under_18_high_playtime_fresh}")

print("\n4. TOTAL NULL VALUES:")
print("-" * 20)
total_nulls_fresh = df_fresh.isnull().sum().sum()
print(f"Total null values: {total_nulls_fresh}")
print("Breakdown by column:")
for col, nulls in df_fresh.isnull().sum().items():
    if nulls > 0:
        print(f"  {col}: {nulls}")

print("\n5. TRAIN-TEST SPLIT AND LEAST CATEGORY:")
print("-" * 40)
y_fresh = df_fresh['EngagementLevel']
X_fresh = df_fresh.drop('EngagementLevel', axis=1)
X_train_fresh, X_test_fresh, y_train_fresh, y_test_fresh = train_test_split(
    X_fresh, y_fresh, test_size=0.2, random_state=42
)
y_train_counts_fresh = y_train_fresh.value_counts().sort_values()
print(f"y_train value counts: {dict(y_train_counts_fresh)}")
print(f"Least category: {y_train_counts_fresh.index[0]}")

print("\n6. IMPUTATION AND AGE SUM:")
print("-" * 25)
# Apply imputation as per rules
X_train_imp = X_train_fresh.copy()
X_test_imp = X_test_fresh.copy()

# Age: mean imputation
age_mean_fresh = X_train_fresh['Age'].mean()
X_train_imp['Age'].fillna(age_mean_fresh, inplace=True)
X_test_imp['Age'].fillna(age_mean_fresh, inplace=True)

# Location: replace NaN with 'Other'
X_train_imp['Location'].fillna('Other', inplace=True)
X_test_imp['Location'].fillna('Other', inplace=True)

# GameDifficulty: most frequent
game_diff_mode = X_train_fresh['GameDifficulty'].mode()[0]
X_train_imp['GameDifficulty'].fillna(game_diff_mode, inplace=True)
X_test_imp['GameDifficulty'].fillna(game_diff_mode, inplace=True)

# InGamePurchases: fill with 0
X_train_imp['InGamePurchases'].fillna(0, inplace=True)
X_test_imp['InGamePurchases'].fillna(0, inplace=True)

age_sum_test_fresh = X_test_imp['Age'].sum()
print(f"Age mean for imputation: {age_mean_fresh:.6f}")
print(f"Sum of imputed Age in test set: {age_sum_test_fresh:.2f}")

print("\n7. PREPROCESSING AND TRANSFORMATION:")
print("-" * 35)
# Drop PlayerID
X_train_proc = X_train_imp.drop('PlayerID', axis=1)
X_test_proc = X_test_imp.drop('PlayerID', axis=1)

# Create preprocessor with exact specifications
ordinal_features_fresh = ['GameDifficulty']
ordinal_categories_fresh = [['Easy', 'Medium', 'Hard']]
nominal_features_fresh = ['Gender', 'Location', 'GameGenre']
numerical_features_fresh = ['Age', 'PlayTimeHours', 'InGamePurchases', 'SessionsPerWeek', 
                           'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked']

preprocessor_fresh = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(categories=ordinal_categories_fresh), ordinal_features_fresh),
        ('nominal', OneHotEncoder(drop='first'), nominal_features_fresh),
        ('numerical', StandardScaler(), numerical_features_fresh)
    ],
    remainder='passthrough'
)

X_train_transformed_fresh = preprocessor_fresh.fit_transform(X_train_proc)
X_test_transformed_fresh = preprocessor_fresh.transform(X_test_proc)

first_five_sum_fresh = X_test_transformed_fresh[:5].sum()
print(f"First 5 rows of transformed test matrix:")
print(X_test_transformed_fresh[:5])
print(f"\nSum of first 5 rows: {first_five_sum_fresh:.2f}")

print("\n" + "="*60)
print("FINAL VERIFIED ANSWERS:")
print("="*60)
print(f"1. Object datatype columns: {[col for col in specific_cols if df_fresh[col].dtype == 'object']}")
print(f"2. Males from Europe with purchases: {males_europe_purchases_fresh}")
print(f"3. Under 18 with PlayTime > 10: {under_18_high_playtime_fresh}")
print(f"4. Total null values: {total_nulls_fresh}")
print(f"5. Least category in y_train: {y_train_counts_fresh.index[0]}")
print(f"6. Sum of imputed Age in test: {age_sum_test_fresh:.2f}")
print(f"7. Sum of first 5 transformed rows: {first_five_sum_fresh:.2f}")
print("="*60)

COMPREHENSIVE VERIFICATION - WEEK 2 ANSWERS
Fresh dataset loaded: (10000, 13)

1. OBJECT DATATYPE COLUMNS:
------------------------------
Object columns: ['Gender', 'Location', 'GameGenre', 'GameDifficulty', 'EngagementLevel']
Age: float64 (Not Object)
Gender: object (Object)
Location: object (Object)
GameGenre: object (Object)
PlayTimeHours: float64 (Not Object)

2. MALES FROM EUROPE WITH PURCHASES:
-----------------------------------
Males from Europe with InGamePurchases > 0: 299
Verification: 299

3. PLAYERS UNDER 18 WITH PLAYTIME > 10:
----------------------------------------
Players under 18 with PlayTimeHours > 10: 453

4. TOTAL NULL VALUES:
--------------------
Total null values: 3337
Breakdown by column:
  Age: 800
  Location: 798
  InGamePurchases: 893
  GameDifficulty: 846

5. TRAIN-TEST SPLIT AND LEAST CATEGORY:
----------------------------------------
y_train value counts: {'High': np.int64(1996), 'Low': np.int64(2021), 'Medium': np.int64(3983)}
Least category: High

6. IM

In [18]:
# SPECIAL VERIFICATION FOR DISCREPANT ANSWERS
print("\n" + "="*60)
print("SPECIAL CHECKS FOR EXPECTED vs ACTUAL")
print("="*60)

# Question 2: Expected 0, got 299
print("QUESTION 2 DETAILED ANALYSIS:")
print("-" * 30)
males_europe = df_fresh[(df_fresh['Gender'] == 'Male') & (df_fresh['Location'] == 'Europe')]
print(f"Total males from Europe: {len(males_europe)}")
print(f"InGamePurchases values in males from Europe:")
print(males_europe['InGamePurchases'].value_counts(dropna=False))
print(f"Males from Europe with InGamePurchases = 1: {(males_europe['InGamePurchases'] == 1).sum()}")
print(f"Males from Europe with InGamePurchases > 0: {(males_europe['InGamePurchases'] > 0).sum()}")

# Question 6: Check if we need standardized values
print("\nQUESTION 6 DETAILED ANALYSIS:")
print("-" * 30)
print("Testing different interpretations of 'transformed Age':")

# Method 1: Just imputed values
age_sum_imputed = X_test_imp['Age'].sum()
print(f"Method 1 - Sum of imputed Age values: {age_sum_imputed:.2f}")

# Method 2: Standardized Age values only
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
age_train_scaled = scaler.fit_transform(X_train_imp[['Age']])
age_test_scaled = scaler.transform(X_test_imp[['Age']])
age_sum_standardized = age_test_scaled.sum()
print(f"Method 2 - Sum of standardized Age values: {age_sum_standardized:.2f}")

# Method 3: Check if there's a different calculation
# Maybe they want the difference from original?
original_age_sum = X_test_fresh['Age'].sum()  # with NaN
imputed_age_sum = X_test_imp['Age'].sum()
difference = imputed_age_sum - original_age_sum
print(f"Method 3 - Difference (imputed - original): {difference:.2f}")
print(f"Method 3 - Negative difference: {-difference:.2f}")

# Question 7: Double check preprocessing
print("\nQUESTION 7 DETAILED ANALYSIS:")
print("-" * 30)
print("First 5 rows individual sums:")
for i in range(5):
    row_sum = X_test_transformed_fresh[i].sum()
    print(f"Row {i}: {row_sum:.4f}")

total_sum = X_test_transformed_fresh[:5].sum()
print(f"Total sum: {total_sum:.2f}")

# Check if there might be a different transformation approach
print(f"\nTransformed matrix shape: {X_test_transformed_fresh.shape}")
print(f"Feature names after transformation:")
feature_names = (
    ordinal_features_fresh +
    [f"Gender_{cat}" for cat in preprocessor_fresh.named_transformers_['nominal'].categories_[0][1:]] +
    [f"Location_{cat}" for cat in preprocessor_fresh.named_transformers_['nominal'].categories_[1][1:]] +
    [f"GameGenre_{cat}" for cat in preprocessor_fresh.named_transformers_['nominal'].categories_[2][1:]] +
    numerical_features_fresh
)
print(f"Estimated feature count: {len(feature_names)}")

print("\n" + "="*60)


SPECIAL CHECKS FOR EXPECTED vs ACTUAL
QUESTION 2 DETAILED ANALYSIS:
------------------------------
Total males from Europe: 1687
InGamePurchases values in males from Europe:
InGamePurchases
0.0    1237
1.0     299
NaN     151
Name: count, dtype: int64
Males from Europe with InGamePurchases = 1: 299
Males from Europe with InGamePurchases > 0: 299

QUESTION 6 DETAILED ANALYSIS:
------------------------------
Testing different interpretations of 'transformed Age':
Method 1 - Sum of imputed Age values: 63585.24
Method 2 - Sum of standardized Age values: 16.50
Method 3 - Difference (imputed - original): 5074.24
Method 3 - Negative difference: -5074.24

QUESTION 7 DETAILED ANALYSIS:
------------------------------
First 5 rows individual sums:
Row 0: 7.0651
Row 1: -2.0838
Row 2: 0.9699
Row 3: -1.7391
Row 4: 2.6252
Total sum: 6.84

Transformed matrix shape: (2000, 16)
Feature names after transformation:
Estimated feature count: 16



In [19]:
# FINAL CROSS-CHECK FOR Q6 & Q7 PER INSTRUCTIONS
print("\n" + "#"*70)
print("FINAL CROSS-CHECK (Q6 & Q7) - STRICT PIPELINE")
print("#"*70)

from sklearn.preprocessing import StandardScaler

# 1. Fresh split (to avoid any inadvertent mutation) -------------------------------------------------
_df = pd.read_csv('GA_2_dataset.csv')
X_full = _df.drop('EngagementLevel', axis=1)
y_full = _df['EngagementLevel']
X_train_fx, X_test_fx, y_train_fx, y_test_fx = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

# 2. Compute statistics ONLY on complete-case rows of the *training* set (rows with NO NaNs in ANY of the imputed cols)
impute_cols = ['Age','Location','GameDifficulty','InGamePurchases']
complete_case_mask = X_train_fx[impute_cols].notnull().all(axis=1)
complete_train = X_train_fx.loc[complete_case_mask].copy()

age_mean_complete = complete_train['Age'].mean()  # Age mean per rule
# Mode for GameDifficulty from complete rows
game_diff_mode_complete = complete_train['GameDifficulty'].mode()[0]

print(f"Complete-case training rows used for stats: {complete_train.shape[0]} (of {X_train_fx.shape[0]})")
print(f"Age mean (complete-case): {age_mean_complete:.6f}")
print(f"GameDifficulty mode (complete-case): {game_diff_mode_complete}")

# 3. Apply imputations to *copies* of train/test ----------------------------------------------------
X_train_imp_fx = X_train_fx.copy()
X_test_imp_fx  = X_test_fx.copy()

# Age
X_train_imp_fx['Age'].fillna(age_mean_complete, inplace=True)
X_test_imp_fx['Age'].fillna(age_mean_complete, inplace=True)
# Location -> 'Other'
X_train_imp_fx['Location'].fillna('Other', inplace=True)
X_test_imp_fx['Location'].fillna('Other', inplace=True)
# GameDifficulty -> mode
X_train_imp_fx['GameDifficulty'].fillna(game_diff_mode_complete, inplace=True)
X_test_imp_fx['GameDifficulty'].fillna(game_diff_mode_complete, inplace=True)
# InGamePurchases -> 0
X_train_imp_fx['InGamePurchases'].fillna(0, inplace=True)
X_test_imp_fx['InGamePurchases'].fillna(0, inplace=True)

# Verify no missing remain in imputed columns
assert X_train_imp_fx[impute_cols].isnull().sum().sum() == 0
assert X_test_imp_fx[impute_cols].isnull().sum().sum() == 0

# (Q6 RAW) Sum of *imputed* Age values in test (NOT scaled) ------------------------------------------
age_sum_imputed_test = X_test_imp_fx['Age'].sum()
print(f"Raw (imputed) Age sum in test set (for reference): {age_sum_imputed_test:.2f}")

# 4. Drop PlayerID before further preprocessing ------------------------------------------------------
X_train_stage = X_train_imp_fx.drop('PlayerID', axis=1)
X_test_stage  = X_test_imp_fx.drop('PlayerID', axis=1)

# 5. Transform categorical features -----------------------------------------------------------------
# Ordinal encode GameDifficulty with fixed mapping
ord_map = {'Easy':0,'Medium':1,'Hard':2}
X_train_stage['GameDifficulty'] = X_train_stage['GameDifficulty'].map(ord_map)
X_test_stage['GameDifficulty']  = X_test_stage['GameDifficulty'].map(ord_map)

# One-hot encode (drop_first=True) Gender, Location, GameGenre
cat_nom = ['Gender','Location','GameGenre']
train_dummies = pd.get_dummies(X_train_stage[cat_nom], drop_first=True)
test_dummies  = pd.get_dummies(X_test_stage[cat_nom], drop_first=True)

# Align columns (in case a category missing in test)
test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)

# Numerical columns (including ordinal-encoded GameDifficulty now numeric)
num_cols = ['Age','PlayTimeHours','InGamePurchases','SessionsPerWeek',
            'AvgSessionDurationMinutes','PlayerLevel','AchievementsUnlocked','GameDifficulty']

X_train_encoded = pd.concat([X_train_stage[num_cols], train_dummies], axis=1)
X_test_encoded  = pd.concat([X_test_stage[num_cols],  test_dummies],  axis=1)

# 6. Scale ALL features (categorical transformed + numerical) ---------------------------------------
scaler_all = StandardScaler()
X_train_scaled_all = scaler_all.fit_transform(X_train_encoded)
X_test_scaled_all  = scaler_all.transform(X_test_encoded)

# Identify column index of Age post-concatenation
age_col_index = list(X_train_encoded.columns).index('Age')

# Q6 INTERPRETATION: If 'transformed (imputed) Age' means standardized Age after scaling
age_standardized_sum_test = X_test_scaled_all[:, age_col_index].sum()
print(f"Standardized Age sum in test set (candidate for Q6): {age_standardized_sum_test:.2f}")

# 7. Q7: Sum of ALL values in first five rows of fully transformed *test* feature matrix -------------
first5_sum_all = X_test_scaled_all[:5].sum()
print(f"Sum of all values in first 5 transformed test rows (Q7): {first5_sum_all:.2f}")

# Provide detailed row sums for traceability
row_sums = [X_test_scaled_all[i].sum() for i in range(5)]
print("Row-wise sums (first 5):", [f"{s:.4f}" for s in row_sums])

print("\nCOLUMN ORDER USED:")
print(list(X_train_encoded.columns))

print("\nFINAL DECISION:")
print(f"Q6 answer (using standardized Age sum) => {age_standardized_sum_test:.2f}")
print(f"Q7 answer => {first5_sum_all:.2f}")
print("NOTE: If your expected Q6 is -6.16, compare with the standardized sum above. A sign or rounding mismatch could imply a different category handling or a different random state.")


######################################################################
FINAL CROSS-CHECK (Q6 & Q7) - STRICT PIPELINE
######################################################################
Complete-case training rows used for stats: 5684 (of 8000)
Age mean (complete-case): 31.684905
GameDifficulty mode (complete-case): Easy
Raw (imputed) Age sum in test set (for reference): 63580.58
Standardized Age sum in test set (candidate for Q6): 16.50
Sum of all values in first 5 transformed test rows (Q7): -7.17
Row-wise sums (first 5): ['5.3713', '-5.2415', '-3.1510', '-5.0848', '0.9314']

COLUMN ORDER USED:
['Age', 'PlayTimeHours', 'InGamePurchases', 'SessionsPerWeek', 'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked', 'GameDifficulty', 'Gender_Male', 'Location_Europe', 'Location_Other', 'Location_USA', 'GameGenre_RPG', 'GameGenre_Simulation', 'GameGenre_Sports', 'GameGenre_Strategy']

FINAL DECISION:
Q6 answer (using standardized Age sum) => 16.50
Q7 answer => -7.17
NOTE: If 