## Assignment 2 - Lets Play with Cat Data :)

In [2]:
import pandas as pd
import numpy as np

np.random.seed(123) #randomization

In [3]:
# Generate numerical columns with NaN values
num1 = np.random.randn(500)
num2 = np.random.uniform(10, 50, size=500)
num3 = np.random.randint(1, 6, size=500).astype('float')
num1[np.random.choice(500, 60, replace=False)] = np.nan
num2[np.random.choice(500, 60, replace=False)] = np.nan
num3[np.random.choice(500, 60, replace=False)] = np.nan

In [4]:
# Generate messy categorical/text columns
cat1 = np.random.choice(['apple', 'banana', 'grape', np.nan], size=500, p=[0.3, 0.3, 0.3, 0.1])
cat2 = np.random.choice(['A', 'B', 'C', 'D'], size=500)
cat3 = np.random.choice(['Yes', 'No', np.nan], size=500, p=[0.45, 0.45, 0.10])

In [5]:
# Combine into DataFrame and add untidiness
df_untidy = pd.DataFrame({
    'Score': num1,
    'Height_cm': num2,
    'Rating': num3,
    'Fruit': cat1,
    'Group': cat2,
    'IsActive': cat3
})

# Add untidy issues:
df_untidy.loc[df_untidy.sample(frac=0.15, random_state=1).index, 'Height_cm'] = \
    df_untidy['Height_cm'].dropna().astype(str) + 'cm'   # Mix data type in Height_cm

df_untidy.loc[df_untidy.sample(frac=0.15, random_state=2).index, 'Rating'] = \
    'Rating: ' + df_untidy['Rating'].dropna().astype(str) # Prefix string for some ratings

df_untidy.head()

 '47.7445897977863cm' '12.199264419087633cm' '28.255294989080216cm'
 '49.46319556401813cm' '24.62082473109778cm' '14.069842039264948cm'
 '30.047591231707173cm' '39.596908878071915cm' '47.903273405497885cm'
 '18.383678364591226cm' '20.573119172482315cm' '26.918929348570554cm'
 '49.60956000856316cm' '42.872544601068384cm' '47.28354972175643cm' nan
 nan '36.95711914375434cm' nan '25.124837252276944cm'
 '41.69467488795425cm' '28.67398840853555cm' '31.546329651552767cm'
 '14.629781437963878cm' '10.509502547356245cm' '18.213990989134608cm'
 '24.690622519316705cm' '23.881006122675878cm' '16.30134384410546cm' nan
 '29.66207244015815cm' nan '45.91010976816148cm' nan
 '31.599083317303908cm' '25.60501244185592cm' '43.06120649429532cm' nan
 '12.847775069640711cm' '12.664116985961948cm' '47.658679896221cm'
 '33.2449203552472cm' '39.08764020242252cm' '28.33330050691903cm'
 '22.962576174780793cm' '27.894375469792145cm' '32.038131009330215cm'
 '14.575393291450958cm' nan nan '11.712756967808463cm' nan


Unnamed: 0,Score,Height_cm,Rating,Fruit,Group,IsActive
0,-1.085631,,2.0,banana,D,Yes
1,0.997345,16.480034,Rating: 5.0,apple,A,No
2,0.282978,49.244711,,banana,B,No
3,-1.506295,,3.0,grape,D,
4,-0.5786,31.599083317303908cm,,banana,C,No


- Q1. Identify columns with missing values and demonstrate at least two methods for imputing or filling these missing values (e.g., mean for numerics, mode for categoricals).

In [6]:
# 1. Identify missing values
missing_counts = df_untidy.isna().sum()
print("Missing Values per Column:\n", missing_counts)

# --- Imputation Method 1: Mean for numeric columns
df_imputed_mean = df_untidy.copy()
numeric_cols = df_imputed_mean.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    df_imputed_mean[col].fillna(df_imputed_mean[col].mean(), inplace=True)

# --- Imputation Method 2: Mode for categorical columns
categorical_cols = df_imputed_mean.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df_imputed_mean[col].fillna(df_imputed_mean[col].mode()[0], inplace=True)

print("\nMissing Values after Imputation:\n", df_imputed_mean.isna().sum())

Missing Values per Column:
 Score        60
Height_cm    60
Rating       60
Fruit         0
Group         0
IsActive      0
dtype: int64

Missing Values after Imputation:
 Score        0
Height_cm    0
Rating       0
Fruit        0
Group        0
IsActive     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed_mean[col].fillna(df_imputed_mean[col].mean(), inplace=True)
  df_imputed_mean[col].fillna(df_imputed_mean[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed_mean[col].fillna(df_imputed_mean[col].mode()[0], inplace=True)


- Q2.  Identify columns with non-numeric (categorical) data and convert them into a numeric format using encoding techniques such as one-hot encoding or label encoding.

In [7]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df_imputed_mean.copy()

# Identify categorical columns
categorical_cols = df_encoded.select_dtypes(include=['object']).columns
print("Categorical Columns:", categorical_cols.tolist())

# Apply Label Encoding for binary column (IsActive)
label_enc = LabelEncoder()
df_encoded['IsActive'] = label_enc.fit_transform(df_encoded['IsActive'])

# Apply One-Hot Encoding for other categorical columns (Fruit, Group)
df_encoded = pd.get_dummies(df_encoded, columns=['Fruit', 'Group'], drop_first=True)

print("\nEncoded DataFrame Shape:", df_encoded.shape)
df_encoded.head()


Categorical Columns: ['Height_cm', 'Rating', 'Fruit', 'Group', 'IsActive']

Encoded DataFrame Shape: (500, 10)


Unnamed: 0,Score,Height_cm,Rating,IsActive,Fruit_banana,Fruit_grape,Fruit_nan,Group_B,Group_C,Group_D
0,-1.085631,16.480034,2.0,1,True,False,False,False,False,True
1,0.997345,16.480034,Rating: 5.0,0,False,False,False,False,False,False
2,0.282978,49.244711,3.0,0,True,False,False,True,False,False
3,-1.506295,16.480034,3.0,2,False,True,False,False,False,True
4,-0.5786,31.599083317303908cm,3.0,0,True,False,False,False,True,False


- Q3. Detect any columns in the DataFrame that contain mixed data types (such as numbers stored as strings or strings with prefixes). Write code to clean and convert these columns to appropriate, consistent types.

In [8]:
# Make a copy to work on
df_cleaned = df_encoded.copy()

# --- Clean Height_cm (remove 'cm' suffix and convert to float)
df_cleaned['Height_cm'] = df_cleaned['Height_cm'].astype(str).str.replace('cm', '', regex=False)
df_cleaned['Height_cm'] = pd.to_numeric(df_cleaned['Height_cm'], errors='coerce')

# --- Clean Rating (remove 'Rating: ' prefix and convert to float)
df_cleaned['Rating'] = df_cleaned['Rating'].astype(str).str.replace('Rating: ', '', regex=False)
df_cleaned['Rating'] = pd.to_numeric(df_cleaned['Rating'], errors='coerce')

print(df_cleaned.dtypes)


Score           float64
Height_cm       float64
Rating          float64
IsActive          int64
Fruit_banana       bool
Fruit_grape        bool
Fruit_nan          bool
Group_B            bool
Group_C            bool
Group_D            bool
dtype: object


- Q4. Apply scaling and/or normalization techniques (such as Min-Max Scaling and Standardization) to the numerical columns to prepare them for downstream machine learning tasks.

In [9]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

df_scaled = df_cleaned.copy()
numerical_cols = df_scaled.select_dtypes(include=[np.number]).columns

# --- Min-Max Scaling
scaler_minmax = MinMaxScaler()
df_scaled_minmax = df_scaled.copy()
df_scaled_minmax[numerical_cols] = scaler_minmax.fit_transform(df_scaled_minmax[numerical_cols])

# --- Standardization (Z-score scaling)
scaler_std = StandardScaler()
df_scaled_std = df_scaled.copy()
df_scaled_std[numerical_cols] = scaler_std.fit_transform(df_scaled_std[numerical_cols])

print("\nMin-Max Scaled Head:\n", df_scaled_minmax[numerical_cols].head())
print("\nStandardized Head:\n", df_scaled_std[numerical_cols].head())



Min-Max Scaled Head:
       Score  Height_cm  Rating  IsActive
0  0.346613   0.162150    0.25       0.5
1  0.683137   0.162150    1.00       0.0
2  0.567725   0.982630    0.50       0.0
3  0.278651   0.162150    0.50       1.0
4  0.428529   0.540755    0.50       0.0

Standardized Head:
       Score  Height_cm    Rating  IsActive
0 -1.145411  -0.989366 -0.783319  0.606537
1  1.088461  -0.989366  1.520560 -0.964803
2  0.322344   1.814000 -0.015359 -0.964803
3 -1.596549  -0.989366 -0.015359  2.177878
4 -0.601650   0.304229 -0.015359 -0.964803


- Q5. Write a function to check for and report any remaining inconsistencies (missing values, mixed types, out-of-range values) in the cleaned DataFrame. Validate that the preprocessing steps have successfully prepared the data for analysis.

In [11]:
def validate_dataframe(df):
    report = {}

    # 1. Missing values check
    missing = df.isna().sum()
    report['missing_values'] = missing[missing > 0].to_dict()

    # 2. Mixed data types check (if object column should be numeric)
    mixed_types = {}
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                pd.to_numeric(df[col])
            except:
                mixed_types[col] = "Contains non-numeric values"
    report['mixed_types'] = mixed_types

    # 3. Out-of-range values (example: Height cannot be negative)
    if 'Height_cm' in df.columns:
        report['negative_heights'] = df[df['Height_cm'] < 0]['Height_cm'].count()

    return report


# Validate cleaned data
validation_report = validate_dataframe(df_scaled_minmax)
print("\nValidation Report:\n", validation_report)



Validation Report:
 {'missing_values': {}, 'mixed_types': {}, 'negative_heights': np.int64(0)}
