In [None]:
import pandas as pd
import numpy as np

np.random.seed(123)

# Generate numerical columns with NaN values
num1 = np.random.randn(500)
num2 = np.random.uniform(10, 50, size=500)
num3 = np.random.randint(1, 6, size=500).astype('float')
num1[np.random.choice(500, 60, replace=False)] = np.nan
num2[np.random.choice(500, 60, replace=False)] = np.nan
num3[np.random.choice(500, 60, replace=False)] = np.nan

# Generate messy categorical/text columns
cat1 = np.random.choice(['apple', 'banana', 'grape', np.nan], size=500, p=[0.3, 0.3, 0.3, 0.1])
cat2 = np.random.choice(['A', 'B', 'C', 'D'], size=500)
cat3 = np.random.choice(['Yes', 'No', np.nan], size=500, p=[0.45, 0.45, 0.10])

# Combine into DataFrame and add untidiness
df_untidy = pd.DataFrame({
    'Score': num1,
    'Height_cm': num2,
    'Rating': num3,
    'Fruit': cat1,
    'Group': cat2,
    'IsActive': cat3
})

# Add untidy issues:
df_untidy.loc[df_untidy.sample(frac=0.15, random_state=1).index, 'Height_cm'] = \
    df_untidy['Height_cm'].dropna().astype(str) + 'cm'   # Mix data type in Height_cm

df_untidy.loc[df_untidy.sample(frac=0.15, random_state=2).index, 'Rating'] = \
    'Rating: ' + df_untidy['Rating'].dropna().astype(str) # Prefix string for some ratings

df_untidy.head()

 '47.7445897977863cm' '12.199264419087633cm' '28.255294989080216cm'
 '49.46319556401813cm' '24.62082473109778cm' '14.069842039264948cm'
 '30.047591231707173cm' '39.596908878071915cm' '47.903273405497885cm'
 '18.383678364591226cm' '20.573119172482315cm' '26.918929348570554cm'
 '49.60956000856316cm' '42.872544601068384cm' '47.28354972175643cm' nan
 nan '36.95711914375434cm' nan '25.124837252276944cm'
 '41.69467488795425cm' '28.67398840853555cm' '31.546329651552767cm'
 '14.629781437963878cm' '10.509502547356245cm' '18.213990989134608cm'
 '24.690622519316705cm' '23.881006122675878cm' '16.30134384410546cm' nan
 '29.66207244015815cm' nan '45.91010976816148cm' nan
 '31.599083317303908cm' '25.60501244185592cm' '43.06120649429532cm' nan
 '12.847775069640711cm' '12.664116985961948cm' '47.658679896221cm'
 '33.2449203552472cm' '39.08764020242252cm' '28.33330050691903cm'
 '22.962576174780793cm' '27.894375469792145cm' '32.038131009330215cm'
 '14.575393291450958cm' nan nan '11.712756967808463cm' nan


Unnamed: 0,Score,Height_cm,Rating,Fruit,Group,IsActive
0,-1.085631,,2.0,banana,D,Yes
1,0.997345,16.480034,Rating: 5.0,apple,A,No
2,0.282978,49.244711,,banana,B,No
3,-1.506295,,3.0,grape,D,
4,-0.5786,31.599083317303908cm,,banana,C,No


In [None]:
#1.Q1. Identify columns with missing values and demonstrate at least two methods for imputing or filling these missing values (e.g., mean for numerics, mode for categoricals).
print("Missing values before imputation:")
print(df_untidy.isnull().sum())

df_q1 = df_untidy.copy()
df_q1['Score'] = df_q1['Score'].fillna(df_q1['Score'].mean())
df_q1['Height_cm'] = df_q1['Height_cm'].fillna(df_q1['Height_cm'].mode()[0])
df_q1['Rating'] = df_q1['Rating'].fillna(df_q1['Rating'].mode()[0])
df_q1['Fruit'] = df_q1['Fruit'].fillna(df_q1['Fruit'].mode()[0])
df_q1['IsActive'] = df_q1['IsActive'].fillna('Unknown')
print("\nMissing values after imputation:")
print(df_q1.isnull().sum())

Missing values before imputation:
Score        60
Height_cm    60
Rating       60
Fruit         0
Group         0
IsActive      0
dtype: int64

Missing values after imputation:
Score        0
Height_cm    0
Rating       0
Fruit        0
Group        0
IsActive     0
dtype: int64


  df_q1['Height_cm'] = df_q1['Height_cm'].fillna(df_q1['Height_cm'].mode()[0])


In [None]:
#  Q2. Encode Categorical Columns
from sklearn.preprocessing import LabelEncoder
df_q2 = df_q1.copy()
le = LabelEncoder()
df_q2['IsActive'] = le.fit_transform(df_q2['IsActive'].astype(str))
df_q2 = pd.get_dummies(df_q2, columns=['Group', 'Fruit'], drop_first=True)
# Show the first few rows after encoding
print(df_q2.head())


      Score             Height_cm       Rating  IsActive  Group_B  Group_C  \
0 -1.085631             16.480034          2.0         1    False    False   
1  0.997345             16.480034  Rating: 5.0         0    False    False   
2  0.282978             49.244711          3.0         0     True    False   
3 -1.506295             16.480034          3.0         2    False    False   
4 -0.578600  31.599083317303908cm          3.0         0    False     True   

   Group_D  Fruit_banana  Fruit_grape  Fruit_nan  
0     True          True        False      False  
1    False         False        False      False  
2    False          True        False      False  
3     True         False         True      False  
4    False          True        False      False  


In [None]:
#Q3. Detect and Fix Mixed Data Types
df_q3 = df_untidy.copy()

print("Columns with object/mixed types before cleaning:")
print(df_q3.select_dtypes(include='object').columns.tolist())
df_q3['Height_cm'] = df_q3['Height_cm'].astype(str).str.replace('cm', '', regex=True)
df_q3['Height_cm'] = pd.to_numeric(df_q3['Height_cm'], errors='coerce')

df_q3['Rating'] = df_q3['Rating'].astype(str).str.replace('Rating: ', '', regex=True)
df_q3['Rating'] = pd.to_numeric(df_q3['Rating'], errors='coerce')

print("\nData types after cleaning:")
print(df_q3.dtypes)

print("\nPreview of cleaned columns:")
print(df_q3[['Height_cm', 'Rating']].head(10))


Columns with object/mixed types before cleaning:
['Height_cm', 'Rating', 'Fruit', 'Group', 'IsActive']

Data types after cleaning:
Score        float64
Height_cm    float64
Rating       float64
Fruit         object
Group         object
IsActive      object
dtype: object

Preview of cleaned columns:
   Height_cm  Rating
0        NaN     2.0
1  16.480034     5.0
2  49.244711     NaN
3        NaN     3.0
4  31.599083     NaN
5        NaN     1.0
6  45.224286     3.0
7  25.652660     3.0
8  36.253728     5.0
9        NaN     2.0


In [None]:
# ---------------- Q4. Apply Scaling & Normalization ----------------
from sklearn.preprocessing import MinMaxScaler, StandardScaler

df_q4 = df_q3.copy()
num_cols = ['Score', 'Height_cm', 'Rating']

scaler_minmax = MinMaxScaler()
df_q4[[col + "_minmax" for col in num_cols]] = scaler_minmax.fit_transform(df_q4[num_cols])

scaler_std = StandardScaler()
df_q4[[col + "_std" for col in num_cols]] = scaler_std.fit_transform(df_q4[num_cols])

print("Scaled columns added:")
print(df_q4.head()[[col + "_minmax" for col in num_cols] + [col + "_std" for col in num_cols]])


Scaled columns added:
   Score_minmax  Height_cm_minmax  Rating_minmax  Score_std  Height_cm_std  \
0      0.346613               NaN           0.25  -1.074490            NaN   
1      0.683137          0.162150           1.00   1.021067      -1.132990   
2      0.567725          0.982630            NaN   0.302385       1.692095   
3      0.278651               NaN           0.50  -1.497695            NaN   
4      0.428529          0.540755            NaN  -0.564397       0.170627   

   Rating_std  
0   -0.736795  
1    1.424470  
2         NaN  
3   -0.016373  
4         NaN  


In [None]:
# ---------------- Q5. Validation Function ----------------
def validate_dataframe(df):
    report = {}
    report['missing_values'] = df.isnull().sum().to_dict()
    report['object_columns'] = df.select_dtypes(include='object').columns.tolist()
    if 'Rating' in df.columns:
        out_of_range = df[(df['Rating'] < 1) | (df['Rating'] > 5)]
        report['rating_out_of_range_count'] = len(out_of_range)
    return report

final_report = validate_dataframe(df_q4)
print("\nValidation Report:")
print(final_report)
print("\nCleaned DataFrame Preview:")
print(df_q4.head())


Validation Report:
{'missing_values': {'Score': 60, 'Height_cm': 60, 'Rating': 60, 'Fruit': 0, 'Group': 0, 'IsActive': 0, 'Score_minmax': 60, 'Height_cm_minmax': 60, 'Rating_minmax': 60, 'Score_std': 60, 'Height_cm_std': 60, 'Rating_std': 60}, 'object_columns': ['Fruit', 'Group', 'IsActive'], 'rating_out_of_range_count': 0}

Cleaned DataFrame Preview:
      Score  Height_cm  Rating   Fruit Group IsActive  Score_minmax  \
0 -1.085631        NaN     2.0  banana     D      Yes      0.346613   
1  0.997345  16.480034     5.0   apple     A       No      0.683137   
2  0.282978  49.244711     NaN  banana     B       No      0.567725   
3 -1.506295        NaN     3.0   grape     D      nan      0.278651   
4 -0.578600  31.599083     NaN  banana     C       No      0.428529   

   Height_cm_minmax  Rating_minmax  Score_std  Height_cm_std  Rating_std  
0               NaN           0.25  -1.074490            NaN   -0.736795  
1          0.162150           1.00   1.021067      -1.132990    1.42