In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

In [53]:
# Load the housing CSV and print a quick overview of its contents.
# NOTE(review): the path is absolute ("/content/..."); confirm it matches the
# environment this notebook is run in.
filename = "/content/housing.csv"
df = pd.read_csv(filename)

print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the output.
df.info()

print("\nStatistical Summary of Numerical Columns:")
print(df.describe())

# Category breakdown, only when the column is present in the file.
if "Ocean Proximity" in df.columns:
    print("\nUnique Value Counts for 'Ocean Proximity':")
    print(df["Ocean Proximity"].value_counts())
else:
    print("\n'Ocean Proximity' column not found in the dataset.")

# Per-column count of missing entries; keep only columns that have any.
missing_values = df.isnull().sum()
missing_columns = missing_values[missing_values > 0]

if not missing_columns.empty:
    print("\nColumns with Missing Values:")
    print(missing_columns)
else:
    print("\nNo missing values found in the dataset.")


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           3000 non-null   float64
 1   latitude            3000 non-null   float64
 2   housing_median_age  3000 non-null   float64
 3   total_rooms         3000 non-null   float64
 4   total_bedrooms      3000 non-null   float64
 5   population          3000 non-null   float64
 6   households          3000 non-null   float64
 7   median_income       3000 non-null   float64
 8   median_house_value  3000 non-null   float64
dtypes: float64(9)
memory usage: 211.1 KB
None

Statistical Summary of Numerical Columns:
         longitude    latitude  housing_median_age   total_rooms  \
count  3000.000000  3000.00000         3000.000000   3000.000000   
mean   -119.589200    35.63539           28.845333   2599.578667   
std       1.994936     2.12967          

In [74]:
'''Write Python code to implement the following data preprocessing
techniques for Adult income data set'''

def createdata(n_rows=20, seed=None):
    """Build a small synthetic table used to demonstrate preprocessing steps.

    Parameters
    ----------
    n_rows : int, default 20
        Number of rows to generate.
    seed : int or None, default None
        When None, values are drawn from NumPy's global random state (the
        original behaviour). When given, a dedicated ``RandomState(seed)`` is
        used so the same table is produced on every call.

    Returns
    -------
    pandas.DataFrame
        Columns ``Age`` (integers 18-69), ``Salary`` (integers 30000-119999),
        ``Purchased`` (0/1), ``Gender`` ('Male'/'Female') and ``City``
        ('New York'/'San Francisco'/'Los Angeles').
    """
    # np.random (module) and RandomState expose the same randint/choice API,
    # so the draws below are identical in kind and order either way.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = {
        'Age': rng.randint(18, 70, size=n_rows),
        'Salary': rng.randint(30000, 120000, size=n_rows),
        'Purchased': rng.choice([0, 1], size=n_rows),
        'Gender': rng.choice(['Male', 'Female'], size=n_rows),
        'City': rng.choice(['New York', 'San Francisco', 'Los Angeles'], size=n_rows),
    }

    return pd.DataFrame(data)

In [75]:
# Build the synthetic demo frame. This rebinds `df`, which earlier held the
# housing CSV. Values come from NumPy's random generator, so they change
# between runs unless a seed is set elsewhere.
df = createdata()
# Preview the first ten rows.
df.head(10)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,21,98335,0,Female,San Francisco
1,29,107193,1,Female,San Francisco
2,40,65298,0,Male,Los Angeles
3,67,110619,0,Female,New York
4,35,35539,1,Female,Los Angeles
5,52,119090,1,Male,New York
6,40,87266,0,Male,San Francisco
7,20,118367,1,Male,Los Angeles
8,48,102695,0,Female,San Francisco
9,26,66367,0,Male,San Francisco


In [76]:
# Introduce some missing values for demonstration.
# Age and Salary are generated as integer columns, which cannot hold NaN.
# Cast them to float explicitly first instead of relying on pandas silently
# upcasting during the assignment (deprecated behaviour in recent pandas).
df = df.astype({'Age': 'float64', 'Salary': 'float64'})
df.loc[5, 'Age'] = np.nan
df.loc[10, 'Salary'] = np.nan
df.head(10)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,21.0,98335.0,0,Female,San Francisco
1,29.0,107193.0,1,Female,San Francisco
2,40.0,65298.0,0,Male,Los Angeles
3,67.0,110619.0,0,Female,New York
4,35.0,35539.0,1,Female,Los Angeles
5,,119090.0,1,Male,New York
6,40.0,87266.0,0,Male,San Francisco
7,20.0,118367.0,1,Male,Los Angeles
8,48.0,102695.0,0,Female,San Francisco
9,26.0,66367.0,0,Male,San Francisco


In [77]:
# Data cleaning: count the missing entries in every column.
missing_values = df.isna().sum()

# Show only the columns that contain at least one missing entry.
print(missing_values.loc[missing_values.gt(0)])

Age       1
Salary    1
dtype: int64


In [78]:
# Fill the missing values with a chosen statistic (zero, the mean, the median, etc.).
# Step 1: one imputer per column - median strategy for Age, mean strategy for Salary.
imputer1 = SimpleImputer(strategy="median")
imputer2 = SimpleImputer(strategy="mean")

# Work on an independent copy. A plain `df_copy = df` only creates a second
# name for the same object, so every write below would also change `df`.
df_copy = df.copy()

# Step 2: fit each imputer on its column and fill the gaps in one go.
# Note: SimpleImputer expects 2D input, hence the double brackets.
df_copy["Age"] = imputer1.fit_transform(df_copy[["Age"]])
df_copy["Salary"] = imputer2.fit_transform(df_copy[["Salary"]])

# Step 3: verify that there are no missing values left (both counts should be 0).
print(df_copy["Age"].isnull().sum())
print(df_copy["Salary"].isnull().sum())

0
0


In [79]:
# Handling categorical attributes:
# ordinal encoding for the Gender column and one-hot encoding for the City column.

# The explicit category order fixes the mapping: Male -> 0.0, Female -> 1.0.
ordinal_encoder = OrdinalEncoder(categories=[["Male", "Female"]])
df_copy["Gender_Encoded"] = ordinal_encoder.fit_transform(df_copy[["Gender"]])

# One-hot encode City from df_copy, the frame the result is built on,
# rather than from the separate name `df`.
onehot_encoder = OneHotEncoder()
encoded_data = onehot_encoder.fit_transform(df_copy[["City"]])

# The encoder output is sparse here; densify it for display.
encoded_array = encoded_data.toarray()

# Reuse df_copy's index so the column-wise concat lines rows up by label
# even if df_copy does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=onehot_encoder.get_feature_names_out(["City"]),
    index=df_copy.index,
)

# Append the indicator columns and drop the raw categorical columns without
# mutating an intermediate frame in place.
df_encoded = pd.concat([df_copy, encoded_df], axis=1).drop(columns=["Gender", "City"])

print(df_encoded.head())

    Age    Salary  Purchased  Gender_Encoded  City_Los Angeles  City_New York  \
0  21.0   98335.0          0             1.0               0.0            0.0   
1  29.0  107193.0          1             1.0               0.0            0.0   
2  40.0   65298.0          0             0.0               1.0            0.0   
3  67.0  110619.0          0             1.0               0.0            1.0   
4  35.0   35539.0          1             1.0               1.0            0.0   

   City_San Francisco  
0                 1.0  
1                 1.0  
2                 0.0  
3                 0.0  
4                 0.0  


In [80]:
# Removing outliers
# Outlier detection and treatment using the IQR rule: Salary values more than
# 1.5 * IQR outside the quartiles are capped at the corresponding fence.
# Pros: Simple and effective for mild outliers.
# Cons: May overly reduce variation if there are many extreme outliers.
# NOTE(review): the three names below are plain references to df_encoded, not
# independent copies, so the capping is also visible through df_encoded.
df_encoded_copy1 = df_encoded_copy2 = df_encoded_copy3 = df_encoded

# Quartiles and fences.
Q1, Q3 = df_encoded_copy1['Salary'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cap values at the fences; values inside the fences are left unchanged.
df_encoded_copy1['Salary'] = np.clip(
    df_encoded_copy1['Salary'].to_numpy(), lower_bound, upper_bound
)
df_encoded_copy1.head()

Unnamed: 0,Age,Salary,Purchased,Gender_Encoded,City_Los Angeles,City_New York,City_San Francisco
0,21.0,98335.0,0,1.0,0.0,0.0,1.0
1,29.0,107193.0,1,1.0,0.0,0.0,1.0
2,40.0,65298.0,0,0.0,1.0,0.0,0.0
3,67.0,110619.0,0,1.0,0.0,1.0,0.0
4,35.0,35539.0,1,1.0,1.0,0.0,0.0


In [81]:
# Data transformation
# Min-max scaling / normalization of Salary to the range 0-1.
# Pros: Keeps all data between 0 and 1; ideal for distance-based models.
# Cons: Can distort data distribution, especially with extreme outliers.
normalizer = MinMaxScaler()
salary_scaled = normalizer.fit_transform(df_encoded[['Salary']])
df_encoded[['Salary']] = salary_scaled
df_encoded.head()

Unnamed: 0,Age,Salary,Purchased,Gender_Encoded,City_Los Angeles,City_New York,City_San Francisco
0,21.0,0.745689,0,1.0,0.0,0.0,1.0
1,29.0,0.850876,1,1.0,0.0,0.0,1.0
2,40.0,0.353382,0,0.0,1.0,0.0,0.0
3,67.0,0.891559,0,1.0,0.0,1.0,0.0
4,35.0,0.0,1,1.0,1.0,0.0,0.0


In [69]:
# Standardization of Age (mean=0, variance=1).
# Pros: Works well for normally distributed data; suitable for many models.
# Cons: Sensitive to outliers.
# NOTE(review): this cell's execution count (69) is out of sequence and its
# saved output lists Glucose/BMI columns, so it was last run against a
# different frame - re-run the notebook top to bottom to refresh it.
scaler = StandardScaler()
age_standardized = scaler.fit_transform(df_encoded[['Age']])
df_encoded[['Age']] = age_standardized
df_encoded.head()


Unnamed: 0,Age,Glucose,BMI,Gender_Encoded,Outcome_0,Outcome_1
0,1.982577,93.0,0.991819,1.0,1.0,0.0
1,-0.1103,195.0,0.435823,0.0,0.0,1.0
2,0.229085,87.0,0.264387,1.0,0.0,1.0
3,-0.279993,113.0,0.78517,1.0,0.0,1.0
4,-1.015328,163.0,0.144165,0.0,0.0,1.0


In [59]:
'''Write Python code to implement the following data preprocessing
techniques for Diabetes data set'''

def create_diabetes_data(n_rows=20, seed=None):
    """Build a small synthetic diabetes-style table with two missing cells.

    Parameters
    ----------
    n_rows : int, default 20
        Number of rows to generate. Must be at least 11 because rows 5 and 10
        receive the demonstration NaN values.
    seed : int or None, default None
        When None, values are drawn from NumPy's global random state (the
        original behaviour). When given, a dedicated ``RandomState(seed)`` is
        used so the same table is produced on every call.

    Returns
    -------
    pandas.DataFrame
        Columns ``Age`` (integers 20-79), ``Glucose`` (float, whole numbers
        50-199, NaN at row 5), ``BMI`` (float in [18.5, 45.0), NaN at row 10),
        ``Gender`` ('Male'/'Female') and ``Outcome`` (0/1).

    Raises
    ------
    ValueError
        If ``n_rows`` is smaller than 11.
    """
    if n_rows < 11:
        # .loc on a missing label would silently append a new row instead.
        raise ValueError("n_rows must be at least 11 so rows 5 and 10 exist")

    # np.random (module) and RandomState expose the same API, so the draws
    # below are identical in kind and order either way.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = {
        'Age': rng.randint(20, 80, size=n_rows),
        # Stored as float up front: an integer column cannot hold the NaN
        # written below without pandas implicitly changing its dtype.
        'Glucose': rng.randint(50, 200, size=n_rows).astype('float64'),
        'BMI': rng.uniform(18.5, 45.0, size=n_rows),
        'Gender': rng.choice(['Male', 'Female'], size=n_rows),
        'Outcome': rng.choice([0, 1], size=n_rows),
    }

    df = pd.DataFrame(data)

    # Introduce some missing values for demonstration
    df.loc[5, 'Glucose'] = np.nan
    df.loc[10, 'BMI'] = np.nan

    return df

# Build the synthetic diabetes frame (generated in memory, not read from a
# file). This rebinds `df`, replacing whatever frame it held before.
df = create_diabetes_data()
print("Original Data:\n", df.head())

Original Data:
    Age  Glucose        BMI  Gender  Outcome
0   77     93.0  44.640289  Female        0
1   40    195.0  30.589381    Male        1
2   46     87.0  26.256915  Female        1
3   37    113.0  39.417936  Female        1
4   24    163.0  23.218723    Male        1


In [60]:
# Data cleaning: fill in the missing values.
# Glucose is filled with the column median, BMI with the column mean.
imputer_glucose = SimpleImputer(strategy="median")
imputer_bmi = SimpleImputer(strategy="mean")

# Impute on an independent copy so `df` keeps its missing values.
df_copy = df.copy()
glucose_filled = imputer_glucose.fit_transform(df_copy[["Glucose"]])
bmi_filled = imputer_bmi.fit_transform(df_copy[["BMI"]])
df_copy["Glucose"] = glucose_filled
df_copy["BMI"] = bmi_filled

# Remaining missing entries per imputed column.
print(df_copy["Glucose"].isna().sum())
print(df_copy["BMI"].isna().sum())


0
0


In [61]:
# Handling categorical attributes.
# Ordinal encoding for Gender; the explicit category order fixes the mapping
# Male -> 0.0, Female -> 1.0.
ordinal_encoder = OrdinalEncoder(categories=[["Male", "Female"]])
df_copy["Gender_Encoded"] = ordinal_encoder.fit_transform(df_copy[["Gender"]])

# One-hot encoding for the Outcome column (one indicator column per value).
onehot_encoder = OneHotEncoder()
encoded_outcome = onehot_encoder.fit_transform(df_copy[["Outcome"]]).toarray()

# Reuse df_copy's index so the column-wise concat lines rows up by label
# even if df_copy does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_outcome,
    columns=onehot_encoder.get_feature_names_out(["Outcome"]),
    index=df_copy.index,
)

# Append the indicator columns and drop the raw categorical columns without
# mutating an intermediate frame in place.
df_encoded = pd.concat([df_copy, encoded_df], axis=1).drop(columns=["Gender", "Outcome"])

print("\nAfter Encoding:\n", df_encoded.head())


After Encoding:
    Age  Glucose        BMI  Gender_Encoded  Outcome_0  Outcome_1
0   77     93.0  44.640289             1.0        1.0        0.0
1   40    195.0  30.589381             0.0        0.0        1.0
2   46     87.0  26.256915             1.0        0.0        1.0
3   37    113.0  39.417936             1.0        0.0        1.0
4   24    163.0  23.218723             0.0        0.0        1.0


In [62]:
# Outlier treatment with the IQR rule (for Glucose): values more than
# 1.5 * IQR outside the quartiles are capped at the corresponding fence.
glucose = df_encoded["Glucose"]
Q1 = glucose.quantile(0.25)
Q3 = glucose.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Cap values at the fences; values inside the fences are left unchanged.
df_encoded["Glucose"] = np.clip(glucose.to_numpy(), lower_bound, upper_bound)
df_encoded.head()

Unnamed: 0,Age,Glucose,BMI,Gender_Encoded,Outcome_0,Outcome_1
0,77,93.0,44.640289,1.0,1.0,0.0
1,40,195.0,30.589381,0.0,0.0,1.0
2,46,87.0,26.256915,1.0,0.0,1.0
3,37,113.0,39.417936,1.0,0.0,1.0
4,24,163.0,23.218723,0.0,0.0,1.0


In [63]:
# Data transformation
# Min-max scaling of BMI with MinMaxScaler's default settings.
normalizer = MinMaxScaler()
bmi_scaled = normalizer.fit_transform(df_encoded[['BMI']])
df_encoded[['BMI']] = bmi_scaled
df_encoded.head()

Unnamed: 0,Age,Glucose,BMI,Gender_Encoded,Outcome_0,Outcome_1
0,77,93.0,0.991819,1.0,1.0,0.0
1,40,195.0,0.435823,0.0,0.0,1.0
2,46,87.0,0.264387,1.0,0.0,1.0
3,37,113.0,0.78517,1.0,0.0,1.0
4,24,163.0,0.144165,0.0,0.0,1.0


In [64]:
# Standardize Age with StandardScaler's default settings.
scaler = StandardScaler()
age_standardized = scaler.fit_transform(df_encoded[['Age']])
df_encoded[['Age']] = age_standardized

# Show the fully preprocessed frame.
print("\nFinal Preprocessed Data:\n", df_encoded.head())


Final Preprocessed Data:
         Age  Glucose       BMI  Gender_Encoded  Outcome_0  Outcome_1
0  1.982577     93.0  0.991819             1.0        1.0        0.0
1 -0.110300    195.0  0.435823             0.0        0.0        1.0
2  0.229085     87.0  0.264387             1.0        0.0        1.0
3 -0.279993    113.0  0.785170             1.0        0.0        1.0
4 -1.015328    163.0  0.144165             0.0        0.0        1.0
