<a href="https://colab.research.google.com/github/rohit-mhatre/CS-535-DM/blob/main/dataMining_Assignment2_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

problem 1

In [None]:
import pandas as pd


In [None]:
# Load data from adult.data and adult.test files
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

In [None]:
train_data = pd.read_csv("adult.data", names=columns, na_values=[" ?", "?"])
test_data = pd.read_csv("adult.test", names=columns, na_values=[" ?", "?"])

In [None]:
# Merge the datasets for cleaning
combined_data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Handle missing values
# You can handle missing values using techniques like dropping rows, imputation, etc.
# For simplicity, let's drop rows with missing values in this example
combined_data.dropna(inplace=True)

In [None]:
# Preprocess the data (e.g., convert categorical variables to numerical)
# One-hot encoding for categorical variables
combined_data = pd.get_dummies(combined_data, columns=[
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country"
])

In [None]:
# Convert income column to binary labels (0 for <=50K, 1 for >50K)
combined_data["income"] = combined_data["income"].apply(lambda x: 0 if x == " <=50K" else 1)

In [None]:
# Split data back into train and test sets
train_cleaned = combined_data.iloc[:train_data.shape[0]]
test_cleaned = combined_data.iloc[train_data.shape[0]:]

In [None]:
# Save cleaned data to new files if needed
train_cleaned.to_csv("cleaned_adult_train.csv", index=False)
test_cleaned.to_csv("cleaned_adult_test.csv", index=False)

In [None]:
print("Data cleaning and preprocessing completed.")


Data cleaning and preprocessing completed.


problem 2:

In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [18]:
# Load cleaned training and evaluation datasets
train_data = pd.read_csv("cleaned_adult_train.csv")
eval_data = pd.read_csv("cleaned_adult_test.csv")

In [19]:
# Separate features and labels
X_train = train_data.drop(columns=["income"])
y_train = train_data["income"]
X_eval = eval_data.drop(columns=["income"])
y_eval = eval_data["income"]

In [20]:
# Initialize and train Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

In [21]:
# Predict on evaluation dataset
y_pred = rf_classifier.predict(X_eval)

In [22]:
# Calculate classification error rate
classification_error_rate = 1 - accuracy_score(y_eval, y_pred)


In [23]:
# Print classification error rate
print("Classification Error Rate:", classification_error_rate)

Classification Error Rate: 0.7628939262301556


problem 4:

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# Load cleaned evaluation dataset
eval_data = pd.read_csv("cleaned_adult_test.csv")
X_eval = eval_data.drop(columns=["income"])
y_eval = eval_data["income"]

In [3]:
# Define sampling rates
sampling_rates = [50, 60, 70, 80, 90]

In [4]:
# Initialize lists to store error rates
error_rates = []

In [5]:
# Repeat the process 5 times for each sampling rate
for rate in sampling_rates:
    error_rates_for_rate = []
    for _ in range(5):  # Repeat 5 times
        # Load cleaned training dataset
        train_data = pd.read_csv("cleaned_adult_train.csv")

        # Separate samples by income class
        low_income_samples = train_data[train_data['income'] == 0].sample(frac=(rate/100))
        high_income_samples = train_data[train_data['income'] == 1].sample(frac=(rate/100))

        # Concatenate samples and shuffle
        downsampled_train_data = pd.concat([low_income_samples, high_income_samples]).sample(frac=1)

        # Separate features and labels
        X_train = downsampled_train_data.drop(columns=["income"])
        y_train = downsampled_train_data["income"]

        # Initialize and train Random Forest classifier
        rf_classifier = RandomForestClassifier(n_estimators=100, random_state=47)
        rf_classifier.fit(X_train, y_train)

        # Predict on evaluation dataset
        y_pred = rf_classifier.predict(X_eval)

        # Calculate error rate
        error_rate = 1 - accuracy_score(y_eval, y_pred)
        error_rates_for_rate.append(error_rate)

    # Store mean and standard deviation for each rate
    error_rates.append((rate, np.mean(error_rates_for_rate), np.std(error_rates_for_rate)))

In [6]:
# Print results
print("Sampling Rate\tMean Error Rate\tStandard Deviation")
for rate, mean, std_dev in error_rates:
    print(f"{rate}%\t\t{mean:.6f}\t\t{std_dev:.6f}")

Sampling Rate	Mean Error Rate	Standard Deviation
50%		0.767096		0.002165
60%		0.767301		0.001447
70%		0.765105		0.001416
80%		0.765848		0.002814
90%		0.764600		0.001410


question 3:

In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load cleaned training and evaluation datasets
train_data = pd.read_csv("cleaned_adult_train.csv")
eval_data = pd.read_csv("cleaned_adult_test.csv")

# Separate features and labels
X_train = train_data.drop(columns=["income"])
y_train = train_data["income"]
X_eval = eval_data.drop(columns=["income"])
y_eval = eval_data["income"]

# Define numerical and categorical features
numerical_features = X_train.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns

# Define preprocessing steps for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Scale numerical features
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combine preprocessing steps for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Initialize and train Random Forest classifier with feature engineering
rf_classifier = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

rf_classifier.fit(X_train, y_train)

# Predict on evaluation dataset
y_pred = rf_classifier.predict(X_eval)

# Calculate classification error rate
classification_error_rate = 1 - accuracy_score(y_eval, y_pred)

# Print classification error rate
print("Classification Error Rate with Feature Engineering:", classification_error_rate)


Classification Error Rate with Feature Engineering: 0.7624200300134271


Question 5:

In [33]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load cleaned training and evaluation datasets
train_data = pd.read_csv("cleaned_adult_train.csv")
eval_data = pd.read_csv("cleaned_adult_test.csv")

# Separate features and labels
X_train = train_data.drop(columns=["income"])
y_train = train_data["income"]
X_eval = eval_data.drop(columns=["income"])
y_eval = eval_data["income"]

# Initialize and train XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

# Predict on evaluation dataset
y_pred = xgb_classifier.predict(X_eval)

# Calculate classification error rate
classification_error_rate = 1 - accuracy_score(y_eval, y_pred)

# Print classification error rate
print("Classification Error Rate with XGBoost:", classification_error_rate)


Classification Error Rate with XGBoost: 0.7716610062396335
