In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [5]:
# Load and preprocess data
tr_data = pd.read_csv('train.csv')
ts_data = pd.read_csv('test.csv')

In [6]:
# Identify numerical and categorical columns
numerical_columns = tr_data.drop(['id', 'class'], axis=1).select_dtypes(exclude=['object']).columns
categorical_columns = tr_data.drop(['id', 'class'], axis=1).select_dtypes(include=['object']).columns

# Fill missing values
tr_data[categorical_columns] = tr_data[categorical_columns].fillna('missing')
ts_data[categorical_columns] = ts_data[categorical_columns].fillna('missing')

tr_data[numerical_columns] = tr_data[numerical_columns].fillna(tr_data[numerical_columns].median())
ts_data[numerical_columns] = ts_data[numerical_columns].fillna(ts_data[numerical_columns].median())

# Remove duplicates
tr_data = tr_data.drop_duplicates()



In [7]:
cap_shape = ['x', 'f', 's', 'b', 'o', 'p', 'c', 'd', 'e',
             'n', 'w', 't', 'g', 'y', 'r', 'a', 'u', 'z', 'l', 'i', 'k', 'h', 'missing']

def map_cap_shape(value):
    return value if value in cap_shape else 'unknown'

tr_data['cap-shape'].str.strip()
tr_data['cap-shape'] = tr_data['cap-shape'].str.replace('is s', 's')
tr_data['cap-shape'] = tr_data['cap-shape'].str.replace('is p', 'p')
tr_data['cap-shape'] = tr_data['cap-shape'].apply(map_cap_shape)
ts_data['cap-shape'] = ts_data['cap-shape'].apply(map_cap_shape)


In [8]:
cap_surface = ['t', 's', 'y', 'h', 'f', 'k', 'e', 'c', 'z', 'g', 'w', 'n', 'd', 'b', 'i', 'u', 'a', 'p', 'o', 'x', 'm', 'l', 'r', 'missing']
def map_cap_surface(value):
    return value if value in cap_surface else 'unknown'

# Cap Color
cap_color = ['u', 'o', 'b', 'g', 'w', 'n', 'e', 'y', 'r', 'p', 'k', 'l', 'i', 'h', 'd', 's', 'a', 'f', 'c', 'x', 'm', 'z', 't', 'missing']
def map_cap_color(value):
    return value if value in cap_color else 'unknown'

# Does Bruise or Bleed
does_bruise_or_bleed = ['f', 't', 'w', 'c', 'h', 'y', 'a', 'b', 'x', 's', 'k', 'p', 'e', 'l', 'd', 'g', 'o', 'z', 'n', 'i', 'r', 'u', 'missing']
def map_does_bruise_or_bleed(value):
    return value if value in does_bruise_or_bleed else 'unknown'

# Gill Attachment
gill_attachment = ['a', 'x', 's', 'd', 'e', 'f', 'p', 'l', 'm', 'b', 'n', 'g', 'i', 'u', 't', 'o', 'c', 'w', 'k', 'r', 'h', 'z', 'y', 'missing']
def map_gill_attachment(value):
    return value if value in gill_attachment else 'unknown'

# Gill Spacing
gill_spacing = ['c', 'd', 'f', 'x', 'b', 'a', 'k', 'e', 'y', 's', 'p', 't', 'i', 'w', 'h', 'l', 'r', 'n', 'g', 'missing']
def map_gill_spacing(value):
    return value if value in gill_spacing else 'unknown'

# Gill Color
gill_color = ['w', 'n', 'g', 'k', 'y', 'f', 'p', 'o', 'b', 'u', 'e', 'r', 'd', 't', 'z', 'h', 'x', 's', 'c', 'm', 'l', 'a', 'i', 'missing']
def map_gill_color(value):
    return value if value in gill_color else 'unknown'

# Stem Root
stem_root = ['b', 'c', 'r', 's', 'f', 'y', 'o', 'k', 'd', 'n', 'w', 'u', 'p', 'x', 'i', 'a', 't', 'm', 'l', 'h', 'g', 'e', 'z', 'missing']
def map_stem_root(value):
    return value if value in stem_root else 'unknown'

# Stem Surface
stem_surface = ['y', 's', 't', 'g', 'h', 'k', 'i', 'f', 'l', 'd', 'x', 'w', 'a', 'o', 'c', 'n', 'm', 'e', 'p', 'z', 'b', 'r', 'u', 'missing']
def map_stem_surface(value):
    return value if value in stem_surface else 'unknown'

# Stem Color
stem_color = ['w', 'o', 'n', 'y', 'e', 'u', 'p', 'f', 'g', 'r', 'k', 'l', 'b', 't', 'z', 'a', 'h', 'd', 's', 'i', 'c', 'x', 'm', 'missing']
def map_stem_color(value):
    return value if value in stem_color else 'unknown'

# Veil Type
veil_type = ['u', 'd', 'a', 'h', 'g', 'c', 'e', 'y', 'i', 'f', 't', 'w', 'p', 'b', 's', 'k', 'r', 'l', 'n', 'missing']
def map_veil_type(value):
    return value if value in veil_type else 'unknown'

# Veil Color
veil_color = ['n', 'w', 'k', 'y', 'e', 'u', 'p', 'd', 'g', 'r', 'h', 's', 't', 'c', 'o', 'i', 'f', 'a', 'b', 'l', 'z', 'missing']
def map_veil_color(value):
    return value if value in veil_color else 'unknown'

# Has Ring
has_ring = ['f', 't', 'h', 'r', 'y', 'c', 'e', 'g', 'l', 's', 'p', 'x', 'k', 'z', 'd', 'o', 'n', 'm', 'i', 'w', 'a', 'missing']
def map_has_ring(value):
    return value if value in has_ring else 'unknown'

# Ring Type
ring_type = ['f', 'z', 'e', 'p', 'l', 'g', 'r', 'm', 'y', 'h', 'o', 't', 'a', 'd', 's', 'x', 'b', 'u', 'n', 'w', 'i', 'k', 'c', 'missing']
def map_ring_type(value):
    return value if value in ring_type else 'unknown'

# Spore Print Color
spore_print_color = ['k', 'w', 'p', 'n', 'r', 'u', 'g', 't', 'f', 'd', 'l', 'y', 'a', 's', 'e', 'o', 'c', 'b', 'h', 'x', 'i', 'm', 'missing']
def map_spore_print_color(value):
    return value if value in spore_print_color else 'unknown'

# Habitat
habitat = ['d', 'l', 'g', 'h', 'p', 'm', 'u', 'w', 'y', 'n', 'a', 's', 'k', 'z', 'b', 't', 'c', 'e', 'r', 'f', 'o', 'i', 'x', 'missing']
def map_habitat(value):
    return value if value in habitat else 'unknown'

# Season
season = ['a', 'w', 'u', 's']
def map_season(value):
    return value if value in season else 'unknown'

# Apply the mapping functions to both training and test data
for col, map_func in [
    ('cap-surface', map_cap_surface),
    ('cap-color', map_cap_color),
    ('does-bruise-or-bleed', map_does_bruise_or_bleed),
    ('gill-attachment', map_gill_attachment),
    ('gill-spacing', map_gill_spacing),
    ('gill-color', map_gill_color),
    ('stem-root', map_stem_root),
    ('stem-surface', map_stem_surface),
    ('stem-color', map_stem_color),
    ('veil-type', map_veil_type),
    ('veil-color', map_veil_color),
    ('has-ring', map_has_ring),
    ('ring-type', map_ring_type),
    ('spore-print-color', map_spore_print_color),
    ('habitat', map_habitat),
    ('season', map_season)
]:
    if col in tr_data.columns:
        tr_data[col] = tr_data[col].str.strip().str.lower().apply(map_func)
    if col in ts_data.columns:
        ts_data[col] = ts_data[col].str.strip().str.lower().apply(map_func)

In [9]:
encoders = {}
for col in categorical_columns:
    encoders[col] = LabelEncoder()
    tr_data[col] = encoders[col].fit_transform(tr_data[col])
    ts_data[col] = encoders[col].transform(ts_data[col])


In [14]:
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Prepare the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore',  sparse_output=False), categorical_columns)
    ])

# Fit the preprocessor and transform the data
X_train = preprocessor.fit_transform(tr_data.drop(['id', 'class'], axis=1))
X_test = preprocessor.transform(ts_data.drop(['id'], axis=1))

# Ensure we're working with dense arrays
if sparse.issparse(X_train):
    X_train = X_train.toarray()
if sparse.issparse(X_test):
    X_test = X_test.toarray()

In [16]:
# Encode the target variable
le = LabelEncoder()
y_train = le.fit_transform(tr_data['class'])

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_test_tensor = torch.FloatTensor(X_test)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [17]:
# Define the neural network
class MushroomClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MushroomClassifier, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.dropout(x)
        x = self.relu(self.layer2(x))
        x = self.dropout(x)
        x = self.layer3(x)
        return x

# Initialize the model
input_dim = X_train.shape[1]
hidden_dim = 128
output_dim = len(np.unique(y_train))
model = MushroomClassifier(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
# Train the model
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Make predictions
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    _, predicted = torch.max(y_pred, 1)

predicted_labels  = le.inverse_transform(predicted.numpy())

Epoch [10/50], Loss: 0.0026
Epoch [20/50], Loss: 0.0031
Epoch [30/50], Loss: 0.3341
Epoch [40/50], Loss: 0.0057
Epoch [50/50], Loss: 0.1053


In [19]:
submit = pd.DataFrame({
    'id': ts_data['id'],
    'class': predicted_labels
})

submit.to_csv("submission.csv", index=False)
print("Predictions have been added to the test data and saved as 'submission.csv'")

Predictions have been added to the test data and saved as 'submission.csv'


In [20]:
X_train_subset, X_val, y_train_subset, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.LongTensor(y_val)

model.eval()
with torch.no_grad():
    y_val_pred = model(X_val_tensor)
    _, predicted = torch.max(y_val_pred, 1)

accuracy = accuracy_score(y_val, predicted)
print(f"\nModel Validation Accuracy: {accuracy}")
print("\nClassification Report:")
print(classification_report(y_val, predicted, target_names=le.classes_))


Model Validation Accuracy: 0.9918654323383954

Classification Report:
              precision    recall  f1-score   support

           e       0.99      0.99      0.99    282397
           p       0.99      0.99      0.99    340992

    accuracy                           0.99    623389
   macro avg       0.99      0.99      0.99    623389
weighted avg       0.99      0.99      0.99    623389

