In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Read the three competition files from the Kaggle input directory.
train_df = pd.read_csv(
    '/kaggle/input/thapar-summer-school-2024-competition-2/train.csv/train.csv'
)
test_df = pd.read_csv(
    '/kaggle/input/thapar-summer-school-2024-competition-2/test.csv/test.csv'
)
sample_submission_df = pd.read_csv(
    '/kaggle/input/thapar-summer-school-2024-competition-2/sample_submission (1).csv'
)

# Preview the top rows of each frame; sep="\n" puts every heading on its own
# line directly above the table it describes.
print("Train Dataset:", train_df.head(), sep="\n")
print("\nTest Dataset:", test_df.head(), sep="\n")
print("\nSample Submission:", sample_submission_df.head(), sep="\n")

In [None]:
# End-to-end pipeline: impute -> encode -> split/scale -> fit a soft-voting
# ensemble -> validate -> refit on all training rows -> write submission.csv.
#
# NOTE: this cell updates train_df in place and rebinds test_df to a scaled
# NumPy array, so the data-loading cell must be re-run before executing this
# cell a second time.

TARGET = 'NObeyesdad'
SEED = 42

# --- Handling missing values -------------------------------------------------
# Imputation statistics are computed on the training data only and reused for
# the test set, so both sets go through the same learned transformation.
numeric_columns = train_df.select_dtypes(include=['number']).columns
numeric_fill = train_df[numeric_columns].mean()
train_df[numeric_columns] = train_df[numeric_columns].fillna(numeric_fill)
test_df[numeric_columns] = test_df[numeric_columns].fillna(numeric_fill)

# Categorical (object) columns, excluding the target, are filled with the
# most frequent training value of each column.
categorical_columns = train_df.select_dtypes(include=['object']).columns
categorical_columns = categorical_columns.drop(TARGET)
categorical_fill = train_df[categorical_columns].mode().iloc[0]
train_df[categorical_columns] = train_df[categorical_columns].fillna(categorical_fill)
test_df[categorical_columns] = test_df[categorical_columns].fillna(categorical_fill)

# --- Encoding categorical variables ------------------------------------------
# Each encoder is fitted on the union of train and test values so that both
# frames share one category -> integer mapping and no test category is unseen.
combined_df = pd.concat(
    [train_df[categorical_columns], test_df[categorical_columns]],
    ignore_index=True,
)

label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(combined_df[col])
    # Transform each frame directly; this does not depend on how the frames
    # are indexed, unlike slicing the combined frame and assigning it back.
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le

# Encode the target variable; the encoder is kept to map predictions back.
target_le = LabelEncoder()
train_df[TARGET] = target_le.fit_transform(train_df[TARGET])
label_encoders[TARGET] = target_le

# --- Features / target -------------------------------------------------------
# 'id' is excluded from the features when present. This assumes it is a pure
# row identifier, as its use as the submission key suggests - TODO confirm.
non_feature_columns = [TARGET, 'id'] if 'id' in train_df.columns else [TARGET]
X_train = train_df.drop(columns=non_feature_columns)
y_train = train_df[TARGET]
feature_columns = X_train.columns  # captured before X_train becomes an array

# Hold out 20% of the rows, stratified by class, for validation.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=SEED, stratify=y_train
)

# --- Scaling -----------------------------------------------------------------
scaler = StandardScaler()
# First fit: statistics from the training split only, applied to validation.
X_train_split = scaler.fit_transform(X_train_split)
X_val = scaler.transform(X_val)
# Second fit: statistics from the full training set, used for the final model
# and the test data. After this line `scaler` holds the full-data statistics.
X_train = scaler.fit_transform(X_train)
# Select test columns by name so they match the training features in both
# membership and order.
test_df = scaler.transform(test_df[feature_columns])

# --- Models ------------------------------------------------------------------
# NOTE(review): use_label_encoder is believed to be ignored by recent XGBoost
# releases - confirm against the installed version before removing it.
xgb_model = XGBClassifier(random_state=SEED, use_label_encoder=False, eval_metric='mlogloss')
rf_model = RandomForestClassifier(random_state=SEED)
lgb_model = LGBMClassifier(random_state=SEED, verbosity=-1)

# Soft voting combines the three models' predicted class probabilities.
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('lgb', lgb_model)
    ],
    voting='soft'
)

# --- Validation --------------------------------------------------------------
voting_clf.fit(X_train_split, y_train_split)
y_val_pred = voting_clf.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {accuracy:.4f}')

# --- Final fit and prediction ------------------------------------------------
# Refit the same ensemble on every training row before predicting the test set.
voting_clf.fit(X_train, y_train)
test_predictions = voting_clf.predict(test_df)

# Map predictions back to original labels
test_predictions_labels = label_encoders[TARGET].inverse_transform(test_predictions)

# --- Submission --------------------------------------------------------------
# Predictions are paired with the sample submission ids by position, so the
# two must at least have the same length.
if len(test_predictions_labels) != len(sample_submission_df):
    raise ValueError(
        f"Got {len(test_predictions_labels)} predictions for "
        f"{len(sample_submission_df)} submission rows."
    )

submission_df = pd.DataFrame({
    'id': sample_submission_df['id'],
    TARGET: test_predictions_labels
})

submission_df.to_csv('submission.csv', index=False)
print("Submission file created successfully.")
