In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the raw training and test tables from disk (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'F:/MLpractrice/ts/data/train.csv',
        'F:/MLpractrice/ts/data/test.csv',
    )
)

# Columns that are label-encoded, and the per-amenity spending columns
_CATEGORICAL_COLUMNS = ('HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side')
_SPENDING_COLUMNS = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


# Fill missing values and preprocess the data
def preprocess_data(df, label_encoders=None, is_train=True):
    """Impute missing values, split the cabin field and label-encode categoricals.

    The input frame is not modified; a processed copy is returned.

    Steps:
      * 'CryoSleep', 'HomePlanet', 'Destination' and 'VIP' are filled with
        their most frequent value, 'Age' with its median and the spending
        columns with 0. All statistics are computed from ``df`` itself.
      * 'Cabin' (when present) is split on '/' into 'Deck', 'CabinNum' and
        'Side' and then dropped; missing cabins become 'Unknown/0/Unknown'.
        When the column is absent the three derived columns are filled with
        'Unknown', 0 and 'Unknown'.
      * The categorical columns are label-encoded.
      * 'Name' is dropped if present.

    Args:
        df: Raw passenger table.
        label_encoders: Mapping of column name to an already fitted encoder
            exposing ``transform``. When ``None``, a fresh ``LabelEncoder``
            is fitted per categorical column.
        is_train: Currently unused; whether encoders are fitted is decided
            solely by ``label_encoders``. Kept for backward compatibility.

    Returns:
        A ``(processed_df, label_encoders)`` tuple. ``label_encoders`` is the
        mapping that was passed in, or the newly fitted one.

    Raises:
        KeyError: If a required column is missing, or a mode-imputed column
            contains no non-missing values.
        ValueError: If a cabin number cannot be converted to ``int``, or (with
            scikit-learn's ``LabelEncoder``) a category was not seen at fit time.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated and has no effect on `df` once
    # pandas Copy-on-Write is active.
    for column in ('CryoSleep', 'HomePlanet', 'Destination', 'VIP'):
        df[column] = df[column].fillna(df[column].mode()[0])
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df[_SPENDING_COLUMNS] = df[_SPENDING_COLUMNS].fillna(0)

    # Derive Deck / CabinNum / Side, tolerating a frame without 'Cabin'
    if 'Cabin' in df.columns:
        cabin_parts = df['Cabin'].fillna('Unknown/0/Unknown').str.split('/', expand=True)
        df[['Deck', 'CabinNum', 'Side']] = cabin_parts
        df['CabinNum'] = df['CabinNum'].astype(int)
        df = df.drop(columns='Cabin')
    else:
        df['Deck'] = 'Unknown'
        df['CabinNum'] = 0
        df['Side'] = 'Unknown'

    # Encode categorical features: fit new encoders only when none were supplied
    fit_encoders = label_encoders is None
    if fit_encoders:
        label_encoders = {}
    for column in _CATEGORICAL_COLUMNS:
        if fit_encoders:
            label_encoders[column] = LabelEncoder()
            df[column] = label_encoders[column].fit_transform(df[column])
        else:
            df[column] = label_encoders[column].transform(df[column])

    # Drop the free-text name column; ignore its absence, mirroring the 'Cabin' check
    df = df.drop(columns=['Name'], errors='ignore')

    return df, label_encoders

# Preprocess training data; this call also fits the label encoders reused for the test set
train_df, label_encoders = preprocess_data(train_df)

# Extract features and target (PassengerId is excluded from the feature matrix)
X = train_df.drop(['Transported', 'PassengerId'], axis=1)
y = train_df['Transported']

# Split BEFORE scaling so the validation rows do not influence the fitted scaler
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: fit on the training split only, then apply the same transform elsewhere
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Train a RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out validation split
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))

# Preprocess the test data with the encoders fitted on the training data
test_df, _ = preprocess_data(test_df, label_encoders, is_train=False)

# Drop PassengerId for prediction; it is kept in test_df for the submission file
X_test = test_df.drop('PassengerId', axis=1)

# Feature scaling with the scaler fitted on the training split
X_test = scaler.transform(X_test)

# Make predictions
test_predictions = model.predict(X_test)

# Prepare submission file: one row per test passenger with the predicted label
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': test_predictions
})
submission_df.to_csv('F:/MLpractrice/ts/data/submission.csv', index=False)


Validation Accuracy: 0.7981598619896493
