In [83]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer




#### Data Cleaning

In [84]:
# Load the raw training and test tables from the working directory.
df = pd.read_csv(
    'train.csv',
    low_memory=False,
)
test_data = pd.read_csv(
    'test.csv',
    low_memory=False,
)

In [None]:
# Column names grouped by theme.
location_cols = [
    'neighbourhood_group_cleansed',
    'neighbourhood_cleansed',
    'state',
]
property_cols = [
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'amenities_list',
]
host_cols = [
    'host_is_superhost',
    'host_listings_count',
    'host_identity_verified',
]
booking_cols = [
    'minimum_nights',
    'maximum_nights',
    'extra_people',
    'number_of_reviews',
    'review_scores_rating',
    'cancellation_policy',
]

# Feature subset used by the model. The order of each list matters: later
# cells select columns with these lists, so it fixes the column order.
numeric_cols = [
    # property size
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    # review scores
    'review_scores_rating',
    'review_scores_location',
    # booking constraints
    'minimum_nights',
    'extra_people',
]

categorical_cols = [
    # location
    'neighbourhood_group_cleansed',
    'neighbourhood_cleansed',
    # property characteristics
    'property_type',
    'room_type',
    # booking
    'cancellation_policy',
    'instant_bookable',
    # host
    'host_is_superhost',
]

In [89]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import TargetEncoder
from category_encoders.count import CountEncoder  

# Encode categorical columns.
# Every encoder is fitted here on the training frame; the test-set cell later
# calls .transform() on these same objects.
one_hot_cols = ['room_type', 'neighbourhood_group_cleansed', 'cancellation_policy']
frequent_cols = ['neighbourhood_cleansed']
target_cols = ['property_type']
binary_cols = ['host_is_superhost', 'instant_bookable']

# All encoded frames are built on final_data's own index. Without this, frames
# created from raw arrays get a fresh 0..n-1 index and the axis=1 concat below
# misaligns rows (and adds NaN rows) whenever final_data's index is not a plain
# RangeIndex, e.g. after rows were dropped during cleaning.
row_index = final_data.index

# 1. One Hot Encoding (categories unseen at transform time become all-zero rows)
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
one_hot_encoded = onehot_encoder.fit_transform(final_data[one_hot_cols])
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=onehot_encoder.get_feature_names_out(one_hot_cols),
    index=row_index,
)

# 2. Frequency (Count) Encoding
count_encoder = CountEncoder()
frequent_encoded = count_encoder.fit_transform(final_data[frequent_cols])
frequent_df = pd.DataFrame(
    np.asarray(frequent_encoded),
    columns=frequent_cols,
    index=row_index,
)

# 3. Target Encoding
# NOTE(review): the encoder sees the target of every training row, and the
# same rows are later cross-validated, so target information can leak into
# the CV folds. Consider fitting it inside a Pipeline.
target_encoder = TargetEncoder()
target_encoded = target_encoder.fit_transform(
    final_data[target_cols],
    final_data['price'],  # regression target
)
target_df = pd.DataFrame(
    np.asarray(target_encoded),
    columns=target_cols,
    index=row_index,
)

# 4. Binary Encoding (LabelEncoder, one fitted encoder per column)
# binary_encoders keeps the encoder that belongs to each column. `le` is still
# left bound to the encoder of the last column, exactly as before, for any
# later cell that refers to it.
binary_encoders = {}
binary_df = pd.DataFrame(index=row_index)
for col in binary_cols:
    le = LabelEncoder()
    binary_df[col] = le.fit_transform(final_data[col])
    binary_encoders[col] = le

# Combine all encoded features
encoded_data = pd.concat(
    [one_hot_df, frequent_df, target_df, binary_df],
    axis=1,
)

# Swap the raw categorical columns for their encoded counterparts
final_data = final_data.drop(columns=categorical_cols)
final_data = pd.concat([final_data, encoded_data], axis=1)

In [90]:
def check_feature_importance(data, numeric_cols, categorical_cols, target_col='price', top_n=20):
    """
    Rank features by RandomForest importance and plot the strongest ones.

    Categorical columns are dummy-encoded for this analysis only; ``data``
    itself is not modified.

    Parameters:
    -----------
    data : pandas DataFrame
        Preprocessed data before final encoding. Must contain ``target_col``
        and every column listed in ``numeric_cols``.
    numeric_cols : list
        List of numeric column names
    categorical_cols : list
        List of categorical column names. Names that are not columns of
        ``data`` are skipped.
    target_col : str, default 'price'
        Column used as the regression target. It is never used as a feature,
        even if it also appears in one of the column lists.
    top_n : int, default 20
        Number of features shown in the bar chart.

    Returns:
    --------
    pandas DataFrame
        One row per feature with columns ``feature`` and ``importance``,
        sorted by importance in descending order (all features, not only the
        plotted ones).
    """
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Numeric features; the target must not be fed back in as a predictor
    numeric_features = [col for col in numeric_cols if col != target_col]
    X_numeric = data[numeric_features].copy()
    X_numeric.columns = X_numeric.columns.astype(str)

    # Simple dummy encoding, for this analysis only
    encoded_dfs = []
    for col in categorical_cols:
        if col == target_col or col not in data.columns:
            continue
        X_encoded = pd.get_dummies(data[col], prefix=col, prefix_sep='_')
        # Ensure all column names are strings
        X_encoded.columns = X_encoded.columns.astype(str)
        encoded_dfs.append(X_encoded)

    # Combine all features
    X_combined = pd.concat([X_numeric] + encoded_dfs, axis=1)
    y = data[target_col]

    # Train RandomForest (fixed seed so the ranking is reproducible)
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_combined, y)

    # Create importance DataFrame
    importance_df = pd.DataFrame({
        'feature': X_combined.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    # Plot the top_n features as a horizontal bar chart
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x='importance', y='feature', data=importance_df.head(top_n), ax=ax)
    ax.set_title(f'Top {top_n} Most Important Features')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()

    return importance_df

In [None]:
# Test-set preprocessing: same steps as for the training data, but every
# fitted object (mlb, imputers, encoders) learns from training data only and is
# merely applied to the test rows.

# 1. Initial Setup and Column Definition (already defined from training)

# 2. Process Test Data
test_cleaned = test_data.drop(columns=['host_acceptance_rate', 'square_feet'])

# 3. Process Amenities
test_cleaned['amenities_list'] = test_cleaned['amenities'].apply(parse_amenities)
# Use transform instead of fit_transform to maintain consistent encoding
amenities_encoded_test = mlb.transform(test_cleaned['amenities_list'])
amenities_df_test = pd.DataFrame(
    amenities_encoded_test,
    columns=mlb.classes_,
    index=test_cleaned.index,
)

# Strip the currency symbol and thousands separator, then parse as a number
# (anything unparseable becomes NaN and is imputed below)
test_cleaned['extra_people'] = pd.to_numeric(
    test_cleaned['extra_people']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',
)

# 4. Impute Numeric Columns (using training data statistics)
# The imputer is fitted on the training column and only applied to the test
# column, so the fill value comes from the training data, not the test set.
# NOTE(review): with one column at a time IterativeImputer has no other
# features to regress on, so this presumably reduces to a mean fill - confirm.
for col in numeric_cols:
    # Use training data's min and max values as bounds
    min_val = cleaned[col].min()
    max_val = cleaned[col].max()
    imputer = IterativeImputer(max_iter=10, random_state=0, min_value=min_val, max_value=max_val)
    imputer.fit(cleaned[[col]])
    test_cleaned[col] = imputer.transform(test_cleaned[[col]])[:, 0]

# 5. Impute Categorical Columns (using training data mode)
for col in categorical_cols:
    test_cleaned[col] = test_cleaned[col].fillna(cleaned[col].mode()[0])

# 6. Prepare Final Test Data
test_final_data = test_cleaned[['id'] + numeric_cols + categorical_cols]
test_final_data = pd.concat([test_final_data, amenities_df_test], axis=1)

# 7. Apply Encoders (using transform only)
# Encoded frames are built on the test frame's index so the axis=1 concat in
# step 9 lines rows up by label.
test_index = test_final_data.index

# One Hot Encoding
test_one_hot = onehot_encoder.transform(test_final_data[one_hot_cols])
test_one_hot_df = pd.DataFrame(
    test_one_hot,
    columns=onehot_encoder.get_feature_names_out(one_hot_cols),
    index=test_index,
)

# Frequency Encoding
test_frequent = count_encoder.transform(test_final_data[frequent_cols])
test_frequent_df = pd.DataFrame(
    np.asarray(test_frequent), columns=frequent_cols, index=test_index
)

# Target Encoding
test_target = target_encoder.transform(test_final_data[target_cols])
test_target_df = pd.DataFrame(
    np.asarray(test_target), columns=target_cols, index=test_index
)

# Binary Encoding
# NOTE(review): `le` is a single LabelEncoder - whichever one the training
# cell bound last - yet it is applied to every binary column here. That is
# only correct if all binary columns share the same set of classes; confirm,
# or keep one fitted encoder per column and look it up by name.
test_binary_df = pd.DataFrame(index=test_index)
for col in binary_cols:
    test_binary_df[col] = le.transform(test_final_data[col])

# 8. Store ID separately; it is an identifier, not a model feature
test_ids = test_final_data['id']
test_final_data = test_final_data.drop('id', axis=1)

# 9. Combine encoded features (without id)
test_encoded_data = pd.concat([
    test_one_hot_df,
    test_frequent_df,
    test_target_df,
    test_binary_df
], axis=1)

# 10. Create final test dataset (without id): raw categoricals out, encoded in
test_final_data = test_final_data.drop(columns=categorical_cols)
test_final_data = pd.concat([test_final_data, test_encoded_data], axis=1)




In [93]:
#XG Boost, Grid Search
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import pandas as pd
import numpy as np
# 1. Build the feature matrix and target
X = final_data.drop(['price', 'id'], axis=1)
y = final_data['price']

# Convert target variable
y = y.astype(float)

# Coerce every feature to float; values that cannot be parsed become NaN.
# The prediction cell applies the same kind of coercion to the test features,
# so training and test inputs are treated alike. Without it, any object column
# makes X.mean() and the XGBoost fit fail.
X = X.apply(pd.to_numeric, errors='coerce').astype(float)

# Fill NaN values (pre-existing or created by the coercion) with column means
X = X.fillna(X.mean())

# 2. Hyper-parameter grid: 2 x 2 x 2 = 8 candidates, the rest held fixed
params = {
    'max_depth': [6, 8],
    'learning_rate': [0.01, 0.05],
    'n_estimators': [200, 300],
    'min_child_weight': [3],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}

xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    tree_method='hist'
)

# 3. 5-fold CV over the grid, scored by (negated) RMSE, using all cores
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [94]:
# Get the best model from grid search
best_model = grid_search.best_estimator_

# Process test data the same way as training data
test_ids = test_data['id']
X_test = test_final_data.copy()

# Give the model exactly the training columns, in the training order. A
# missing column is reported by name here instead of surfacing later as a
# feature-name mismatch inside predict(); extra test-only columns are dropped.
missing_cols = [col for col in X.columns if col not in X_test.columns]
if missing_cols:
    raise ValueError(f"Test features are missing training columns: {missing_cols}")
X_test = X_test[X.columns]

# Apply same preprocessing to test data: everything numeric, bad values -> NaN
for col in X_test.columns:
    if str(X_test[col].dtypes) == 'object':
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce')
    else:
        X_test[col] = X_test[col].astype(float)

# Fill NaN values using training data mean
X_test = X_test.fillna(X.mean())

# Make predictions using the best model
predictions = best_model.predict(X_test)

# Create submission DataFrame
submissions = pd.DataFrame({
    'Id': test_ids,
    'Predicted': predictions.clip(0, None)  # Ensure no negative prices
})

# Save predictions
submissions.to_csv('submission5.csv', index=False)
