In [2]:
import pandas as pd
import numpy as np

In [31]:
# Read the training file, discard its first column (selected by position,
# not by name) and index the rows by listing id -- all in one chain so that
# re-running the cell never operates on an already-mutated frame.
train_df = (
    pd.read_csv('train.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    .set_index('id')
)
test_df = pd.read_csv('test.csv')

# Preview the first rows.
train_df.head()

Unnamed: 0_level_0,name,neighborhood_overview,host_id,host_name,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month,monthly_revenue
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19792418,Home in Vancouver · ★4.75 · 1 bedroom · 1 bed ...,Everything you need is nearby. <br /><br />Hig...,57488206,Jessi,,,,f,3,3,...,4.8,4.82,4.9,4.87,4.69,4.81,f,3,0.77,2108
1015650685503221866,Guest suite in Vancouver · ★New · 2 bedrooms ·...,,139792573,Daniel,within a few hours,100%,100%,f,1,4,...,,,,,,,f,1,,2730
35265562,Guest suite in Vancouver · ★4.85 · 2 bedrooms ...,Beautiful neighbourhood close to prosperous Ma...,265504225,Alex,within an hour,100%,98%,t,1,1,...,4.9,4.78,4.97,4.94,4.9,4.75,f,1,3.22,2254
911948980885194155,Home in Vancouver · ★5.0 · 1 bedroom · 1 bed ·...,We are located in a quiet residential neighbor...,22595056,Raymond,,,92%,t,1,1,...,5.0,5.0,5.0,5.0,4.86,5.0,f,1,1.28,3187
46069251,Guest suite in Vancouver · ★4.93 · 1 bedroom ·...,Kitsilano at it's best! Short walk to all the ...,65683877,Yendi,within an hour,100%,95%,t,2,3,...,4.93,4.89,4.97,4.97,4.96,4.85,f,1,2.01,3479


In [32]:
# Print column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5352 entries, 19792418 to 19298482
Data columns (total 43 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   name                            5352 non-null   object 
 1   neighborhood_overview           3476 non-null   object 
 2   host_id                         5352 non-null   int64  
 3   host_name                       5352 non-null   object 
 4   host_response_time              4263 non-null   object 
 5   host_response_rate              4263 non-null   object 
 6   host_acceptance_rate            4620 non-null   object 
 7   host_is_superhost               5315 non-null   object 
 8   host_listings_count             5352 non-null   int64  
 9   host_total_listings_count       5352 non-null   int64  
 10  neighbourhood                   3476 non-null   object 
 11  neighbourhood_cleansed          5352 non-null   object 
 12  latitude                    

In [None]:
# Per-column count of missing entries in the training frame.
missing_values = train_df.isna().sum()

# Show only the columns that actually have gaps, largest gap first.
columns_with_gaps = missing_values.loc[missing_values.gt(0)]
print(columns_with_gaps.sort_values(ascending=False))

bathrooms                      5352
bedrooms                       5352
neighborhood_overview          1876
neighbourhood                  1876
host_response_rate             1089
host_response_time             1089
reviews_per_month               906
review_scores_checkin           901
review_scores_location          901
review_scores_value             901
review_scores_rating            900
review_scores_accuracy          900
review_scores_cleanliness       900
review_scores_communication     900
host_acceptance_rate            732
price                           665
host_is_superhost                37
beds                             36
dtype: int64


In [36]:
# (rows, columns) of the training frame.
train_df.shape

(5352, 43)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Load and prepare the data
def load_data(train_path, test_path=None):
    """Read the training CSV and, when a path is given, the test CSV.

    Args:
        train_path: Location of the training CSV file.
        test_path: Location of the test CSV file; any falsy value
            (``None``, ``''``) skips loading it.

    Returns:
        A ``(train_df, test_df)`` pair where ``test_df`` is ``None`` when
        no test path was supplied.
    """
    training_frame = pd.read_csv(train_path)
    if not test_path:
        return training_frame, None
    return training_frame, pd.read_csv(test_path)

def basic_eda(df):
    """Print a quick textual summary of ``df`` and draw two overview plots.

    Prints the shape, the columns that contain missing values, and
    ``describe()`` statistics for the int64/float64 columns. Then shows a
    histogram of the ``monthly_revenue`` column and an annotated correlation
    heatmap of the numeric columns.

    Args:
        df: DataFrame to summarise; must contain a ``monthly_revenue`` column.

    Returns:
        None. Output goes to stdout and to matplotlib figures.
    """
    print("\nBasic Dataset Info:")
    print(f"Shape: {df.shape}")

    # Count nulls once and reuse the result for the filter.
    null_counts = df.isnull().sum()
    print("\nMissing Values:")
    print(null_counts[null_counts > 0])

    # Restrict the statistics and the correlation matrix to numeric columns.
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    numeric_frame = df[numeric_cols]
    print("\nNumeric Columns Statistics:")
    print(numeric_frame.describe())

    # Target distribution.
    plt.figure(figsize=(12, 6))
    sns.histplot(data=df, x='monthly_revenue', bins=50)
    plt.title('Distribution of Monthly Revenue')
    plt.show()

    # Pairwise correlations between the numeric columns.
    correlations = numeric_frame.corr()
    plt.figure(figsize=(15, 10))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.show()

def _count_amenities(raw):
    """Return how many amenities a single ``amenities`` cell lists.

    Accepts an already-parsed container, a JSON array string such as
    ``'["Wifi", "Kitchen"]'``, or (as a fallback) any comma-separated string
    optionally wrapped in brackets/braces. Missing or non-string cells count
    as zero amenities.
    """
    import json  # local import: only this helper needs it

    if isinstance(raw, (list, tuple, set)):
        return len(raw)
    if not isinstance(raw, str):
        return 0  # NaN / None
    try:
        parsed = json.loads(raw)
    except ValueError:
        parsed = None
    if isinstance(parsed, list):
        return len(parsed)
    # Not a JSON array: count the non-empty comma-separated items instead.
    return sum(1 for item in raw.strip('[]{}').split(',') if item.strip())


def feature_engineering(df, is_training=True):
    """Clean the raw listings frame and derive the modelling features.

    The input frame is not modified; all work happens on a copy.

    Steps performed:
      * fill missing text / rate columns and convert ``'NN%'`` strings to
        fractions in [0, 1];
      * map the ``'t'``/``'f'`` flags to 1/0 (missing flags become 0);
      * count the amenities listed in each ``amenities`` cell;
      * strip ``$`` and ``,`` from ``price`` and convert it to float;
      * impute remaining numeric gaps (``price``/``beds`` with this frame's
        median, ``reviews_per_month`` with 0, ``review_scores_*`` with this
        frame's column mean) so that downstream estimators that reject NaN
        can be fitted;
      * add ``avg_review_score``, ``<col>_encoded`` label encodings and
        ``distance_to_center`` (Euclidean distance, in degrees, from this
        frame's mean latitude/longitude).

    Args:
        df: Raw listings DataFrame.
        is_training: When True the fitted label encoders are returned too.

    Returns:
        ``(df_processed, label_encoders)`` when ``is_training`` is True,
        otherwise just ``df_processed``. ``label_encoders`` maps each
        categorical column name to its fitted ``LabelEncoder``.

    Note:
        Every statistic (means, medians, encoder classes, centre point) is
        computed from the frame passed in, so a separately processed test
        frame is NOT guaranteed to share the training frame's encodings.
    """
    # Work on a copy so the caller's frame stays untouched.
    df_processed = df.copy()

    # Handle missing values in text / rate columns.
    df_processed['neighborhood_overview'] = df_processed['neighborhood_overview'].fillna('')
    df_processed['host_response_time'] = df_processed['host_response_time'].fillna('not_specified')
    df_processed['host_response_rate'] = df_processed['host_response_rate'].fillna('0%')
    df_processed['host_acceptance_rate'] = df_processed['host_acceptance_rate'].fillna('0%')

    # Convert percentage strings ('98%') to fractions (0.98).
    for rate_col in ('host_response_rate', 'host_acceptance_rate'):
        df_processed[rate_col] = df_processed[rate_col].str.rstrip('%').astype(float) / 100

    # Convert 't'/'f' flags to 1/0; a missing flag is treated as 0 because
    # .map() would otherwise leave NaN behind.
    for flag_col in ('host_is_superhost', 'host_identity_verified'):
        df_processed[flag_col] = (
            df_processed[flag_col].map({'t': 1, 'f': 0}).fillna(0).astype(int)
        )

    # Number of amenities per listing (not the character length of the cell).
    df_processed['amenities_count'] = df_processed['amenities'].apply(_count_amenities)

    # Price processing. regex=False is required: as a regular expression '$'
    # is the end-of-string anchor and would leave the currency sign in place
    # on pandas versions where str.replace defaults to regex=True.
    df_processed['price'] = (
        df_processed['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

    # Impute the remaining numeric gaps that feed the model.
    for median_col in ('price', 'beds'):
        df_processed[median_col] = df_processed[median_col].fillna(
            df_processed[median_col].median()
        )
    # NOTE(review): assumes a missing reviews_per_month means "no reviews yet" -- confirm.
    df_processed['reviews_per_month'] = df_processed['reviews_per_month'].fillna(0)

    # Fill review-score gaps with the column mean, then average across them.
    review_cols = [col for col in df_processed.columns if col.startswith('review_scores_')]
    df_processed[review_cols] = df_processed[review_cols].fillna(df_processed[review_cols].mean())
    df_processed['avg_review_score'] = df_processed[review_cols].mean(axis=1)

    # Integer-encode the categorical columns.
    categorical_cols = ['room_type', 'property_type', 'neighbourhood_cleansed']
    label_encoders = {}

    for col in categorical_cols:
        le = LabelEncoder()
        df_processed[f'{col}_encoded'] = le.fit_transform(df_processed[col])
        if is_training:
            label_encoders[col] = le

    # Location feature: distance from the centroid of the listings in this frame.
    df_processed['distance_to_center'] = np.sqrt(
        (df_processed['latitude'] - df_processed['latitude'].mean())**2 +
        (df_processed['longitude'] - df_processed['longitude'].mean())**2
    )

    # Explicit branches: the one-line conditional form binds as
    # `(df, (encoders if is_training else df))` and always yields a tuple.
    if is_training:
        return df_processed, label_encoders
    return df_processed

def prepare_features(df):
    """Return the model input columns of an engineered frame, in a fixed order.

    Args:
        df: DataFrame that already contains every engineered column named
            below.

    Returns:
        A new DataFrame holding only the selected feature columns.

    Raises:
        KeyError: If any of the expected columns is absent from ``df``.
    """
    selected = (
        # Host attributes
        'host_response_rate',
        'host_acceptance_rate',
        'host_is_superhost',
        'host_listings_count',
        'host_identity_verified',
        # Listing attributes
        'accommodates',
        'beds',
        'price',
        'minimum_nights',
        'maximum_nights',
        'availability_365',
        # Review activity
        'number_of_reviews',
        'reviews_per_month',
        'avg_review_score',
        # Derived features
        'amenities_count',
        'distance_to_center',
        'room_type_encoded',
        'property_type_encoded',
        'neighbourhood_cleansed_encoded',
    )
    return df[list(selected)]

def train_model(X, y):
    """Fit two tree ensembles on a hold-out split and return the better one.

    A random forest and a gradient-boosting regressor (100 estimators each,
    ``random_state=42``) are trained on 80% of the data and scored on the
    remaining 20%. RMSE and R2 are printed for each; the model with the
    highest validation R2 is kept and its ten largest feature importances
    are plotted.

    Args:
        X: Feature DataFrame (its column names label the importance plot).
        y: Target values aligned with ``X``.

    Returns:
        The fitted estimator with the best validation R2. It has been
        trained on the 80% training split only.
    """
    # Hold out 20% of the rows for validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Candidate estimators, evaluated in this order.
    candidates = [
        ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
        ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ]

    best_model, best_score = None, float('-inf')

    for label, estimator in candidates:
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_val)

        rmse = np.sqrt(mean_squared_error(y_val, predicted))
        r2 = r2_score(y_val, predicted)

        print(f"\n{label} Results:")
        print(f"RMSE: {rmse:.2f}")
        print(f"R2 Score: {r2:.4f}")

        # Strict comparison: on a tie the earlier candidate is kept.
        if r2 > best_score:
            best_model, best_score = estimator, r2

    # Rank the winning model's features by importance.
    importance_table = pd.DataFrame({
        'feature': X_train.columns,
        'importance': best_model.feature_importances_
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
    plt.title('Top 10 Most Important Features')
    plt.show()

    return best_model

def make_predictions(model, X_test):
    """Return ``model``'s predictions for the rows of ``X_test``.

    Args:
        model: Any object exposing a ``predict(X)`` method.
        X_test: Feature matrix to score.

    Returns:
        Whatever ``model.predict`` returns for ``X_test``.
    """
    predictions = model.predict(X_test)
    return predictions

# Main execution flow
def main():
    """Run the whole pipeline: load, explore, engineer, train, predict.

    Reads ``train.csv`` and ``test.csv`` from the working directory, prints
    and plots the exploratory summary, trains the models and, when test data
    is available, writes ``submission.csv`` with one ``monthly_revenue``
    prediction per test ``id``.
    """
    # Load data
    train_df, test_df = load_data('train.csv', 'test.csv')

    # Perform EDA
    basic_eda(train_df)

    # Feature engineering
    train_processed, label_encoders = feature_engineering(train_df, is_training=True)

    test_processed = None
    if test_df is not None:
        test_result = feature_engineering(test_df, is_training=False)
        # Accept either a bare frame or a (frame, ...) tuple so the test
        # features are always a DataFrame before column selection.
        test_processed = test_result[0] if isinstance(test_result, tuple) else test_result

        # Re-encode the categoricals with the encoders fitted on the
        # training data. Codes produced by fitting on the test set alone
        # would not line up with what the model was trained on. Labels
        # never seen in training get the sentinel code -1.
        for col, encoder in label_encoders.items():
            code_of = {label: code for code, label in enumerate(encoder.classes_)}
            test_processed[f'{col}_encoded'] = (
                test_processed[col].map(code_of).fillna(-1).astype(int)
            )

    # Prepare features
    X = prepare_features(train_processed)
    y = train_processed['monthly_revenue']

    # Train model
    best_model = train_model(X, y)

    # Make predictions if test data is available
    if test_processed is not None:
        X_test = prepare_features(test_processed)
        predictions = make_predictions(best_model, X_test)

        # Create submission file
        submission = pd.DataFrame({
            'id': test_df['id'],
            'monthly_revenue': predictions
        })
        submission.to_csv('submission.csv', index=False)

# Run the pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'sklearn'