In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Load the training data
train_data = pd.read_csv('/content/train.csv')

# Load the testing data (without SalePrice column)
test_data = pd.read_csv('/content/test.csv')

# Check for missing values in training and testing data
print("Training Data Missing Values:")
print(train_data.isnull().sum())
print("\nTesting Data Missing Values:")
print(test_data.isnull().sum())

# Drop rows with missing values in both training and testing data
train_data_cleaned = train_data.dropna()
test_data_cleaned = test_data.dropna()

# Check if there are any rows left after dropping missing values
print(f"Number of rows in cleaned training data: {train_data_cleaned.shape[0]}")
print(f"Number of rows in cleaned testing data: {test_data_cleaned.shape[0]}")

# If no rows are left, switch to imputation strategy
if train_data_cleaned.shape[0] == 0 or test_data_cleaned.shape[0] == 0:
    from sklearn.impute import SimpleImputer
    print("Too many missing values, switching to imputation.")

    # Define features and target variable for training data
    X_train = train_data[['FullBath', 'HalfBath', 'BedroomAbvGr', 'TotalBsmtSF', '2ndFlrSF', '1stFlrSF']]
    y_train = train_data['SalePrice']

    # Define features for testing data
    X_test = test_data[['FullBath', 'HalfBath', 'BedroomAbvGr', 'TotalBsmtSF', '2ndFlrSF', '1stFlrSF']]

    # Impute missing values using the mean strategy
    imputer = SimpleImputer(strategy='mean')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Create the model
    model = LinearRegression()

    # Train the model
    model.fit(X_train_imputed, y_train)

    # Make predictions on the test set
    predicted_prices = model.predict(X_test_imputed)
else:
    # Proceed with the cleaned data
    print("Proceeding with dropped missing values.")

    # Define features and target variable for training data
    X_train = train_data_cleaned[['FullBath', 'HalfBath', 'BedroomAbvGr', 'TotalBsmtSF', '2ndFlrSF', '1stFlrSF']]
    y_train = train_data_cleaned['SalePrice']

    # Define features for testing data
    X_test = test_data_cleaned[['FullBath', 'HalfBath', 'BedroomAbvGr', 'TotalBsmtSF', '2ndFlrSF', '1stFlrSF']]

    # Create the model
    model = LinearRegression()

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    predicted_prices = model.predict(X_test)

# Print only the predicted prices
for price in predicted_prices:
    print(price)

Training Data Missing Values:
Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

Testing Data Missing Values:
Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64
Number of rows in cleaned training data: 0
Number of rows in cleaned testing data: 0
Too many missing values, switching to imputation.
117384.35455934342
178359.93628163365
201906.0047924385
199990.06874240402
199557.9960082103
192862.40737672927
166678.49705234767
181022.68579172165
196757.5191455189
116242.79780676952
211189.09158145552
117115.36636412416
108342.4478192856
184740.01921689784
109922.10348035746
296314.13106