# Importing the libraries

In [None]:
# Essentials
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Miscs
from sklearn.impute import SimpleImputer
import missingno as msno
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

# Warning
import warnings
warnings.filterwarnings('ignore')

# Importing the dataset

In [None]:
# Load the training data from the CSV file under ./input (path is relative to the notebook's working directory).
train_df = pd.read_csv("./input/train.csv")

In [None]:
# Preview the first ten rows of the freshly loaded frame.
train_df.head(10)

Data Preparation

In [None]:
# Remove the 'Id' column before analysis.
# errors='ignore' keeps the cell idempotent: re-running it after 'Id' is already
# gone is a no-op instead of raising KeyError.
train_df = train_df.drop(columns='Id', errors='ignore')

In [None]:
# Show the first three rows of the frame as it stands at this point.
train_df.head(3)

# Exploratory Data Analysis

Data Shape

In [None]:
# (number of rows, number of columns) of the working frame.
train_df.shape

Checking for Duplicates

In [None]:
# Count fully duplicated rows first so the clean-up is visible in the cell output
# rather than happening silently.
duplicate_row_count = int(train_df.duplicated().sum())
print(f"Duplicate rows removed: {duplicate_row_count}")

# Reassign instead of mutating in place; the first occurrence of each row is kept
# and the original index labels are preserved.
train_df = train_df.drop_duplicates()

Checking data types

In [None]:
# How many columns there are of each dtype.
train_df.dtypes.value_counts()

Data info

In [None]:
# Print the frame summary (column names, non-null counts, dtypes, memory usage).
train_df.info()

Numerical Features

In [None]:
# Names of the columns stored as float64 or int64; these are treated as the
# numerical features in the rest of the notebook.
numerical_features = (
    train_df
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .tolist()
)
print(numerical_features)

Categorical Features

In [None]:
# Every column that is NOT float64/int64 is treated as categorical; this list is
# reused later when the categorical columns are one-hot encoded.
categorical_features = (
    train_df
    .select_dtypes(exclude=['float64', 'int64'])
    .columns
    .tolist()
)
print(categorical_features)

Checking for missing values

In [None]:
# Number of missing values in each column.
train_df.isnull().sum()

Statistics of our dataset

In [None]:
# Summary statistics for the frame (pandas' default describe() output).
train_df.describe()

Correlation Matrix

In [None]:
# Correlation is only defined for numeric data, so restrict the frame to its
# float/int columns first.
numeric_df = train_df.select_dtypes(include=[float, int])

# Pairwise correlation between every pair of numeric columns (pandas default method).
correlation_matrix = numeric_df.corr()
correlation_matrix

# Data Cleaning

Data Redundancy

columns

In [None]:
# Column labels that repeat an earlier label in the frame.
duplicated_columns = train_df.columns[train_df.columns.duplicated()]

# Report the outcome either way so the check leaves a visible trace in the output.
if duplicated_columns.empty:
    print("No duplicated columns found.")
else:
    print("Duplicated columns:", duplicated_columns)

rows

In [None]:
# Rows that exactly repeat an earlier row. Duplicates are already dropped earlier in
# the notebook, so this acts as a sanity check at this stage.
duplicated_rows = train_df.loc[train_df.duplicated()]

if duplicated_rows.empty:
    print("No duplicated rows found.")
else:
    print("Duplicated rows:")
    print(duplicated_rows)

Missing Values

In [None]:
# Draw missingno's matrix plot for the frame to visualise where values are missing.
msno.matrix(train_df)

Mean for missing numerical features

In [None]:
# Numerical columns with gaps; each gap is filled with that column's mean.
null_numerical_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# NOTE(review): the means are learned from the whole frame, i.e. before the
# train/test split performed later in the notebook, so rows that end up in the
# test set contribute to the fill values.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_df[null_numerical_columns] = imputer.fit_transform(train_df[null_numerical_columns])

Most frequent for missing categorical features

In [None]:
# Categorical columns with gaps; each gap is filled with that column's most
# frequent value.
# NOTE(review): for columns such as 'Alley', 'PoolQC' or 'Fence' a missing value
# may mean "feature not present" rather than "unknown" -- TODO confirm against the
# data description; if so, the mode is a misleading fill.
null_catagorical_columns = [
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
train_df[null_catagorical_columns] = imputer.fit_transform(train_df[null_catagorical_columns])

In [None]:
# Total number of missing cells left in the whole frame after imputation.
train_df.isnull().sum().sum()

One Hot Encoder

In [None]:
# One-hot encode the categorical columns. drop_first=True removes the first level
# of each feature, leaving k-1 indicator columns for a feature with k levels.
dummies = pd.get_dummies(train_df[categorical_features], drop_first=True)

# Replace the raw categorical columns with their encoded counterparts: the
# remaining columns keep their order and the indicator columns are appended.
train_df = pd.concat(
    [train_df.drop(columns=categorical_features), dummies],
    axis=1,
)

# Preview the encoded frame.
train_df.head()

# Plotting relationship between variables

SalePrice's Density

In [None]:
# Histogram of SalePrice on a density scale with a KDE curve on top.
# sns.distplot is deprecated in seaborn (the notebook's blanket warnings filter
# hides the notice); histplot with kde=True and stat='density' is the replacement.
ax = sns.histplot(train_df['SalePrice'], kde=True, stat='density')
ax.set_title("SalePrice's Density");

SalePrice vs OverallQual

In [None]:
# Aggregate first: passing the raw rows to plt.bar draws one bar per house, all
# stacked on the same handful of x positions, so only the tallest bar (the maximum
# price) per quality level would be visible. One mean bar per level is drawn instead.
mean_price_by_quality = train_df.groupby('OverallQual')['SalePrice'].mean()

plt.figure(figsize=(8, 6))
plt.bar(mean_price_by_quality.index, mean_price_by_quality.values, color='skyblue')
plt.xlabel('OverallQual', fontsize=12)
plt.ylabel('Mean SalePrice', fontsize=12)
plt.title('SalePrice vs OverallQual', fontsize=16)
# One tick per quality level that actually occurs in the data.
plt.xticks(mean_price_by_quality.index)

plt.show()

SalePrice vs YearBuilt

In [None]:
# Line plot of SalePrice against YearBuilt on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))

sns.lineplot(
    data=train_df,
    x="YearBuilt",
    y="SalePrice",
    label='SalePrice',
    marker='o',
    color='grey',
    ax=ax,
)

ax.set_title("SalePrice vs YearBuilt", fontsize=16)
ax.set_xlabel("YearBuilt", fontsize=12)
ax.set_ylabel("SalePrice", fontsize=12)

# Place a tick every 10 years, from the earliest to the latest build year.
first_year = min(train_df['YearBuilt'])
last_year = max(train_df['YearBuilt'])
ax.set_xticks(range(first_year, last_year + 1, 10))

ax.legend()
fig.tight_layout()
plt.show()

SalePrice vs GarageCars

In [None]:
# Switch seaborn to the whitegrid style; this is a global setting and also
# affects every plot drawn after this cell.
sns.set(style="whitegrid")

# Bar plot of SalePrice for each GarageCars value (seaborn aggregates the rows
# that share an x value into a single bar).
ax = sns.barplot(data=train_df, x="GarageCars", y="SalePrice", color='red')
ax.set_title("SalePrice vs GarageCars", fontsize=16)
ax.set_xlabel("GarageCars", fontsize=12)
ax.set_ylabel("SalePrice", fontsize=12)
ax.figure.tight_layout()
plt.show()

# Machine Learning

Pre-processing Steps for Machine Learning


In [None]:
# Target: the sale price. Features: every other column of the encoded frame.
target_column = 'SalePrice'
y = train_df[target_column]
X = train_df.drop(columns=[target_column])

Splitting the Data

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Feature Scaling

In [None]:
# Learn the scaling statistics from the training rows only, then apply that same
# transformation to both splits. Both become plain NumPy arrays afterwards.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

Model Selection

In [None]:
# Candidate regressors, keyed by display name.
# "Polynomial Regression" is an ordinary LinearRegression that is trained on
# polynomial features instead of the raw scaled matrix (see below).
# The tree-based models are seeded (same seed as the train/test split) so the
# cross-validation scores, and therefore the selected model, are reproducible.
models = {
    "Multiple Linear Regression": LinearRegression(),
    "Polynomial Regression": LinearRegression(),
    "Support Vector Regression": SVR(),
    "Decision Tree Regression": DecisionTreeRegressor(random_state=42),
    "Random Forest Regression": RandomForestRegressor(random_state=42)
}

# Degree of the polynomial expansion used by "Polynomial Regression".
poly_degree = 2  # Change the degree as needed

# Build the polynomial expansion of the training matrix once; it is reused for
# cross-validation and, if that model wins, for the final fit.
poly_features = PolynomialFeatures(degree=poly_degree)
X_train_poly = poly_features.fit_transform(X_train)

# Mean 5-fold cross-validated MSE per model (lower is better).
cv_scores = {}

for model_name, model in models.items():
    cv_features = X_train_poly if model_name == "Polynomial Regression" else X_train
    scores = cross_val_score(model, cv_features, y_train, scoring="neg_mean_squared_error", cv=5)

    # scikit-learn reports negated MSE so that "greater is better"; flip the sign back.
    mse_scores = -scores
    cv_scores[model_name] = mse_scores.mean()

# Pick the model with the lowest mean cross-validated MSE.
best_model_name = min(cv_scores, key=cv_scores.get)
best_model = models[best_model_name]

# Refit the winner on the full training split and predict the held-out test split,
# routing through the polynomial expansion when that is the winning model.
if best_model_name == "Polynomial Regression":
    best_model.fit(X_train_poly, y_train)
    X_test_poly = poly_features.transform(X_test)
    y_pred = best_model.predict(X_test_poly)
else:
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

# Mean squared error of the selected model on the test split.
mse = mean_squared_error(y_test, y_pred)

print(f"Best Model: {best_model_name}")
print(f"Mean Squared Error on Test Data: {mse}")

Model Comparison (Cross-Validation MSE vs. Test MSE)

In [None]:
# One entry per candidate model: its mean cross-validated MSE and its MSE on the
# held-out test split.
results = {
    "Model": [],
    "Cross-Validation MSE": [],
    "Test MSE": []
}

# The polynomial expansion does not depend on the loop, so build it once.
poly_features = PolynomialFeatures(degree=poly_degree)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

for model_name, model in models.items():
    if model_name == "Polynomial Regression":
        train_features, test_features = X_train_poly, X_test_poly
    else:
        train_features, test_features = X_train, X_test

    # Reuse the cross-validation mean computed during model selection instead of
    # repeating the 5-fold run: it is the same quantity, and reusing it guarantees
    # the table shows exactly the numbers the selection was based on.
    cv_mse_mean = cv_scores[model_name]

    # Fit on the full training split and score on the test split.
    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    test_mse = mean_squared_error(y_test, y_pred)

    results["Model"].append(model_name)
    results["Cross-Validation MSE"].append(cv_mse_mean)
    results["Test MSE"].append(test_mse)

# Tabulate the comparison; leaving the frame as the last expression lets the
# notebook render it as a table.
results_df = pd.DataFrame(results)
results_df

Training Random Forest Regression

In [None]:
# Random forest with 10 trees and a fixed seed, trained on the scaled training split.
forest_params = {"n_estimators": 10, "random_state": 0}
regressor = RandomForestRegressor(**forest_params)
regressor.fit(X_train, y_train)

Predicting the Test set results

In [None]:
# Predict the target for the held-out test split with the fitted random forest.
y_pred = regressor.predict(X_test)

Visualising the Random Forest Regression

In [None]:
# Actual vs. predicted values on the test split; points near the diagonal
# correspond to accurate predictions.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_test, y_pred, alpha=0.5, color='green')
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Random Forest Regression")
ax.grid(True)
plt.show()

Evaluating the model performance

In [None]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_test, y_pred)