In [None]:
import nbtest
import json
import numpy as np
# Draws a fresh integer in [0, 10000) on every kernel start (no fixed seed is set first).
# NOTE(review): `random_seed` is not referenced anywhere else in the visible notebook —
# presumably it is consumed by the nbtest harness; confirm before relying on or removing it.
random_seed = np.random.randint(10000)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import missingno as msno
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
import nbtest
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw training table from the notebook-relative input folder.
train_df = pd.read_csv(filepath_or_buffer="./input/train.csv")

In [None]:
# Peek at the first ten raw rows (the `Id` column is still present at this point).
train_df.head(n=10)

In [None]:
# Remove the `Id` column before analysis.
# `errors='ignore'` keeps this cell idempotent: re-running it after `Id` is already
# gone used to raise KeyError, which breaks interactive re-execution.
train_df = train_df.drop(columns='Id', errors='ignore')

In [None]:
# Quick check of the frame after the column drop above.
train_df.head(n=3)

In [None]:
# (rows, columns) of the training frame at this stage.
train_df.shape

In [None]:
# Keep only the first occurrence of each fully identical row.
train_df = train_df.drop_duplicates()

In [None]:
# How many columns there are of each dtype.
column_dtypes = train_df.dtypes
column_dtypes.value_counts()

In [None]:
# Per-column dtype and non-null count summary printed by pandas.
train_df.info()

In [None]:
# Names of the columns stored as float64 / int64.
numeric_view = train_df.select_dtypes(include=['float64', 'int64'])
numerical_features = numeric_view.columns.tolist()
print(numerical_features)

In [None]:
# Names of every column that is NOT float64 / int64; these are one-hot encoded later.
non_numeric_view = train_df.select_dtypes(exclude=['float64', 'int64'])
categorical_features = non_numeric_view.columns.tolist()
print(categorical_features)

In [None]:
# Missing-value count per column.
train_df.isna().sum()

In [None]:
# Summary statistics as produced by pandas' default `describe()`.
train_df.describe()

In [None]:
# Pairwise Pearson correlations between the float/int columns.
numeric_df = train_df.select_dtypes(include=[float, int])
numeric_df.corr(method='pearson')

In [None]:
# Report any column label that appears more than once in the frame.
is_repeated_label = train_df.columns.duplicated()
duplicated_columns = train_df.columns[is_repeated_label]
if duplicated_columns.empty:
    print("No duplicated columns found.")
else:
    print("Duplicated columns:", duplicated_columns)

In [None]:
# Report rows that are exact repeats of an earlier row.
repeat_mask = train_df.duplicated()
duplicated_rows = train_df.loc[repeat_mask]
if duplicated_rows.empty:
    print("No duplicated rows found.")
else:
    print("Duplicated rows:")
    print(duplicated_rows)

In [None]:
# Visualise where values are missing across the frame using missingno.
msno.matrix(train_df)

In [None]:
# Fill NaNs in the listed numeric columns with each column's mean.
# NOTE(review): the imputer is fit on the full frame, before the train/test split below.
null_numerical_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
train_df[null_numerical_columns] = imputer.fit_transform(train_df[null_numerical_columns])

In [None]:
# Fill NaNs in the listed categorical columns with each column's most frequent value.
# NOTE(review): the imputer is fit on the full frame, before the train/test split below.
null_catagorical_columns = [
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
train_df[null_catagorical_columns] = imputer.fit_transform(train_df[null_catagorical_columns])

In [None]:
# Total number of missing cells left after both imputation passes.
train_df.isna().sum().sum()

In [None]:
# One-hot encode the categorical columns (first level of each dropped) and
# swap them in for the original string columns, appended on the right.
dummies = pd.get_dummies(train_df[categorical_features], drop_first=True)
non_categorical_part = train_df.drop(columns=categorical_features)
train_df = pd.concat([non_categorical_part, dummies], axis=1)
train_df.head()

In [None]:
# Distribution of the target: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal from seaborn; this is the
# documented `histplot` replacement for the same kind of figure.
sns.histplot(train_df['SalePrice'], kde=True, stat='density');

In [None]:
# Mean sale price for each OverallQual level.
# Passing the raw rows to `plt.bar` drew one bar per house on top of each other, so
# the visible height was only the maximum price per level. Aggregating first gives one
# bar per level, in line with the mean-based seaborn barplot used for GarageCars.
mean_price_by_quality = train_df.groupby('OverallQual')['SalePrice'].mean()

plt.figure(figsize=(8, 6))
plt.bar(mean_price_by_quality.index, mean_price_by_quality.values, color='skyblue')
plt.xlabel('OverallQual', fontsize=12)
plt.ylabel('Mean SalePrice', fontsize=12)
plt.title('Mean SalePrice vs OverallQual', fontsize=16)
plt.show()

In [None]:
# Sale price against construction year, with an x tick every 10 years.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=train_df,
    x="YearBuilt",
    y="SalePrice",
    marker='o',
    color='grey',
    label='SalePrice',
    ax=ax,
)
ax.set_title("SalePrice vs YearBuilt", fontsize=16)
ax.set_xlabel("YearBuilt", fontsize=12)
ax.set_ylabel("SalePrice", fontsize=12)

# Tick range spans the earliest to the latest build year, inclusive.
first_year = train_df['YearBuilt'].min()
last_year = train_df['YearBuilt'].max()
ax.set_xticks(range(first_year, last_year + 1, 10))

ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of SalePrice for each GarageCars value (seaborn aggregates per category).
sns.set(style="whitegrid")
ax = sns.barplot(data=train_df, x="GarageCars", y="SalePrice", color='red')
ax.set_title("SalePrice vs GarageCars", fontsize=16)
ax.set_xlabel("GarageCars", fontsize=12)
ax.set_ylabel("SalePrice", fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Target is SalePrice; every remaining column is a predictor.
y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice'])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Leakage check between the two partitions. This call previously received X_test
# twice, i.e. it compared the test set with itself instead of with the training set.
# NOTE(review): nbtest's signature is not visible here — assumes (train, test); confirm.
nbtest.assert_df_leakage(X_train, X_test)

In [None]:
# Learn the scaling statistics from the training rows only, then apply them to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Compare candidate regressors by 5-fold cross-validated MSE on the training split,
# refit the winner on the full training split and report its MSE on the test split.

# Fixed seed for the tree-based estimators: without it the CV scores, and therefore
# the model that gets selected, change from one run to the next.
model_seed = 42
models = {
    "Multiple Linear Regression": LinearRegression(),
    "Polynomial Regression": LinearRegression(),
    "Support Vector Regression": SVR(),
    "Decision Tree Regression": DecisionTreeRegressor(random_state=model_seed),
    "Random Forest Regression": RandomForestRegressor(random_state=model_seed)
}
poly_degree = 2  # Change the degree as needed

# "Polynomial Regression" is a plain LinearRegression fed polynomially expanded
# inputs. The expansion is computed once here and reused for both CV and the final
# fit instead of being rebuilt in every branch.
poly_features = PolynomialFeatures(degree=poly_degree)
X_train_poly = poly_features.fit_transform(X_train)

cv_scores = {}
for model_name, model in models.items():
    cv_inputs = X_train_poly if model_name == "Polynomial Regression" else X_train
    scores = cross_val_score(model, cv_inputs, y_train, scoring="neg_mean_squared_error", cv=5)
    mse_scores = -scores  # Convert negative MSE scores to positive
    cv_scores[model_name] = mse_scores.mean()

# Lowest mean CV MSE wins.
best_model_name = min(cv_scores, key=cv_scores.get)
best_model = models[best_model_name]

if best_model_name == "Polynomial Regression":
    best_model.fit(X_train_poly, y_train)
    # The test split must go through the same fitted expansion as the training split.
    X_test_poly = poly_features.transform(X_test)
    y_pred = best_model.predict(X_test_poly)
else:
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Best Model: {best_model_name}")
print(f"Mean Squared Error on Test Data: {mse}")

In [None]:
# Build a comparison table: mean 5-fold CV MSE and held-out test MSE for every model.

# The polynomial expansion does not depend on the loop variable, so it is computed
# once up front; it used to be rebuilt and refit twice per loop pass.
poly_features = PolynomialFeatures(degree=poly_degree)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

results = {
    "Model": [],
    "Cross-Validation MSE": [],
    "Test MSE": []
}
for model_name, model in models.items():
    # Only the "Polynomial Regression" entry consumes the expanded feature matrices.
    if model_name == "Polynomial Regression":
        train_inputs, test_inputs = X_train_poly, X_test_poly
    else:
        train_inputs, test_inputs = X_train, X_test

    scores = cross_val_score(model, train_inputs, y_train, scoring="neg_mean_squared_error", cv=5)
    mse_scores = -scores  # Convert negative MSE scores to positive
    cv_mse_mean = mse_scores.mean()

    # Refit on the whole training split before scoring on the test split.
    model.fit(train_inputs, y_train)
    y_pred = model.predict(test_inputs)
    test_mse = mean_squared_error(y_test, y_pred)

    results["Model"].append(model_name)
    results["Cross-Validation MSE"].append(cv_mse_mean)
    results["Test MSE"].append(test_mse)
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# Final model: a 10-tree random forest with a fixed seed, trained on the scaled training split.
regressor = RandomForestRegressor(random_state=0, n_estimators=10)
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predictions of the fitted forest for the held-out rows.
y_pred = regressor.predict(X=X_test)

In [None]:
# Coefficient of determination of the forest's predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Actual vs. predicted sale prices for the random forest on the test split.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_test, y_pred, alpha=0.5, color='green')
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Random Forest Regression")
ax.grid(True)
plt.show()