# IMPORTING PACKAGE

In [None]:
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import norm, skew
from scipy import stats 

from sklearn.preprocessing import LabelEncoder

from scipy.special import boxcox1p

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

# DATA OVERVIEWS

In [None]:
# Load the competition's training set, test set, and sample submission file.
train_df = pd.read_csv(
    filepath_or_buffer='../input/house-prices-advanced-regression-techniques/train.csv'
)
test_df = pd.read_csv(
    filepath_or_buffer='../input/house-prices-advanced-regression-techniques/test.csv'
)
sample_submission = pd.read_csv(
    filepath_or_buffer="../input/house-prices-advanced-regression-techniques/sample_submission.csv"
)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Preview the first rows of the sample submission file.
sample_submission.head()

In [None]:
# Print the column dtypes and non-null counts of the training data.
train_df.info()

In [None]:
# Keep the training column labels in a variable and display them.
column_names = train_df.columns
column_names

In [None]:
# Summary statistics of the numeric training columns.
train_df.describe()

In [None]:
# Summary statistics of the object-dtype (categorical) training columns.
train_df.describe(include=['O'])

In [None]:
# Stack the train rows followed by the test rows into a single frame with a
# fresh 0..n-1 index.
frames_to_stack = [train_df, test_df]
concatenated_df = pd.concat(frames_to_stack, ignore_index=True)
concatenated_df.head()

In [None]:
# Take the target from the training rows, then remove it from the combined
# feature frame.
y_train = train_df["SalePrice"]
print(y_train.head())
concatenated_df.drop(columns='SalePrice', inplace=True)

In [None]:
# Percentage of missing values per column; keep only columns that have any
# missing values, sorted worst-first and capped at 20 entries.
row_count = len(concatenated_df)
null_counts = concatenated_df.isnull().sum()
all_data_na = (null_counts / row_count) * 100
columns_with_gaps = all_data_na[all_data_na != 0]
all_data_na = columns_with_gaps.sort_values(ascending=False).head(20)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(10)

## Features to be dropped because they have too few data points

In [None]:
# Columns removed from the combined frame before any modelling.
features_to_drop = [
    "Id",
    'PoolQC',
    'MiscFeature',
    'Alley',
    'Fence',
    'MasVnrType',
    'FireplaceQu',
    'LotFrontage',
]
concatenated_df.drop(columns=features_to_drop, inplace=True)

# Preview the frame after the drop.
concatenated_df.head()

# "LotArea" , "SaleCondition"  , "OverallCond"

# Columns deleted because almost all of their data falls on a single value

In [None]:
# Box plot comparing SalePrice for gravel (Grvl) and paved (Pave) road access.
# DataFrame.boxplot(by=...) creates its own figure, so the size is passed to it
# directly; a separate plt.figure(figsize=...) call beforehand would only leave
# an empty figure behind.
train_df.boxplot(column='SalePrice', by='Street', grid=False, figsize=(6, 6))
plt.title('SalePrice by Street Type')
plt.xlabel('Street Type')
plt.ylabel('SalePrice')
# Relabel the two tick positions with readable names.
plt.xticks([1, 2], ['Gravel (Grvl)', 'Paved (Pave)'])
plt.show()

# Summary statistics for SalePrice grouped by Street type.
summary_stats = train_df.groupby('Street')['SalePrice'].describe()
print(summary_stats)

In [None]:
# Columns judged to be dominated by a single value.
deleted_col_due_to_single_value = [
    "Street", "LandContour", "Utilities", "LandSlope", "Condition2",
    "RoofMatl", "BsmtFinSF2", "Heating", "LowQualFinSF", "Functional",
    "GarageQual", "GarageCond", "EnclosedPorch", "3SsnPorch", "PoolArea",
    "MiscVal", "KitchenAbvGr", "ScreenPorch", "BsmtHalfBath", "CentralAir",
    "BsmtFinType2", "Electrical",
]

# Columns judged not to influence the target.
deleted_col_due_to_not_depend = ["MoSold", "YrSold", "SaleType"]

# Remove both groups from the combined frame in one call.
concatenated_df.drop(
    columns=[*deleted_col_due_to_single_value, *deleted_col_due_to_not_depend],
    inplace=True,
)

# Preview the frame after the drop.
concatenated_df.head()

In [None]:
# Show the columns that remain after the drops.
concatenated_df.columns

In [None]:
# Summary statistics of the remaining numeric columns.
concatenated_df.describe()

In [None]:
# Summary statistics of the remaining object-dtype (categorical) columns.
concatenated_df.describe(include=['O'])

# Transforming the data as needed (LabelEncoder)

In [None]:
# Turn each calendar-year column into "years before 2023" (an age-like value).
for year_column in ("YearBuilt", "YearRemodAdd", "GarageYrBlt"):
    concatenated_df[year_column] = 2023 - concatenated_df[year_column]

In [None]:
# Replace every object-dtype column with integer codes. One encoder instance is
# re-fitted per column, so only the last column's mapping remains on it.
label_encoder = LabelEncoder()

object_frame = concatenated_df.select_dtypes(include=['O'])
categorical_columns = object_frame.columns.tolist()

for categorical_name in categorical_columns:
    encoded_values = label_encoder.fit_transform(concatenated_df[categorical_name])
    concatenated_df[categorical_name] = encoded_values

concatenated_df.head()

In [None]:
# Fill every remaining missing value with 0, in place.
concatenated_df.fillna(0, inplace=True)

# Re-check: percentage of missing values per column. The table is expected to
# be empty after the fill above.
remaining_nulls = concatenated_df.isnull().sum()
all_data_null = (remaining_nulls / len(concatenated_df)) * 100
still_missing = all_data_null[all_data_null != 0]
all_data_null = still_missing.sort_values(ascending=False).head(30)
missing_data_null = pd.DataFrame({'Missing Ratio': all_data_null})
missing_data_null.head(20)

# Log-transforming SalePrice to make it closer to normal

In [None]:
# Histogram of the raw SalePrice target with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# migrating to sns.histplot when the environment is upgraded.
sns.distplot(y_train, fit=norm)

# Maximum-likelihood mean and standard deviation of the target.
(mu, sigma) = norm.fit(y_train)
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are not valid Python escape sequences; the
# backslashes belong to matplotlib's mathtext markup.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Q-Q plot of the target against a normal distribution, on a second figure.
fig = plt.figure()
res = stats.probplot(y_train, plot=plt)
plt.show()

In [None]:
# Replace the target with log(1 + SalePrice). Re-running this cell applies the
# transform again on the already transformed values.
y_train = np.log1p(y_train)
y_train.head()

In [None]:
# Histogram of the log-transformed SalePrice target with a fitted normal curve.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# migrating to sns.histplot when the environment is upgraded.
sns.distplot(y_train, fit=norm)

# Maximum-likelihood mean and standard deviation of the transformed target.
(mu, sigma) = norm.fit(y_train)
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are not valid Python escape sequences; the
# backslashes belong to matplotlib's mathtext markup.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Q-Q plot of the transformed target against a normal distribution.
fig = plt.figure()
res = stats.probplot(y_train, plot=plt)
plt.show()

# Checking skewness of data points

In [None]:
# Skewness of every non-object column, most positively skewed first.
column_dtypes = concatenated_df.dtypes
numeric_feats = column_dtypes[column_dtypes != "object"].index
numeric_frame = concatenated_df[numeric_feats]
skewed_feats = numeric_frame.apply(
    lambda feature: skew(feature.dropna())
).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness.head()

In [None]:
# Visual check of what log1p does to one feature (LotArea).
y_train_see = np.log1p(concatenated_df["LotArea"])

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# migrating to sns.histplot when the environment is upgraded.
sns.distplot(y_train_see, fit=norm)

# Fit the normal parameters on the series being plotted. Fitting on y_train
# here would report the target's mu/sigma in the legend instead of LotArea's.
(mu, sigma) = norm.fit(y_train_see)
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are not valid Python escape sequences; the
# backslashes belong to matplotlib's mathtext markup.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('distribution')

# Q-Q plot of the transformed feature against a normal distribution.
fig = plt.figure()
res = stats.probplot(y_train_see, plot=plt)
plt.show()

In [None]:
# Keep only the features whose absolute skew exceeds 0.75. Indexing the frame
# with a boolean frame (skewness[abs(skewness) > 0.75]) only turns the
# non-matching cells into NaN and keeps every row, so the row filter must be
# built from the 'Skew' column itself.
skewness = skewness[skewness['Skew'].abs() > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

# Apply the Box-Cox transform of (1 + x) with a fixed lambda to each skewed feature.
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    concatenated_df[feat] = boxcox1p(concatenated_df[feat], lam)

# NOTE(review): log1p is applied on top of the Box-Cox output, so these
# features are transformed twice — confirm this is intended.
concatenated_df[skewed_features] = np.log1p(concatenated_df[skewed_features])
concatenated_df

# Different Regression Model

In [None]:
# Row position where the training rows end and the test rows begin in the
# combined frame. Both expressions give the same boundary.
train_data_index = len(train_df)
test_data_index = len(concatenated_df) - len(test_df)

# Split the combined data back into train and test feature sets. Explicit
# copies are taken so that later in-place edits (e.g. fillna) act on
# independent frames rather than on slices of concatenated_df, which would
# raise SettingWithCopyWarning.
X_train = concatenated_df[:train_data_index].copy()
X_test = concatenated_df[test_data_index:].copy()

# Now, X_train contains the training features, and X_test the test features


In [None]:
# Display the test feature frame.
X_test

In [None]:
# Display the training feature frame.
X_train

In [None]:
# Display the (log-transformed) training target.
y_train

In [None]:
# Initialize the base regressors.
linear_reg = LinearRegression()
decision_tree_reg = DecisionTreeRegressor(random_state=0)
random_forest_reg = RandomForestRegressor(n_estimators=100, random_state=0)
gradient_boosting_reg = GradientBoostingRegressor(
    n_estimators=3000, learning_rate=0.05,
    max_depth=4, max_features='sqrt',
    min_samples_leaf=15, min_samples_split=10,
    loss='huber', random_state=5,
)
svr_reg = SVR(kernel='rbf')

# `silent` and `nthread` are legacy XGBoost arguments; `verbosity` and `n_jobs`
# are their current equivalents (verbosity=0 keeps the booster quiet, n_jobs=-1
# uses all available cores).
xgb_reg = XGBRegressor(
    colsample_bytree=0.4603, gamma=0.0468,
    learning_rate=0.05, max_depth=3,
    min_child_weight=1.7817, n_estimators=2200,
    reg_alpha=0.4640, reg_lambda=0.8571,
    subsample=0.5213, verbosity=0,
    random_state=7, n_jobs=-1,
)

lightgbm = LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)

# Stacked ensemble: the base regressors' out-of-fold predictions, together with
# the original features (use_features_in_secondary=True), feed an XGBoost
# meta-regressor. The internal CV shuffles by default, so a fixed random_state
# is supplied to make the stacked fit reproducible.
stack_gen = StackingCVRegressor(
    regressors=(linear_reg, decision_tree_reg, random_forest_reg,
                gradient_boosting_reg, svr_reg, lightgbm, xgb_reg),
    meta_regressor=xgb_reg,
    use_features_in_secondary=True,
    random_state=42,
)

In [None]:
# Shared 10-fold splitter with a fixed shuffle seed, used by cv_rmse below.
kfolds = KFold(
    n_splits=10,
    random_state=42,
    shuffle=True,
)

def rmsle(y, y_pred):
    """Return the root of the mean squared error between *y* and *y_pred*.

    No logarithm is taken here; the result is a log-scale error only when the
    inputs are already log-transformed.
    """
    squared_error_mean = mean_squared_error(y, y_pred)
    return np.sqrt(squared_error_mean)

def cv_rmse(model, X=X_train, y=y_train):
    """Return the per-fold RMSE of *model* under the shared `kfolds` splitter.

    The defaults are bound to the X_train / y_train objects that exist when
    this function is defined.
    """
    # The scorer returns negated MSE values, so flip the sign before the root.
    negated_mse = cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=kfolds
    )
    return np.sqrt(-negated_mse)

### The [coefficients] values come from here

In [None]:
# BY RELATIVE ERROR OF MEAN AND STD


# score = cv_rmse(linear_reg)
# print("Ridge: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),  )

# score = cv_rmse(decision_tree_reg)
# print("decision_tree_reg: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),  )

# score = cv_rmse(random_forest_reg)
# print("random_forest_reg net: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),  )

# score = cv_rmse(gradient_boosting_reg)
# print("gradient_boosting_reg: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),  )

# score = cv_rmse(svr_reg)
# print("svr_reg: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),  )

# score = cv_rmse(xgb_reg)
# print("xgb_reg: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),  )

# score = cv_rmse(lightgbm)
# print("lightgbm: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),  )

# score = cv_rmse(stack_gen)
# print("stack_gen: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),  )

# The [coefficients] values come from here

In [None]:
# Fit every base regressor on the full training set, in the listed order.
base_estimators = (
    linear_reg,
    decision_tree_reg,
    random_forest_reg,
    gradient_boosting_reg,
    svr_reg,
    xgb_reg,
    lightgbm,
)
for estimator in base_estimators:
    estimator.fit(X_train, y_train)

# Fit the stacked ensemble last.
stack_gen.fit(X_train, y_train)

In [None]:
# Replace any remaining NaN in the test features with 0 before predicting.
# The result is rebound instead of using inplace=True: X_test was produced by
# slicing concatenated_df, and filling a slice in place triggers
# SettingWithCopyWarning.
X_test = X_test.fillna(0)

# Re-check: number of missing values per test column. These are counts, not
# ratios, hence the column label. The table is expected to be empty.
all_data_na = X_test.isnull().sum()
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:20]
missing_data = pd.DataFrame({'Missing Count': all_data_na})
missing_data.head(10)

In [None]:
# Predict the test set with every fitted model, in a fixed order that matches
# the names unpacked below.
fitted_models = (
    linear_reg,
    decision_tree_reg,
    random_forest_reg,
    gradient_boosting_reg,
    svr_reg,
    xgb_reg,
    lightgbm,
    stack_gen,
)
(
    linear_reg_preds,
    decision_tree_preds,
    random_forest_preds,
    gradient_boosting_preds,
    svr_preds,
    xgb_preds,
    lightgbm_preds,
    stack_gen_preds,
) = [fitted.predict(X_test) for fitted in fitted_models]

In [None]:
# Collect each model's test predictions as one column per model.
predictions_df = pd.DataFrame(dict(
    LinearReg=linear_reg_preds,
    DecisionTree=decision_tree_preds,
    RandomForest=random_forest_preds,
    GradientBoosting=gradient_boosting_preds,
    SVR=svr_preds,
    XGBoost=xgb_preds,
    Lightgbm=lightgbm_preds,
    StackGen=stack_gen_preds,
))


In [None]:
# Blend weight per model, keyed by the matching predictions_df column name.
# 'DecisionTree' has no entry, so it does not take part in the blend.
coefficients = dict(
    LinearReg=0.30802729485992525,
    RandomForest=0.02367726228440147,
    GradientBoosting=0.19984912388986045,
    SVR=0.027966944415869426,
    XGBoost=0.20027774920275693,
    Lightgbm=0.09594349003874773,
    StackGen=0.14425813530843876,
)

In [None]:
# Blend the per-model predictions with the weights in `coefficients`,
# accumulating in the dictionary's order.
weighted_log_price = 0
for model_name, weight in coefficients.items():
    weighted_log_price = weighted_log_price + predictions_df[model_name] * weight

# The models were trained on log1p(SalePrice); expm1 maps the blend back to prices.
predictions_df['WeightedSalePrice'] = np.expm1(weighted_log_price)

# Ids are taken from the sample submission file.
test_ids = sample_submission['Id']

# Assemble the two-column submission and write it without the index.
submission_df_2 = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions_df['WeightedSalePrice'],
})
submission_df_2.to_csv('submission.csv', index=False)

submission_df_2.head()

# Score: 0.12061 