In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

> __Read The Data__

In [None]:
train_df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

In [None]:
test_df = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
train_df.head()

In [None]:
# 'Id' is not needed as a feature. errors='ignore' keeps this cell safe to
# re-run once the column has already been removed.
train_df = train_df.drop(columns='Id', errors='ignore')

In [None]:
# Percentage of null entries per column; only columns that actually contain
# nulls are kept in the summary table.
n_rows = len(train_df)
percent_missing = train_df.isna().sum() * 100 / n_rows
missing_value_df = pd.DataFrame(
    {'column_name': train_df.columns, 'percent_missing': percent_missing}
)
missing_value_df = missing_value_df.loc[missing_value_df['percent_missing'] > 0]

> __Visualize the null values__

In [None]:
# One horizontal bar per column that has nulls; bar length = percent missing.
plt.figure(figsize=(15, 10))
plt.barh(
    y=missing_value_df['column_name'],
    width=missing_value_df['percent_missing'],
    color='darkblue',
)
plt.title("The Percentages Of The Columns Null Values", fontsize=15)
plt.show()

In [None]:
# check duplicates
train_df.duplicated().sum()

> # Data Cleaning (all subsequent phases depend on this)

**Drop the columns whose null percentage is >= 50**

In [None]:
# Columns singled out above for their large share of nulls.
sparse_columns = ['Fence', 'MiscFeature', 'PoolQC', 'FireplaceQu', 'Alley']
train_df.drop(columns=sparse_columns, inplace=True)
train_df.shape

In [None]:
# Recompute the null percentages now that columns have been dropped, again
# keeping only the columns that still contain nulls.
percent_missing = train_df.isnull().sum() * 100 / len(train_df)
missing_value_df = (
    pd.DataFrame({'column_name': train_df.columns, 'percent_missing': percent_missing})
    .query('percent_missing > 0')
)

In [None]:
# LotFrontage = linear feet of street connected to the property.
# Box plots of the three numeric columns listed below.
numeric_null_columns = ["LotFrontage", "MasVnrArea", "GarageYrBlt"]
plt.figure(figsize=(10, 8))
train_df.boxplot(column=numeric_null_columns)
plt.title("The Box Plot For the Floating Null Columns", fontsize=15)
plt.show()

> The data needs to be `re-scaled`

In [None]:
train_df[missing_value_df.column_name.values].describe()

> As we can see: 
1. the `MasVnrArea` column has high variance and 50% of the data is equal to 0
2. the `LotFrontage` and `GarageYrBlt` columns have a low std

In [None]:
# fill MasVnrArea: forward-fill from the previous valid row, then back-fill so
# a gap at the very top of the column is covered too.
# The result is assigned back to the frame: an in-place fillna on the
# attribute-accessed Series is a chained operation that does not update
# train_df under pandas Copy-on-Write, and fillna(method=...) is deprecated.
train_df['MasVnrArea'] = train_df['MasVnrArea'].ffill().bfill()

In [None]:
# fill LotFrontage, GarageYrBlt by median
# Assign the filled Series back instead of calling fillna(inplace=True) on
# train_df.<column>: that chained in-place call does not modify train_df under
# pandas Copy-on-Write.
for column in ('LotFrontage', 'GarageYrBlt'):
    train_df[column] = train_df[column].fillna(train_df[column].median())

In [None]:
round(train_df[missing_value_df.column_name.values].isnull().sum() / len(train_df), 3)

> The remaining columns with nulls are of dtype object and have only a few missing values

In [None]:
# Propagate the last valid observation forward, then use the next valid
# observation for any gap left at the top of a column.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
train_df = train_df.ffill().bfill()

> # EDA (Let's Explore It)

In [None]:
# give a quick EDA about the Data
def plot_value_counts(columns, df):
    """Plot the value frequencies of the low-cardinality columns of ``df``.

    For each name in ``columns`` the distinct non-null values are counted and
    the chart type is chosen from that count:

    * 1 to 5 distinct values  -> pie chart with percentage labels
    * 6 to 15 distinct values -> horizontal bar chart
    * 0 or more than 15       -> nothing is drawn

    Args:
        columns: iterable of column labels that exist in ``df``.
        df: DataFrame holding the columns to summarise.

    Returns:
        None. Figures are shown through matplotlib as a side effect.
    """
    for column in columns:
        # Count once and reuse the result for the cardinality test and the plot.
        counts = df[column].value_counts()
        n_values = len(counts)

        if n_values == 0:
            # Column holds only nulls: there is nothing to chart.
            continue

        if 6 <= n_values <= 15:  # the bar chart is suitable
            plt.figure(figsize=(10, 8))
            counts.plot(kind='barh', fontsize=12, color='gold')
            plt.title(f"The Frequency of the {column} column", fontsize=15)
            plt.show()

        elif n_values < 6:  # pie chart
            plt.figure(figsize=(10, 8))
            counts.plot(kind='pie', autopct='%1.1f%%', fontsize=12)
            plt.title(f"The ratio between vlaues for the {column} column", fontsize=15)
            plt.ylabel("")
            plt.show()

In [None]:
# plot_value_counts(train_df.columns, train_df)

In [None]:
year_df = train_df.sort_values(by='YrSold')

In [None]:
# SalePrice against the year of sale (seaborn aggregates rows sharing a year).
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=year_df, x='YrSold', y='SalePrice', ci=0, ax=ax)
ax.set_title("The Sales Time Line According To Year", fontsize=20)
ax.set_xlabel("The Year")
plt.show()

In [None]:
# SalePrice against the month of sale.
plt.figure(figsize=(15, 8))
month_ax = sns.lineplot(x='MoSold', y='SalePrice', data=year_df, ci=0)
month_ax.set_title("The Sales Time Line According To Month", fontsize=20)
month_ax.set_xlabel("The Month")
plt.show()

> # ML: Gonna Predict The Sales

> __The numerical columns need to be `re-scaled`__\
__The object-dtype columns need to be `encoded`__

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# label encoding: every object-dtype column is replaced by integer codes,
# using a separate LabelEncoder fitted on that column.
object_columns = [name for name in train_df.columns if train_df[name].dtype == 'O']
for name in object_columns:
    train_df[name] = LabelEncoder().fit_transform(train_df[name])

In [None]:
# scale_columns = []

# for column in train_df.columns:
#     if train_df[column].dtype != 'O':  # if it's not Object dtype
#         scale_columns.append(column)
#     else:
#         try:
#             le = LabelEncoder()  # label encoding
#             train_df[column] = le.fit_transform(train_df[column])
#         except:
#             continue

In [None]:
# # Normalize The Data (put them in the same scale)
# def normalize(columns, df):
#     for column in columns:
#         df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the feature columns only. The target keeps its original units, and a
# scaler that was never shown 'SalePrice' can also transform a frame that has
# no target column.
feature_cols = train_df.columns.drop("SalePrice")
scaler = MinMaxScaler()

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the held-out rows influence the min/max used for scaling.
scaled_train = pd.DataFrame(
    scaler.fit_transform(train_df[feature_cols]),
    columns=feature_cols,
    index=train_df.index,  # keep row labels so the target aligns row by row
)
scaled_train["SalePrice"] = train_df["SalePrice"]

__Start Modeling__

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix (every column except the target) and target vector as arrays.
feature_frame = train_df.drop(columns="SalePrice")
X = feature_frame.to_numpy()
y = train_df["SalePrice"].to_numpy()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

__XGBRegressor On The Not Scaled Data__

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# The original call passed `max_deepth` and `bosster`, which are not XGBoost
# parameter names, so those two settings never took effect. The spelling is
# corrected here, which means the requested booster is now actually used.
# "reg:linear" is the deprecated alias of "reg:squarederror".
# NOTE(review): max_depth is a tree-booster setting; with booster="gblinear"
# XGBoost is expected to report it as unused - confirm which booster is intended.
model = XGBRegressor(
    objective="reg:squarederror",
    booster="gblinear",
    max_depth=15,
    n_estimators=100,
    random_state=100,
)

In [None]:
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

In [None]:
predY = model.predict(X_test)

In [None]:
# RMSE
np.sqrt(mean_squared_error(y_test, predY))

__XGBRegressor On The `Scaled` Data__

In [None]:
X = scaled_train.drop("SalePrice", axis=1).values
y = scaled_train.SalePrice.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# The original call passed `max_deepth` and `bosster`, which are not XGBoost
# parameter names, so those two settings never took effect. The spelling is
# corrected here, which means the requested booster is now actually used.
# "reg:linear" is the deprecated alias of "reg:squarederror".
# NOTE(review): max_depth is a tree-booster setting; with booster="gblinear"
# XGBoost is expected to report it as unused - confirm which booster is intended.
model = XGBRegressor(
    objective="reg:squarederror",
    booster="gblinear",
    max_depth=15,
    n_estimators=100,
    random_state=100,
)
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

__Lasso Feature Selection__

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Fit the Lasso on the min-max scaled features explicitly, rather than on
# whichever X / y globals happen to have been assigned last. Taking the names
# from the same frame guarantees each coefficient lines up with its column.
names = scaled_train.drop('SalePrice', axis=1).columns
lasso = Lasso(alpha=0.1)
lasso_coef = lasso.fit(scaled_train[names].values, scaled_train['SalePrice'].values).coef_

In [None]:
# Keep the features whose Lasso coefficient is larger than 3000 in magnitude.
strong_mask = np.abs(lasso_coef) > 3000
top_coef = lasso_coef[strong_mask]
top_names = names[strong_mask]

In [None]:
top_names

In [None]:
# Line plot of the selected coefficients, one tick per selected feature name.
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(top_names, top_coef)
ax.set_xticks(range(len(top_names)))
ax.set_xticklabels(top_names, rotation=60)
ax.set_title("Top Coefficients", fontsize=15)
ax.set_ylabel('Coefficients')
plt.show()

In [None]:
X_top = train_df[top_names]
y_top = train_df.SalePrice.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_top, y_top, test_size=.2, random_state=42)

In [None]:
# Despite its name this is an XGBRegressor, trained on the Lasso-selected columns.
# The original call passed `max_deepth` and `bosster`, which are not XGBoost
# parameter names, so those two settings never took effect. The spelling is
# corrected here, which means the requested booster is now actually used.
# "reg:linear" is the deprecated alias of "reg:squarederror".
# NOTE(review): max_depth is a tree-booster setting; with booster="gblinear"
# XGBoost is expected to report it as unused - confirm which booster is intended.
lasso_model = XGBRegressor(
    objective="reg:squarederror",
    booster="gblinear",
    max_depth=15,
    n_estimators=100,
    random_state=100,
)

In [None]:
lasso_model.fit(X_train, y_train)

In [None]:
lasso_model.score(X_test, y_test)

In [None]:
predY = lasso_model.predict(X_test)

In [None]:
# RMSE
np.sqrt(mean_squared_error(y_test, predY))

__Linear Regression__

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
X = train_df.drop("SalePrice", axis=1).values
y = train_df.SalePrice.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [None]:
model = LinearRegression()

In [None]:
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

__cross_val_score__

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
model = XGBRegressor()

In [None]:
cross_score = cross_val_score(model, X, y, cv=5)

In [None]:
cross_score

> __So the best score we get is from the XGBRegressor trained on the Lasso-selected features (`lasso_model`)__

In [None]:
# Final model: an XGBRegressor that is refitted on the Lasso-selected columns.
# The original call passed `max_deepth` and `bosster`, which are not XGBoost
# parameter names, so those two settings never took effect. The spelling is
# corrected here, which means the requested booster is now actually used.
# "reg:linear" is the deprecated alias of "reg:squarederror".
# NOTE(review): max_depth is a tree-booster setting; with booster="gblinear"
# XGBoost is expected to report it as unused - confirm which booster is intended.
lasso_model = XGBRegressor(
    objective="reg:squarederror",
    booster="gblinear",
    max_depth=15,
    n_estimators=100,
    random_state=100,
)

In [None]:
lasso_model.fit(X_top, y_top)

> # Prediction

In [None]:
test_df.head()

In [None]:
test_df.drop(['Fence', 'MiscFeature', 'PoolQC', 'FireplaceQu', 'Alley'], axis=1, inplace=True)
test_df.shape

In [None]:
# fill MasVnrArea: forward-fill from the previous valid row, then back-fill so
# a gap at the very top of the column is covered too.
# The result is assigned back to the frame: an in-place fillna on the
# attribute-accessed Series is a chained operation that does not update
# test_df under pandas Copy-on-Write, and fillna(method=...) is deprecated.
test_df['MasVnrArea'] = test_df['MasVnrArea'].ffill().bfill()

In [None]:
# fill LotFrontage, GarageYrBlt by median
# The medians come from the training frame so the test rows are imputed with
# the same statistics the model was trained on. The filled Series is assigned
# back because fillna(inplace=True) on test_df.<column> is a chained call that
# does not modify test_df under pandas Copy-on-Write.
for column in ('LotFrontage', 'GarageYrBlt'):
    test_df[column] = test_df[column].fillna(train_df[column].median())

In [None]:
# Propagate the last valid observation forward, then use the next valid
# observation for any gap left at the top of a column.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
test_df = test_df.ffill().bfill()

In [None]:
# scale_columns = []

# for column in test_df.columns:
#     if column == 'Id':
#         continue
#     if test_df[column].dtype != 'O':  # if it's not Object dtype
#         scale_columns.append(column)
#     else:
#         try:
#             le = LabelEncoder()  # label encoding
#             test_df[column] = le.fit_transform(test_df[column])
#         except:
#             continue
            

In [None]:
# normalize(scale_columns, test_df)

In [None]:
# Encode the test categoricals with the same string -> integer mapping that was
# used for training. A fresh LabelEncoder fitted on a test column numbers only
# the categories present in the test set, so the same string can get a different
# code than it had during training.
# train_df has already been encoded in place, so the training categories are
# recovered from the raw training file. LabelEncoder numbers its classes in
# sorted order, which np.sort over the unique values reproduces.
train_categories = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

for column in test_df.columns:
    if column == 'Id':
        continue
    if test_df[column].dtype == 'O':  # only object-dtype columns are encoded
        classes = np.sort(train_categories[column].dropna().unique())
        # Categories never seen in training receive the code -1.
        codes = pd.Categorical(test_df[column], categories=classes).codes
        test_df[column] = codes.astype('int64')

In [None]:
X = test_df[top_names]

In [None]:
predY = lasso_model.predict(X)

In [None]:
Id = test_df.Id.values

In [None]:
sub_df = pd.DataFrame({"Id" : Id, "SalePrice" : predY})

In [None]:
sub_df.head()

In [None]:
sub_df.to_csv('submission.csv', index=False)
print("submission successed")