In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import LabelEncoder
import warnings

# Report the interpreter and library versions used by this run.
version_report = (
    ('Python: {}', sys.version),
    ('numpy: {}', np.__version__),
    ('pandas: {}', pd.__version__),
    ('matplotlib: {}', matplotlib.__version__),
    ('seaborn: {}', sns.__version__),
    ('sklearn: {}', sklearn.__version__),
)
for template, value in version_report:
    print(template.format(value))


# Input data files are available in the read-only "../input/" directory
# Walk the Kaggle input directory and print the full path of every file found.
print("\n\n============== [ DataFrames ] ==================\n\n")
import os
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def warns(*args, **kwargs):
    """Do-nothing stand-in for ``warnings.warn``: accepts any arguments and ignores them."""
    return None


# From here on, calls that go through ``warnings.warn`` reach the no-op above.
warnings.warn = warns

# **Load Dataset**

In [None]:
# Read the competition's train and test CSVs from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
# Report the (rows, columns) shape of each frame.
print("Train Shape: ", df_train.shape, "\nTest Shape: ", df_test.shape)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

### **Sale Price Visualization**

In [None]:
# Distribution of the target variable.
# A single figure-level ``displot`` with a KDE overlay gives the histogram plus density curve.
# ``sns.distplot`` is deprecated in favour of ``displot``/``histplot`` and is not used here;
# calling both would only draw the same data a second time.
sns.displot(df_train['SalePrice'], kde=True)

In [None]:
# Highest sale price in the training data; reused later as the upper y-limit of the box plot.
maxPrice = df_train['SalePrice'].max()
print("Maximum Sale Price: ", maxPrice)

In [None]:
# Two-column frame pairing the target with the overall-quality rating.
data = df_train[['SalePrice', 'OverallQual']]
f, ax = plt.subplots(figsize=(14, 8))
# One box per quality level, drawn on the axes created above.
fig = sns.boxplot(x='OverallQual', y='SalePrice', data=data, ax=ax)
# Pin the y-range from zero up to the highest observed price.
fig.axis(ymin=0, ymax=maxPrice)

### **Missing & Null Data Handling**

In [None]:
# Function for Checking Null Values in given DataFrames
def get_missing_values(df):
    """Summarise the missing values of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, ordered by descending missing count, with
        ``Total`` (number of null cells) and ``Percent`` (share of null cells,
        expressed as a fraction between 0 and 1, not multiplied by 100).
    """
    # Build the null mask once; both statistics are derived from it.
    null_mask = df.isnull()
    missing_data = pd.DataFrame({
        'Total': null_mask.sum(),
        'Percent': null_mask.mean(),  # sum / row count
    })
    # A single stable sort on the combined frame keeps Total and Percent on the
    # same row and gives columns with equal counts a deterministic order.
    return missing_data.sort_values('Total', ascending=False, kind='mergesort')

In [None]:
# Check Train Missing Values
# Build the per-column missing-value summary and show its first 20 rows.
train_missing_values = get_missing_values(df_train)
train_missing_values.head(20)

In [None]:
# Check Test Missing Values
# Build the per-column missing-value summary and show its first 20 rows.
test_missing_values = get_missing_values(df_test)
test_missing_values.head(20)

#### **Visualizing the Missing/Null Values**

In [None]:
# Heatmap of the null mask of the raw training frame (one cell per row/column entry)
fig, ax = plt.subplots(figsize=(12, 6))  # figsize in inches
ax.set_title("Missing Value in Train DataFrame", fontsize=18)
sns.heatmap(df_train.isnull(), ax=ax)

In [None]:
# Heatmap of the null mask of the raw test frame (one cell per row/column entry)
fig, ax = plt.subplots(figsize=(12, 6))  # figsize in inches
ax.set_title("Missing Value in Test DataFrame", fontsize=18)
sns.heatmap(df_test.isnull(), ax=ax)

### **Filling Null/Missing Data and Encoding the Categorical Values**

In [None]:
# Function for Fill Null Values and
# Encoding of Categorical Values

le = LabelEncoder()

dtypeVal = ["object", "int64", "float64" ]
def fillna_encoder(df):
    """Fill missing values and integer-encode the categorical columns of ``df``.

    Columns are handled by dtype:

    * ``object``: missing values become ``"Not Listed"``, every value is cast to
      ``str``, and each column is replaced by integer codes that follow the sorted
      order of its distinct values (the same numbering
      ``LabelEncoder.fit_transform`` produces).
    * ``int64`` / ``float64``: missing values become ``0``.

    Columns of any other dtype are not carried over to the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to process. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh 0..n-1 row index whose columns are ``index`` (the row
        labels of ``df``), then the encoded object columns, then the int64
        columns, then the float64 columns.

    Notes
    -----
    The codes are derived only from the values present in ``df``, so two frames
    encoded by separate calls do not necessarily share the same mapping.
    """
    # Categorical columns: fill, stringify, then encode column by column.
    obj_df = df.select_dtypes(include=["object"]).fillna("Not Listed").astype(str)
    obj_df = pd.DataFrame(
        {col: pd.factorize(obj_df[col], sort=True)[0] for col in obj_df.columns},
        index=obj_df.index,
    )

    # Numeric columns only need their gaps filled.
    int_df = df.select_dtypes(include=["int64"]).fillna(0)
    float_df = df.select_dtypes(include=["float64"]).fillna(0)

    # Join the groups on the untouched row index. Turning the index into a column
    # before encoding is unsafe: it would be cast to str and encoded together with
    # the categorical columns, its codes would follow string order ('10' < '2'),
    # and a merge on it would pair each row's categorical values with the numeric
    # values of a different row.
    all_df = pd.concat([obj_df, int_df, float_df], axis=1)

    # Callers expect an 'index' column holding the original row labels.
    all_df.insert(0, "index", df.index)
    return all_df.reset_index(drop=True)

### **Train Data**

In [None]:
# Filling Null and Encoding Train Categorical Values
xtrain = fillna_encoder(df_train)
# Display the processed training frame.
xtrain

### **Test Data**

In [None]:
# Filling Null and Encoding Test Categorical Values
xtest = fillna_encoder(df_test)
# Display the processed test frame.
xtest

In [None]:
# Verify the processed training frame: heatmap of its null mask after filling
fig, ax = plt.subplots(figsize=(12, 6))  # figsize in inches
ax.set_title("Missing Value in Train DataFrame", fontsize=18)
sns.heatmap(xtrain.isnull(), ax=ax)

In [None]:
# Verify the processed test frame: heatmap of its null mask after filling
fig, ax = plt.subplots(figsize=(12, 6))  # figsize in inches
ax.set_title("Missing Value in Test DataFrame", fontsize=18)
sns.heatmap(xtest.isnull(), ax=ax)

In [None]:
# Verifying the Null/Missing Values in Train and Test DataFrame
# Each figure is the number of null cells summed over the whole processed frame.
print("Train DataFrame Null Values: ", xtrain.isnull().sum().sum(), "\nTest DataFrame Null Values: ", xtest.isnull().sum().sum() )

### **Define X, y and X_test**

In [None]:
# Define X, y and X_test
# Keep the test Ids for the submission file (note: this name shadows the builtin ``id``).
id = xtest['Id']
# Target vector, then the feature matrix without the identifier, helper and target columns.
y = xtrain['SalePrice'].to_numpy()
X = xtrain.drop(columns=['Id', 'index', 'SalePrice'])

# For test dataframe Prediction
X_test = xtest.drop(columns=['Id', 'index'])

### **Splitting for Training and Validation**

In [None]:
from sklearn.model_selection import train_test_split

# Split into Validation and training data
# 10% of the rows are held out for validation; random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1)

# Show the shape of every split together with the test feature matrix.
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

### **Scaler Transformation**

In [None]:
# Scaler Transformation
# NOTE: MinMaxScaler is imported here but not used in this cell.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Define Scaler
# Fitted on the training split only; the same fitted scaler is then applied to every set below.
scaler = StandardScaler().fit(X_train)

# Scale the Train Set
X_train = scaler.transform(X_train)
# Scale the Validation Set
X_val = scaler.transform(X_val)

# Scale the Test Set
X_test = scaler.transform(X_test)

## **Define Model**

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# The base learner is passed positionally on purpose: its keyword was ``base_estimator``
# up to scikit-learn 1.1 and is ``estimator`` from 1.2 on (the old name was removed in 1.4),
# so the positional form works on either side of the rename.
model = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=15),
    random_state=1,
    n_estimators=1000,
    loss='exponential',
).fit(X_train, y_train)
# score() here is computed on the data the model was fitted on, so it describes fit,
# not generalisation; the validation score is reported separately.
print("Model Score: ",model.score(X_train, y_train))

## **Prediction on Validation Set**

In [None]:
# Predict on the held-out validation features, then cast the predictions to int.
y_pred = model.predict(X_val)
y_pred = y_pred.astype(int)
# score() is computed by the model from X_val directly, so the int cast above does not affect it.
print("Validation Set Score: ",model.score(X_val, y_val))

### **Evaluation**

In [None]:
# Model Error Evaluation
from sklearn.metrics import mean_squared_error

# RMSE is taken as the square root of the MSE instead of passing ``squared=False``:
# that keyword is deprecated in scikit-learn 1.4 and removed in 1.6, while the
# explicit square root gives the same value on every version.
rmse = mean_squared_error(y_val, y_pred) ** 0.5
print("Root Mean Suqared Error: ", rmse)

### **Model Prediction**

In [None]:
# Predict on the scaled test features.
final_pred = model.predict(X_test)
# Clamp negative predictions to zero (modifies the array in place).
final_pred[final_pred < 0] = 0
# Cast to int for the submission file.
final_pred = final_pred.astype(int)
final_pred

### **Create DataFrame for Submission**

In [None]:
# Two-column submission frame: the Id column saved from xtest and the test-set predictions.
final_df = pd.DataFrame({'Id': id, 'SalePrice':final_pred})
final_df

### **Visualization of Actual and Predicted Sale Price**

In [None]:
# Training-set sale prices as a one-column frame, followed by the lengths of the
# test-set predictions and of that frame.
actual_price = pd.DataFrame(y, columns=['Actual'])
print("Pridected SalePrice Length: ", len(final_pred))
print("Actual SalePrice Length: ", len(actual_price))

In [None]:
# Create DataFrame to Visualization
# "Actual" and "Predict" must describe the same houses, so both columns come from the
# validation split: y_val holds the true prices and y_pred the model's predictions for X_val.
# Putting the training targets next to the test-set predictions would pair unrelated rows
# (the test set has no known prices) and would need a hard-coded row drop to hide the
# length mismatch between the two sets.
df_visual = pd.DataFrame({'Actual': y_val, 'Predict': y_pred})
df_visual

In [None]:
# Line plot of the "Actual" column against the frame's index.
plt.subplots(figsize=(15, 8))
plt.plot(df_visual["Actual"])
plt.show()

In [None]:
# Line plot of the "Predict" column against the frame's index, drawn in green.
plt.subplots(figsize=(15, 8))
plt.plot(df_visual["Predict"], color='green')
plt.show()

In [None]:
# Visualization
# Every column of df_visual drawn as a line on one figure, with separately styled
# grid lines for major and minor ticks.
df_visual.plot(kind='line', figsize=(16,10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='blue')
plt.grid(which='minor', linestyle='-', linewidth='0.5', color='green')
plt.show()


In [None]:
# Scatter plot with "Actual" on the x-axis and "Predict" on the y-axis.
plt.subplots(figsize=(14, 8))
points = np.array(df_visual)  # not used further in this cell
x_ax = df_visual['Actual']
y_ax = df_visual['Predict']
plt.xlabel("Actual")
plt.ylabel("Predict")
plt.scatter(x_ax, y_ax, color="#003F72")
plt.show()

### **After Taking Logarithm**
Actual and Predicted SalePrice

In [None]:
# Create DataFrame to Visualization with taking Log
# log1p (log(1 + x)) is applied to both columns at once; the column order stays Actual, Predict.
df_visual2 = np.log1p(df_visual[['Actual', 'Predict']])
df_visual2

In [None]:
# Line plot of the log-transformed "Actual" column against the frame's index.
plt.subplots(figsize=(15, 8))
plt.plot(df_visual2["Actual"])
plt.show()

In [None]:
# Line plot of the log-transformed "Predict" column against the frame's index, drawn in green.
plt.subplots(figsize=(15, 8))
plt.plot(df_visual2["Predict"], color='green')
plt.show()

In [None]:
# Visualization
# Every column of df_visual2 drawn as a line on one figure, with separately styled
# grid lines for major and minor ticks.
df_visual2.plot(kind='line', figsize=(15,8))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='blue')
plt.grid(which='minor', linestyle='-', linewidth='0.5', color='green')
plt.show()


In [None]:
# Scatter plot of the log-transformed values: "Actual" on the x-axis, "Predict" on the y-axis.
plt.subplots(figsize=(14, 8))
points = np.array(df_visual2)  # not used further in this cell
x_ax = df_visual2['Actual']
y_ax = df_visual2['Predict']
plt.xlabel("Actual")
plt.ylabel("Predict")
plt.scatter(x_ax, y_ax, color="#003F72")
plt.show()

### **Export CSV File for Submission**

In [None]:
# Write the submission frame to myHousePrice.csv (relative path), without the row index.
final_df.to_csv('myHousePrice.csv', index=False)
# NOTE: this message only follows the local file write; nothing is uploaded by this cell.
print('Submitted Successfully')

### **Read Output Submission**

In [None]:
# Walk everything under /kaggle and print each file's full path.
print("\n\n============== [ DataFrames ] ==================\n\n")
import os
for dirname, _, filenames in os.walk('/kaggle'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Read the exported CSV back from the working directory and report its shape.
submission = pd.read_csv('/kaggle/working/myHousePrice.csv')
print("Submission File Shape: ", submission.shape)

In [None]:
# Display the submission frame that was read back.
submission

#### **Sure All Praises for Almighty Allah**
### Good Luck