In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings   # remove all warnings from the output
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 500)  ## this will display all the columns in the output of .csv file
pd.set_option("display.max_rows", 500)

##### This is a helper function that prints the percentage of null values in a feature and, optionally, draws a count plot of it.

In [None]:
def null(df, feature, plot=False):
    """Report how much of a column is missing and optionally plot its value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``feature``.
    feature : str
        Name of the column to inspect.
    plot : bool, default False
        When True, draw a seaborn count plot of ``df[feature]``.

    Returns
    -------
    None
        The percentage is printed, not returned.
    """
    t = df[feature].isna().mean() * 100
    print(f'% of null --> {t}')
    if plot:
        # Plot the frame that was passed in (not a notebook-level global) and
        # pass the values as `x=` rather than positionally.
        sns.countplot(x=df[feature], palette='cool')

In [None]:
train_path = '../input/house-prices-advanced-regression-techniques/train.csv'
test_path = "../input/house-prices-advanced-regression-techniques/test.csv"

## Exploratory Data Analysis

### Looking into the data

In [None]:
train_data = pd.read_csv(train_path)
train_data.head()

In [None]:
train_data.info()

In [None]:
# Work on a copy so that train_data keeps the values read from disk; re-running
# this cell resets `dataset` to that state if a later step changes it by mistake.
dataset = train_data.copy()
dataset.tail()

### Looking for null values

In [None]:
dataset.isnull().sum()

#### Features which contain null values

In [None]:
# Names of all columns that hold at least one missing value.
has_missing = dataset.isnull().any()
null_feat = dataset.columns[has_missing].tolist()
null_feat

In [None]:
# Show the distinct values of every column that has missing entries.
for column in null_feat:
    distinct_values = dataset[column].unique()
    print(f"{column} --> {distinct_values}")

## Feature Engineering

### Looking into each feature containing null values

### LotFrontage

In [None]:
# Fill missing LotFrontage with the column mean. The result is assigned back
# because fillna(inplace=True) on a dataset[...] selection acts on an
# intermediate object and is not guaranteed to update `dataset`.
dataset['LotFrontage'] = dataset['LotFrontage'].fillna(dataset['LotFrontage'].mean())
dataset['LotFrontage'].isnull().sum()

### Alley

In [None]:
dataset['Alley'].value_counts()

In [None]:
null(dataset, 'Alley', True) ## --> 93.7671% are missing.

### Dropping this column because about 93% of its values are null

In [None]:
# Remove the Alley column from the working frame.
dataset = dataset.drop(columns='Alley')

### MasVnrType

In [None]:
null(dataset, 'MasVnrType', True)

### Replace NaN with 'None'

In [None]:
# Assign the filled column back: fillna(inplace=True) on a dataset[...]
# selection is not guaranteed to update `dataset` itself.
dataset['MasVnrType'] = dataset['MasVnrType'].fillna('None')

### MasVnrArea

In [None]:
dataset['MasVnrArea'].unique()

In [None]:
null(dataset, 'MasVnrArea')

In [None]:
# Fill missing MasVnrArea with the column mean and assign the result back:
# fillna(inplace=True) on a dataset[...] selection is not guaranteed to update `dataset`.
dataset['MasVnrArea'] = dataset['MasVnrArea'].fillna(dataset['MasVnrArea'].mean())

### BsmtQual

In [None]:
null(dataset, 'BsmtQual', True)

In [None]:
# Assign the filled column back: fillna(inplace=True) on a dataset[...]
# selection is not guaranteed to update `dataset` itself.
dataset['BsmtQual'] = dataset['BsmtQual'].fillna('TA')

### BsmtCond

In [None]:
null(dataset, 'BsmtCond', True)

In [None]:
# Assign the filled column back: fillna(inplace=True) on a dataset[...]
# selection is not guaranteed to update `dataset` itself.
dataset['BsmtCond'] = dataset['BsmtCond'].fillna('TA')

In [None]:
null_feat

### BsmtExposure

In [None]:
null(dataset, 'BsmtExposure', True)

In [None]:
# Assign the filled column back: fillna(inplace=True) on a dataset[...]
# selection is not guaranteed to update `dataset` itself.
dataset['BsmtExposure'] = dataset['BsmtExposure'].fillna('No')

### BsmtFinType1

In [None]:
null(dataset, 'BsmtFinType1', True)

In [None]:
# Assign the filled column back: fillna(inplace=True) on a dataset[...]
# selection is not guaranteed to update `dataset` itself.
dataset['BsmtFinType1'] = dataset['BsmtFinType1'].fillna('Unf')

### 'BsmtFinType2'

In [None]:
null(dataset, 'BsmtFinType2', True)

In [None]:
# Assign the filled column back: fillna(inplace=True) on a dataset[...]
# selection is not guaranteed to update `dataset` itself.
dataset['BsmtFinType2'] = dataset['BsmtFinType2'].fillna('Unf')

### 'Electrical'

In [None]:
null(dataset, 'Electrical', True)

In [None]:
dataset['Electrical'].unique()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a dataset[...]
# selection is not guaranteed to update `dataset` itself.
dataset['Electrical'] = dataset['Electrical'].fillna('SBrkr')

### FireplaceQu

In [None]:
null(dataset, "FireplaceQu", True)

In [None]:
# Assign the filled column back: fillna(inplace=True) on a dataset[...]
# selection is not guaranteed to update `dataset` itself.
dataset['FireplaceQu'] = dataset['FireplaceQu'].fillna('None')

### GarageType

In [None]:
null(dataset, 'GarageType', True)

In [None]:
# Fill missing GarageType with the most frequent value. `unique()[0]` is merely
# the value of the first row (and can itself be NaN); mode() ignores NaN and
# does not depend on row order.
val = dataset['GarageType'].mode()[0]
# Assign back: fillna(inplace=True) on a dataset[...] selection is not
# guaranteed to update `dataset` itself.
dataset['GarageType'] = dataset['GarageType'].fillna(val)

### GarageYrBlt

In [None]:
null(dataset, 'GarageYrBlt')

In [None]:
# Count plot of garage construction years; the values are passed as `x=`
# rather than positionally so seaborn treats them as the x variable.
plt.figure(figsize=(18, 8))
sns.countplot(x=dataset['GarageYrBlt'], palette='rainbow')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Fill with a numeric year: the string '2005.0' would mix str and float values
# and turn the column into object dtype. The result is assigned back because
# fillna(inplace=True) on a dataset[...] selection is not guaranteed to update `dataset`.
dataset['GarageYrBlt'] = dataset['GarageYrBlt'].fillna(2005.0)

### GarageFinish

In [None]:
null(dataset, 'GarageFinish', True)

In [None]:
# Fill missing GarageFinish with the most frequent value. `unique()[1]` depends
# on row order (and can be NaN); mode() ignores NaN. Assigned back because
# fillna(inplace=True) on a dataset[...] selection is not guaranteed to update `dataset`.
dataset['GarageFinish'] = dataset['GarageFinish'].fillna(dataset['GarageFinish'].mode()[0])

### GarageQual

In [None]:
null(dataset, 'GarageQual', True)

In [None]:
# Fill missing GarageQual with the most frequent value. `unique()[0]` is just
# the first row's value (and can be NaN); mode() ignores NaN. Assigned back because
# fillna(inplace=True) on a dataset[...] selection is not guaranteed to update `dataset`.
dataset['GarageQual'] = dataset['GarageQual'].fillna(dataset['GarageQual'].mode()[0])

### GarageCond

In [None]:
null(dataset, 'GarageCond', True)

In [None]:
# Fill missing GarageCond with the most frequent value. `unique()[0]` is just
# the first row's value (and can be NaN); mode() ignores NaN. Assigned back because
# fillna(inplace=True) on a dataset[...] selection is not guaranteed to update `dataset`.
dataset['GarageCond'] = dataset['GarageCond'].fillna(dataset['GarageCond'].mode()[0])

### PoolQC

In [None]:
null(dataset, 'PoolQC', True)

#### Dropping...

In [None]:
# Remove the PoolQC column from the working frame.
dataset = dataset.drop(columns='PoolQC')

### Fence

In [None]:
null(dataset, 'Fence', True)

In [None]:
# Assign the filled column back: fillna(inplace=True) on a dataset[...]
# selection is not guaranteed to update `dataset` itself.
dataset['Fence'] = dataset['Fence'].fillna('None')

### MiscFeature

In [None]:
null(dataset, 'MiscFeature', True)

#### Dropping...

In [None]:
# Remove the MiscFeature column from the working frame.
dataset = dataset.drop(columns='MiscFeature')

In [None]:
dataset.head(2)

In [None]:
dataset.shape

In [None]:
# Columns present in the raw training frame but no longer in the working frame.
original_columns = set(train_data.columns)
dropped_col = list(original_columns.difference(dataset.columns))
dropped_col

## dropped columns are : ['Alley', 'MiscFeature', 'PoolQC']

In [None]:
plt.scatter(dataset['LotFrontage'], dataset['SalePrice'])

In [None]:
plt.scatter(dataset['MasVnrArea'], dataset['SalePrice'])

In [None]:
plt.scatter(dataset['GarageCond'], dataset['SalePrice'])

In [None]:
dataset['GarageCond'].value_counts()

#### Basically, we have two types of columns: 1. Numerical 2. Categorical.
#### putting all numerical columns into a list

In [None]:
# Every column whose dtype is not object is treated as numerical.
num_feat = [name for name, dtype in dataset.dtypes.items() if dtype != 'O']
len(num_feat)

In [None]:
# Select the non-numerical columns with a boolean column mask; indexing a
# DataFrame with a set is rejected by current pandas and has no defined order.
dataset.loc[:, ~dataset.columns.isin(num_feat)].head(2)

In [None]:
# dtypes of the non-numerical columns, selected with a boolean column mask;
# indexing a DataFrame with a set is rejected by current pandas and has no defined order.
dataset.loc[:, ~dataset.columns.isin(num_feat)].dtypes

In [None]:
# Convert GarageYrBlt to float first and then to int, in a single chained cast.
dataset['GarageYrBlt'] = dataset['GarageYrBlt'].astype(float).astype(int)

#### Time to work with categorical data

In [None]:
# Frame holding the columns that are not listed in num_feat, in their original
# column order. A boolean mask is used because indexing a DataFrame with a set
# is rejected by current pandas and yields an arbitrary column order.
cat_feat = dataset.loc[:, ~dataset.columns.isin(num_feat)]

In [None]:
dataset[num_feat].head(2)

In [None]:
# Numerical columns whose dtype is float.
column_dtypes = dataset.dtypes
float_feat = [name for name in num_feat if column_dtypes[name] == 'float']
float_feat

In [None]:
# Print each categorical column name followed by its distinct values.
for column in cat_feat:
    print(column, dataset[column].unique(), "------------------------------------", sep="\n")

## Encoding categorical data
##### Because machine learning algorithms work with numerical data only.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every column of cat_feat. A separate encoder is fitted per column
# and kept in `label_encoders`, so the value -> code mapping learned here stays
# available afterwards; re-fitting one shared encoder would keep only the last
# column's mapping.
label_encoders = {}
for column in cat_feat:
    label = LabelEncoder()
    dataset[column] = label.fit_transform(dataset[column])
    label_encoders[column] = label
# As before, `label` ends up as the encoder fitted on the last column.

In [None]:
dataset.head(2)

In [None]:
dataset.dtypes

In [None]:
dataset.to_csv('final_dataset',header=True, index=False)

In [None]:
final_dataset = pd.read_csv('./final_dataset')
final_dataset.head(2)

## Variable Separation

In [None]:
# Target column on one side, every other column as features on the other.
y_train = final_dataset['SalePrice']
X_train = final_dataset.drop(columns='SalePrice')

## Scaling the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a standard scaler on the training features and rebuild a frame with the
# original column labels from the scaled array.
columns = X_train.columns
scale = StandardScaler()
scaled = scale.fit(X_train)  # fit() returns the scaler itself
train_array = scaled.transform(X_train)
X_train_scaled = pd.DataFrame(train_array, columns=columns)

In [None]:
X_train_scaled.head(2)

## Working with Test Data

In [None]:
# Read the test set from the path defined next to train_path instead of
# repeating the literal, so the location is configured in one place.
test_data = pd.read_csv(test_path)
test_data.head(2)

In [None]:
# Remove from the test frame the same columns that were dropped from the training frame.
test_data = test_data.drop(columns=dropped_col)

In [None]:
# Label-encode, in the test frame, every column named in cat_feat.
# NOTE(review): the encoder is re-fitted on each test column, so a category's
# integer code only matches the code used for the training data when both
# columns hold exactly the same set of values — confirm, or reuse encoders
# fitted on the training columns.
# NOTE(review): this cell runs before the fillna cell further down, so missing
# values are still present when the columns are encoded — confirm the order.
label = LabelEncoder()
for i in cat_feat:
    test_data[i] = label.fit_transform(test_data[i])

In [None]:
test_data.head()

In [None]:
# Impute missing values in the test frame. Every result is assigned back to the
# column: fillna(inplace=True) on a test_data[...] selection acts on an
# intermediate object and is not guaranteed to update `test_data`.

# Numerical columns: mean of the column.
for col in ('LotFrontage', 'MasVnrArea'):
    test_data[col] = test_data[col].fillna(test_data[col].mean())

# Columns filled with a fixed value. GarageYrBlt gets a numeric year: the string
# '2005.0' would mix str and float values in one column.
constant_fills = {
    'MasVnrType': 'None',
    'BsmtQual': 'TA',
    'BsmtCond': 'TA',
    'BsmtExposure': 'No',
    'BsmtFinType1': 'Unf',
    'BsmtFinType2': 'Unf',
    'Electrical': 'SBrkr',
    'FireplaceQu': 'None',
    'GarageYrBlt': 2005.0,
    'Fence': 'None',
}
for col, fill_value in constant_fills.items():
    test_data[col] = test_data[col].fillna(fill_value)

# Garage descriptors: most frequent value of the column. `unique()[k]` depends on
# row order and can itself be NaN; mode() ignores NaN.
val = test_data['GarageType'].mode()[0]
test_data['GarageType'] = test_data['GarageType'].fillna(val)
for col in ('GarageFinish', 'GarageQual', 'GarageCond'):
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0])

In [None]:
test_data.isnull().sum()

In [None]:
# Names of the test columns that still contain missing values.
still_missing = test_data.isnull().any()
test_null = test_data.columns[still_missing].tolist()
test_null

In [None]:
test_data[test_null].dtypes

In [None]:
# Fill whatever is still missing with the column mean. The result is assigned
# back because fillna(inplace=True) on a test_data[...] selection is not
# guaranteed to update `test_data`.
for column in test_null:
    test_data[column] = test_data[column].fillna(test_data[column].mean())

In [None]:
test_data.isnull().sum()

In [None]:
test_data.shape

## Model Fitting

In [None]:
# Transform the test features with the scaler that was fitted on X_train.
# Re-fitting it on the test frame would scale the test features with different
# means/variances than the ones the model is trained on (and bound X_test to the
# scaler object rather than to data).
X_test = test_data[columns]  # same columns, same order as the training features
X_test_scaled = pd.DataFrame(scale.transform(X_test), columns=columns)

In [None]:
X_test_scaled.head(2)

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost regressor with 35 estimators and a fixed seed, fitted on the scaled
# training features; the last line shows its score on that same training data.
lin_reg = AdaBoostRegressor(n_estimators=35, random_state=24)
model = lin_reg.fit(X=X_train_scaled, y=y_train)  # fit() returns the estimator
model.score(X_train_scaled, y_train)

## Submission

In [None]:
# Predict on the scaled test features and write a two-column submission file that
# reuses the column names (and the Id values) of the sample submission.
sample = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
y_pred = model.predict(X_test_scaled)
id_column = sample.columns[0]
target_column = sample.columns[1]
result = pd.DataFrame({id_column: sample['Id'], target_column: y_pred})
result.to_csv('submission.csv', index=False)