# **Load Libraries And Data**

In [None]:
'''import my libraries '''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Read the three competition files in one pass: the training set, the test
# set and the sample submission (in that order).
train_df, test_df, Sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
        '../input/house-prices-advanced-regression-techniques/sample_submission.csv',
    )
)

**Combine the train and test data into a single DataFrame for easier analysis and cleaning**

In [None]:
# Attach the sample-submission SalePrice column to the test rows so train and
# test share the same columns, then stack them into one frame.
# `errors='ignore'` and the pre-drop of 'SalePrice' make this cell safe to
# re-run: without them a second execution raises KeyError on 'Id' and appends
# a duplicate 'SalePrice' column to test_df.
# NOTE(review): the sample-submission prices are presumably placeholders, not
# real labels -- confirm; they flow into the statistics computed on `df` below.
Sub = Sub.drop(columns=['Id'], errors='ignore')
test_df = pd.concat(
    [test_df.drop(columns=['SalePrice'], errors='ignore'), Sub[['SalePrice']]],
    axis=1,
)
df = pd.concat([train_df, test_df], axis=0)
df.head()

In [None]:
# Summary statistics for the combined train + test frame.
df.describe()

**missing data**

In [None]:
# Number of missing values in each column.
df.isna().sum()

**drop some useless data**

In [None]:
# Columns the analysis treats as uninformative and removes up front.
lis = [
    'MiscFeature',
    'Fence',
    'PoolQC',
    'Alley',
    'BsmtFinSF2',
    '3SsnPorch',
    'MiscVal',
    'LowQualFinSF',
    'BsmtHalfBath',
]
df = df.drop(columns=lis)
df.head()

# **EDA**

In [None]:
# One histogram per numeric column; axes are independent and tick labels are
# shrunk to size 1 so the grid of small panels stays readable.
df.hist(
    figsize=(12, 12),
    sharex=False,
    sharey=False,
    xlabelsize=1,
    ylabelsize=1,
)

In [None]:
# Summary statistics of the SalePrice column.
df['SalePrice'].describe()

In [None]:
# Distribution of SalePrice (histogram + KDE on a density scale).
# `sns.distplot` is deprecated and scheduled for removal; this is the
# replacement recommended by seaborn for the same picture.
sns.histplot(df['SalePrice'], kde=True, stat='density')

**matrix of correlation coefficients between the features**

In [None]:
# Restrict to numeric columns explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on string data.
numeric_df = df.select_dtypes(include=[np.number])

# Full numeric correlation matrix (still contains the 'Id' row identifier).
corr_matrix = numeric_df.corr()
# Same matrix without 'Id', which carries no information about the house.
corr_mat = numeric_df.drop('Id', axis=1).corr()

f, ax = plt.subplots(figsize=(12, 10))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Plot the Id-free matrix on the axes created above.
sns.heatmap(corr_mat, annot=None, cmap=cmap, ax=ax)

# **Feature Engineering**

In [None]:
# Absolute correlation of every numeric column with SalePrice.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# string columns in pandas >= 2.0.
df.select_dtypes(include=[np.number]).corr()['SalePrice'].abs()

**The 5 highest correlation coefficients with the target (`SalePrice`)**

In [None]:
# Pick the 5 features most correlated with SalePrice. SalePrice itself is
# removed first: its self-correlation is 1, so it would otherwise take the top
# slot and produce a scatter of the target against itself.
C = corr_matrix['SalePrice'].drop('SalePrice').nlargest(5).index

# One scatter plot per selected feature against the target.
for var in C:
    df.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))

**drop the smallest correlation coefficients**

In [None]:
# Drop the 15 features with the weakest linear relationship to SalePrice.
# Ranking uses the absolute correlation: ranking the signed values would pick
# the most negatively correlated features and keep those with a correlation
# near zero.
N = corr_mat['SalePrice'].abs().nsmallest(15).index

# Single drop instead of one per column; errors='ignore' keeps the cell
# re-runnable after the columns are already gone.
df = df.drop(columns=N, errors='ignore')

# **Clean Data**

In [None]:
# Set the target aside so the cleaning steps only touch the feature columns.
SalePrice = df['SalePrice']
cleaning = df.drop(columns=['SalePrice'])

**Select the numeric columns and fill their missing values with the column mean**

In [None]:
# dtypes treated as numeric features.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]

# Keep only the numeric columns and replace each missing value with the mean
# of its column.
numeric_cols = (
    cleaning
    .select_dtypes(include=numerics)
    .pipe(lambda frame: frame.fillna(frame.mean()))
)
numeric_cols.head()

**Select the categorical (object) columns, fill their missing values with `'none'`, and one-hot encode them**

In [None]:
# dtypes treated as categorical features.
categorical = ['object']

# Missing categories become the explicit level 'none', then every column is
# expanded into one-hot indicator columns.
categorical_cols = pd.get_dummies(
    cleaning.select_dtypes(include=categorical).fillna('none')
)
categorical_cols.head()

**concatenate them again**

In [None]:
# Put the imputed numeric columns and the one-hot columns side by side ...
feature_parts = [numeric_cols, categorical_cols]
cleaned = pd.concat(feature_parts, axis='columns')

# ... and re-attach the target as the final column.
df = pd.concat([cleaned, SalePrice], axis='columns')

**Split into features (X) and target (y), then scale**

In [None]:
# The first len(train_df) rows of df come from train.csv and the rest from
# test.csv (they were stacked in that order), so derive the boundary from the
# data instead of hard-coding 1460.
n_train = len(train_df)

# Features = everything except the target and the 'Id' row identifier.
# 'Id' is not a property of the house and, because Normalizer rescales each
# row by its own norm, a large Id value would distort every other feature.
features = df.drop(columns=['SalePrice']).drop(columns=['Id'], errors='ignore')

X = features.iloc[:n_train]
y = df['SalePrice'].iloc[:n_train]
tst_df = features.iloc[n_train:]

# NOTE(review): Normalizer scales each sample (row) to unit norm, not each
# feature column -- confirm this is intended rather than a per-feature scaler.
scl = Normalizer()
X = scl.fit_transform(X)
# Reuse the fitted transformer for the test rows instead of refitting on them.
tst_df = scl.transform(tst_df)

# **Machine Learning Model**

**Split X and y into training and hold-out sets**

In [None]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

**Linear Regression model**

In [None]:
# LN = LinearRegression()
# LN.fit(X_train,y_train)
# y_pred = LN.predict(X_test)
# LN.score(X_train, y_train)

**SGDRegressor model**

In [None]:
# SGD =SGDRegressor()
# SGD.fit(X_train,y_train)
# y_pred = SGD.predict(X_test)
# SGD.score(X_train, y_train)

**Support Vector Regression model**

In [None]:
# svr = SVR(gamma='scale', C=0.00000001, epsilon=0.2)
# svr.fit(X_train,y_train)
# y_pred = svr.predict(X_test)
# svr.score(X_train, y_train)

**Gradient Boosting Regressor ensemble model**

In [None]:
# Gradient-boosted trees; random_state is fixed so repeated runs give the same
# model and the same scores.
g = GradientBoostingRegressor(
    n_estimators=170,
    learning_rate=0.4,
    max_depth=2,
    random_state=4,
)
# fit() returns the estimator itself; `train` is kept as an alias because the
# submission cell predicts through it.
train = g.fit(X_train, y_train)

# R^2 on the data the model was fitted on (optimistic by construction).
score = g.score(X_train, y_train)
percentage = "{:.0%}".format(score)

# Hold-out predictions and R^2 on rows the model has not seen.
y_pred = g.predict(X_test)
test_score = metrics.r2_score(y_test, y_pred)

#print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
#print('MSE:', metrics.mean_squared_error(y_test, y_pred))
#print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('VarScore:', metrics.explained_variance_score(y_test, y_pred))
# Label the figures for what they are: R^2, not a classification accuracy.
print('Train R2:', percentage)
print('Test R2:', "{:.0%}".format(test_score))

# **visualization score**

In [None]:
# Figure 1: actual and predicted price for each hold-out sample, in the order
# the samples appear in the hold-out set.
fig_lines, ax = plt.subplots(figsize=(30, 10))
sample_positions = range(len(y_test))
ax.plot(sample_positions, y_test, '-b', label='Actual')
ax.plot(sample_positions, y_pred, 'r', label='Predicted')
ax.set(xlabel='Hold-out sample', ylabel='SalePrice',
       title='Actual vs. predicted SalePrice')
# The labels above are only shown once a legend is drawn.
ax.legend()

# Figure 2: predicted against actual price; points on the red diagonal are
# perfect predictions.
fig, ax_scatter = plt.subplots(figsize=(10, 5))
ax_scatter.scatter(y_test, y_pred)
ax_scatter.plot(y_test, y_test, 'r', label='Perfect prediction')
ax_scatter.set(xlabel='Actual SalePrice', ylabel='Predicted SalePrice')
ax_scatter.legend()
plt.show()

# **Submission**

In [None]:
# Predict prices for the scaled test rows with the fitted model.
y_predw = train.predict(tst_df)

# Two-column frame in submission layout: the test Ids next to the predictions.
Submission = test_df[['Id']].assign(SalePrice=y_predw)
Submission.to_csv("Submission.csv", index=False)

In [None]:
# Display the (rows, columns) shape of the submission frame as a quick check.
Submission.shape