# Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn as sk
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import plotly.express as px

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation notices - confirm that is intended.
warnings.filterwarnings("ignore")


# Steps
# <a href = "#1"> 1- Read Data </a>
# <a href = "#2"> 2- Data Exploring and Investigation </a>
# <a href = "#3"> 3- Feature Selection </a>
# <a href = "#4"> 4- Cleaning Training Data </a>  
# <a href = "#5"> 5- Cleaning Test Data </a>
# <a href = "#6"> 6- Encode Training and Test Data </a>  
# <a href = "#7"> 7- Split Data </a>
# <a href = "#8"> 8- Apply Models </a>
# <a href = "#9"> 9- Linear Regression </a>
# <a href = "#10"> 10- Random Forest </a>
# <a href = "#11"> 11- GradientBoosting </a>
# <a href = "#12"> 12- Select Parameters </a>

# <a name = "1">1- Read Data<a>

## Note: I keep the label in data_with_label to investigate my data.

In [None]:
# Kaggle input directory that holds the competition CSV files.
DATA_DIR = '../input/house-prices-advanced-regression-techniques'

# Training rows are indexed by 'Id'; rows without a target are discarded.
train_data = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='Id')
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)

# The test file keeps 'Id' as a regular column so it can be reused for the submission.
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')
Test_data_Id = test_data["Id"]

# Independent copy of the labelled training data, kept for exploration.
# Copying the frame avoids parsing train.csv a second time.
data_with_label = train_data.copy()

# <a name = "2">2- Data Exploring and Investigation<a>

### Just checking my data

In [None]:
# Display the first rows of the labelled training data.
data_with_label.head()

### Check nulls and proper data types

In [None]:
# Print each column's dtype and non-null count.
data_with_label.info()

### Check most correlated features with the target

In [None]:
# Correlation of every numeric column with the target, strongest first.
# Non-numeric columns are dropped explicitly because DataFrame.corr() raises on
# string columns in pandas >= 2.0 instead of silently skipping them.
top_corr_features = (
    data_with_label.select_dtypes(np.number)
    .corr()['SalePrice']
    .sort_values(ascending=False)
)
top_corr_features

### Check the skewness in the data

In [None]:
# Skewness of the numeric columns only: DataFrame.skew() fails on string
# columns in pandas >= 2.0, so they are filtered out first.
data_with_label.select_dtypes(np.number).skew()

### Check the histograms: seeing the skew and the nulls visually is better.

In [None]:
# Draw the histograms on a 20x20 figure; the trailing [0] keeps only the first
# element of the returned axes array as the cell's value.
data_with_label.hist(figsize = (20 ,20))[0]

### Check the correlation between each feature and the others

In [None]:
# Pairwise correlations between the numeric training columns, drawn as an
# annotated 1000x1000 heatmap.
Correlation_Matrix = train_data.select_dtypes(np.number).corr()
fig = px.imshow(
    Correlation_Matrix,
    text_auto=True,
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.update_layout(height=1000, width=1000)
fig.show()

# <a name = "3"> 3- Feature Selection <a>

## Decide which features are important based on the exploration above.

In [None]:
# Columns that are one-hot encoded later on.
categorical_feature = [
    "Utilities",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "BldgType",
    "HouseStyle",
    "Exterior1st",
    "Exterior2nd",
    "SaleType",
    "SaleCondition",
]

# Columns that are integer-encoded later on.
ordinal_feature = [
    "BsmtQual",
    "Foundation",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "HeatingQC",
    "CentralAir",
    "Electrical",
    "KitchenQual",
]

# Columns that are used as numbers without encoding.
numerical_feature = [
    "LotFrontage",
    "LotArea",
    "BsmtFinSF1",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "TotRmsAbvGrd",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "OverallQual",
    "OverallCond",
    "YearRemodAdd",
    "BsmtFinSF2",
    "TotalBsmtSF",
    "GarageCars",
]

# The test set has every selected feature; the training set also carries the target.
test_features = [*categorical_feature, *ordinal_feature, *numerical_feature]
training_features = [*test_features, "SalePrice"]

# <a name = "4"> 4- Cleaning Training Data <a> 

In [None]:
# Keep only the selected features plus the target, as an independent copy.
train_data = train_data.loc[:, training_features].copy()
train_data.info()

## Fill missing values with the mean if numeric and the mode if categorical

In [None]:
# Basement quality columns are filled with their most frequent value,
# LotFrontage with its mean.
train_filling_null = {
    column: train_data[column].mode().iloc[0]
    for column in ("BsmtQual", "BsmtExposure", "BsmtFinType1", "BsmtFinType2")
}
train_filling_null["LotFrontage"] = train_data["LotFrontage"].mean()

train_data = train_data.fillna(value=train_filling_null).copy()
# Rows that still hold a NaN in any other column are discarded.
train_data = train_data.dropna(axis=0)
train_data.head()
train_data.info()

# <a name = "5"> 5- Cleaning Test Data </a> 

In [None]:
# Keep only the selected features; the test file has no target column.
test_data = test_data.loc[:, test_features]
test_data.info()

In [None]:
# Most frequent value for the categorical gaps, column mean for the numeric
# gaps, both computed from the test set itself.
test_filling_null = {
    column: test_data[column].mode().iloc[0]
    for column in (
        "BsmtQual",
        "BsmtExposure",
        "BsmtFinType1",
        "BsmtFinType2",
        "Utilities",
        "Exterior1st",
        "SaleType",
        "Exterior2nd",
        "KitchenQual",
    )
}
test_filling_null.update(
    (column, test_data[column].mean())
    for column in ("LotFrontage", "BsmtFinSF1", "BsmtFinSF2", "TotalBsmtSF", "GarageCars")
)

test_data = test_data.fillna(value=test_filling_null)
# Unlike the training set, no rows are dropped here, so the row count stays
# aligned with Test_data_Id.
test_data.head()
test_data.info()

# <a name = "6"> 6- Encode Training and Test Data </a>  

# One Hot Encoding with Training Data 

In [None]:
# One-hot encode the nominal features of the training set.
# NOTE(review): `sparse=` and `get_feature_names` were removed in newer
# scikit-learn releases (`sparse_output=` / `get_feature_names_out`) - confirm
# the installed version before upgrading.
Onehot_Encoding = OneHotEncoder(sparse=False)

# fit_transform returns a plain array, so the frame built from it must reuse
# train_data's 'Id' index. With a default 0..n-1 index the column assignment
# below aligns on labels, pairing each house with another row's encoding and
# leaving NaNs in the rows that have no matching label.
features_Onehot_encoded = pd.DataFrame(
    Onehot_Encoding.fit_transform(train_data[categorical_feature]),
    index=train_data.index,
)
features_Onehot_encoded.columns = Onehot_Encoding.get_feature_names(categorical_feature)
train_data[features_Onehot_encoded.columns] = features_Onehot_encoded

# The raw categorical columns are replaced by their indicator columns.
train_data = train_data.drop(columns=categorical_feature)
train_data = train_data.dropna()
train_data.head()

# Ordinal Encoding with Training Data 

In [None]:

# Integer-encode the ordinal features of the training set.
ordinal_encoder = OrdinalEncoder()

# The encoder returns a plain array; give the resulting frame train_data's
# index so the assignment below stays row-aligned. A default 0..n-1 index would
# be matched against the 'Id' labels and hand rows another house's codes.
ord_encoded_feature = pd.DataFrame(
    ordinal_encoder.fit_transform(train_data[ordinal_feature]),
    index=train_data.index,
)

# Encoded columns are added under their positional names (0, 1, ...) and the
# original string-valued columns are removed.
train_data[ord_encoded_feature.columns] = ord_encoded_feature
train_data = train_data.drop(columns=ordinal_feature)
train_data.head()

## Check data after encoding

In [None]:
# Row and column counts of the training frame at this point.
train_data.shape

# One Hot Encoding test Data

In [None]:
# Encode the test set with the encoder fitted on the training data so both
# frames share the same indicator columns. The index is passed explicitly so
# the assignment stays row-aligned even if test_data's index is not 0..n-1.
t = pd.DataFrame(
    Onehot_Encoding.transform(test_data[categorical_feature]),
    index=test_data.index,
    columns=Onehot_Encoding.get_feature_names(categorical_feature),
)
test_data[t.columns] = t
# The raw categorical columns are replaced by their indicator columns.
test_data = test_data.drop(columns=categorical_feature)


In [None]:
# Reuse the encoder fitted on the training data. Calling fit_transform here
# would relearn the category-to-integer mapping from the test set, so the same
# integer could stand for different categories in train and test.
# transform() raises if the test set holds a category the encoder never saw.
ord_feature = pd.DataFrame(
    ordinal_encoder.transform(test_data[ordinal_feature]),
    index=test_data.index,
)

# Encoded columns are added under their positional names (0, 1, ...) and the
# original string-valued columns are removed.
test_data[ord_feature.columns] = ord_feature
test_data = test_data.drop(columns=ordinal_feature)
test_data.info()

In [None]:
# Row and column counts of the test frame at this point.
test_data.shape

# Split SalePrice As label

In [None]:
# Move the target out of the feature frame: pop returns the column and removes
# it from train_data in place.
train_labels = train_data.pop('SalePrice')

# <a name = "7"> 7- Split Data <a>

## Split into training and validation sets to select suitable parameters

In [None]:
def split_data(train_features, train_labels, split_index=1100):
    """Split features and labels positionally into training and validation parts.

    Rows before ``split_index`` form the training part and the remaining rows
    form the validation part. Rows are taken in their existing order; nothing
    is shuffled.

    Args:
        train_features: Feature table that supports ``.iloc`` slicing.
        train_labels: Labels that support ``.iloc`` slicing, in the same row
            order as ``train_features``.
        split_index: Number of leading rows kept for training. Defaults to
            1100, the value that used to be hard-coded.

    Returns:
        The tuple ``(validation_set, validation_labels, train_features,
        train_labels)``.
    """
    validation_set = train_features.iloc[split_index:]
    validation_labels = train_labels.iloc[split_index:]

    train_features = train_features.iloc[:split_index]
    train_labels = train_labels.iloc[:split_index]
    return validation_set, validation_labels, train_features, train_labels

## Fix train and test NaNs after the encoding

In [None]:
# Replace any NaN still present in either frame with 0.
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

# 8- Apply Models<a name = "8">.<a>

# <a name = "9"> 9- Linear Regression <a>

In [None]:

# Fit a linear regression on the whole training set; the displayed value is
# the score on that same training data.
reg = LinearRegression()
reg.fit(train_data, train_labels)
reg.score(train_data, train_labels)


In [None]:
# Predict the test set and pair every prediction with its house Id.
# Predictions come back in test_data's row order, which is also the order of
# Test_data_Id, so no hard-coded row count is needed to line them up; a length
# mismatch now raises instead of being silently truncated.
LR_predection = reg.predict(test_data)
LiearReg_predection = pd.DataFrame({
    'Id': Test_data_Id,
    'SalePrice': LR_predection,
})
LiearReg_predection.shape

# <a name = "10"> 10- Random Forest <a>

In [None]:

# Random forest with trees capped at depth 19; the displayed value is the
# score on the training data itself.
reg = RandomForestRegressor(max_depth=19, random_state=0).fit(train_data, train_labels)
reg.score(train_data, train_labels)

In [None]:
# Predict the test set and pair every prediction with its house Id.
# Predictions come back in test_data's row order, which is also the order of
# Test_data_Id, so no hard-coded row count is needed to line them up; a length
# mismatch now raises instead of being silently truncated.
RF_predection = reg.predict(test_data)
RandomForest_predection = pd.DataFrame({
    'Id': Test_data_Id,
    'SalePrice': RF_predection,
})
RandomForest_predection.shape

# <a name = "11"> 11- GradientBoosting <a>

In [None]:
# Gradient boosting with 13 depth-2 trees and a learning rate of 0.0004; the
# displayed value is the score on the training data itself.
reg = GradientBoostingRegressor(
    n_estimators=13,
    learning_rate=0.0004,
    max_depth=2,
    random_state=0,
)
reg.fit(train_data, train_labels)
reg.score(train_data, train_labels)

In [None]:
# Predict the test set and pair every prediction with its house Id.
# Predictions come back in test_data's row order, which is also the order of
# Test_data_Id, so no hard-coded row count is needed to line them up; a length
# mismatch now raises instead of being silently truncated.
GB_predection = reg.predict(test_data)
GradientBoost_predection = pd.DataFrame({
    'Id': Test_data_Id,
    'SalePrice': GB_predection,
})
GradientBoost_predection.shape

# <a name = "12"> 12- Select Parameters </a>

# Graph of validation scores vs max_depth values 1-49

In [None]:
# Sweep max_depth from 1 to 49 and record each forest's score on the
# validation rows.
validation_set, validation_labels, train_set, label_train_set = split_data(train_data, train_labels)
# NOTE(review): this list is never read; the sweep covers every depth in 1..49 instead.
list_of_paramters = [2 , 10 , 5 , 3 , 15 , 30 , 9 , 4 , 12 , 35]
depths = range(1, 50)
listof_scores = []
for depth in depths:
    regrr = RandomForestRegressor(max_depth=depth, random_state=0)
    regrr.fit(train_set, label_train_set)
    listof_scores.append(regrr.score(validation_set, validation_labels))

# Plot against the actual depth values (not the 0-based list position), with
# the parameter on the x axis and the score on the y axis.
plt.plot(depths, listof_scores)
plt.xlabel("parameter")
plt.ylabel("Score")
# show must be called; the bare attribute `plt.show` does nothing.
plt.show()

In [None]:
# Print every recorded score, then the position of the best one.
# list.index is 0-based; when this runs right after the max_depth sweep (which
# starts at 1), the best depth is the printed index + 1.
print(listof_scores)
print(listof_scores.index(max(listof_scores)))

# Graph of scores vs 10 alphas

In [None]:
# Try ten learning rates for gradient boosting and record each model's score
# on the validation rows. listof_scores keeps the order of list_of_alpha.
validation_set, validation_labels, train_set, label_train_set = split_data(train_data, train_labels)
list_of_alpha = [0.1 , 0.5 , 0.8 , 0.55 , 0.001 , 1.0 , 0.009 , 0.0004 , 0.2 , 0.6]
listof_scores = []
for alpha in list_of_alpha:
    regrr = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=alpha,
        max_depth=2,
        random_state=0,
    )
    regrr.fit(train_set, label_train_set)
    listof_scores.append(regrr.score(validation_set, validation_labels))

# The candidates are not listed in increasing order, so sort the (rate, score)
# pairs for plotting and put the real learning rates on the x axis.
sorted_alphas, sorted_scores = zip(*sorted(zip(list_of_alpha, listof_scores)))
plt.plot(sorted_alphas, sorted_scores, marker="o")
plt.xlabel("parameter")
plt.ylabel("Score")
# show must be called; the bare attribute `plt.show` does nothing.
plt.show()

import warnings

def fxn():
    """Emit a ``DeprecationWarning`` whose message is ``"deprecated"``."""
    warnings.warn("deprecated", category=DeprecationWarning)

# Call fxn() inside a temporary filter context so its warning is suppressed;
# the previous warning filters are restored when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

# Save Predictions

In [None]:
# Write the random-forest predictions (Id, SalePrice) as the submission file.
# index=False is the documented way to leave out the row index; the previous
# index=None only worked because None is falsy.
RandomForest_predection.to_csv("predections.csv", index=False, sep=",")