In [None]:
#!unzip '/content/kc_house_data.csv.zip'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from scipy.stats import shapiro,kstest
from scipy.stats import skew
from scipy.stats import probplot
from scipy.stats import boxcox_normmax
from scipy.special import boxcox1p

In [None]:
# Read the King County house-sales CSV and preview its first five rows.
data = pd.read_csv(
    filepath_or_buffer='../input/housesalesprediction/kc_house_data.csv'
)
data.head(n=5)

In [None]:
# Column dtypes and non-null counts; 'deep' asks pandas to measure the real
# memory footprint of object columns instead of estimating it.
data.info(memory_usage='deep')

# Research and Preprocessing

In [None]:
# Number of distinct sale dates, then the raw 'date' column for inspection.
print(data.date.nunique())
data['date']

In [None]:
# Replace the raw 'date' values with integer codes: LabelEncoder assigns one
# code per distinct value, numbered by the sorted order of those values.
label = LabelEncoder()
data['date'] = label.fit_transform(data['date'])
# Sanity check: dtype of the encoded column and the largest code assigned.
print(data['date'].dtype)
print(data['date'].max())

In [None]:
# Drop the row identifier and replace longitude by its absolute value.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# the cell idempotent: re-running it no longer raises KeyError once 'id' has
# already been removed.
data = data.drop(columns='id', errors='ignore')
data['long'] = data['long'].abs()
data.head()

In [None]:
# One line per column: "<column name> <number of distinct values>".
for col, n_unique in data.nunique().items():
  print(col + " " + str(n_unique))

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
corr = data.corr()
plt.figure(figsize=(17, 15))
sns.heatmap(data=corr, annot=True, ax=plt.gca())

In [None]:
# One histogram per column, stacked vertically in a single figure.
_, ax = plt.subplots(nrows=len(data.columns), ncols=1, figsize=(16, 16))

# top=6 stretches the subplot area far beyond the nominal figure height so
# that every histogram stays readable.
plt.subplots_adjust(top=6, hspace=0.5)
for i, col in enumerate(data.columns):
  axis = ax[i]
  sns.histplot(data[col], ax=axis)
  axis.set_xlabel(col, fontsize=20)

In [None]:
def testing_on_normal_distrubution(df):
  for col in df.columns:
    print('P-value Shapiro Test for column {} : {}'.format(col,shapiro(df[col])[1]))
    print('P-value Test Kolmogorova-Smirnova for column {} : {}\n'.format(col,kstest(df[col],'norm')[1]))

# Run both normality tests on every column of the full dataset.
testing_on_normal_distrubution(data)

In [None]:
def determine_skewness(df):
  """Print the sample skewness (scipy.stats.skew) of every column of ``df``."""
  template = 'Skewness = {} for column {}\n'
  for name in df.columns:
    value = skew(df[name])
    print(template.format(value, name))
# Report the skewness of every column of the full dataset.
determine_skewness(data)

In [None]:
sns.set_style('darkgrid')
def plotting_4_chart(df,col):
  """Draw diagnostic plots for one feature of ``df``.

  Figure 1 contains four panels for ``col``: a histogram with mean/median
  markers, a normal probability (Q-Q) plot, a residual plot of 'price'
  against ``col`` and a box plot. Figure 2 is a histogram of 'price'.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame that contains ``col`` and a 'price' column.
  col : str
      Name of the feature to inspect.
  """
  fig1 = plt.figure(figsize=(17,17))
  grid = GridSpec(nrows=4,ncols=3,figure=fig1)

  fig1.subplots_adjust(hspace=0.5)

  # Histogram with mean and median markers.
  fig1_ax1 = fig1.add_subplot(grid[0,:2])
  sns.histplot(df[col],ax=fig1_ax1)
  fig1_ax1.axvline(df[col].mean(),color='red',label='Mean')
  fig1_ax1.axvline(df[col].median(),color='orange',label='Median')
  fig1_ax1.legend()

  # Q-Q plot. Uses the frame passed in; the previous version read the global
  # `data`, silently ignoring the `df` argument for this panel only.
  fig1_ax2 = fig1.add_subplot(grid[1,:2])
  probplot(df[col],plot=fig1_ax2)

  # Residuals of a simple regression of price on the feature.
  fig1_ax3 = fig1.add_subplot(grid[2:,:2])
  sns.residplot(x=col,y='price',data=df,ax=fig1_ax3)

  # Box plot occupying the full right-hand column.
  fig1_ax4 = fig1.add_subplot(grid[:,2])
  sns.boxplot(y=col,data=df,ax=fig1_ax4)

  # Separate figure: distribution of the target.
  fig2 = plt.figure(figsize=(15,15))
  grid2 = GridSpec(nrows=1,ncols=1,figure=fig2)

  fig2_ax = fig2.add_subplot(grid2[0,0])
  sns.histplot(df['price'],ax=fig2_ax)

plotting_4_chart(data,'sqft_living15')

# Helper functions for feature selection

In [None]:
# Distinct-value count per column, displayed as the cell output.
data.nunique()

In [None]:
from scipy.stats import f_oneway

# Prices split by the five 'condition' levels (1..5), one Series per level.
cut1, cut2, cut3, cut4, cut5 = (
    data.loc[data['condition'] == level, 'price'] for level in range(1, 6)
)

# One-way ANOVA: do the mean prices differ between condition levels?
f_oneway(cut1, cut2, cut3, cut4, cut5)

In [None]:
def anova_one_way(col,df=data):
  """Return the one-way ANOVA p-value of 'price' grouped by ``col``.

  Parameters
  ----------
  col : str
      Column whose distinct values define the groups.
  df : pandas.DataFrame
      Frame containing ``col`` and 'price'. The default is bound to the
      ``data`` object that exists when this cell is executed.

  Returns
  -------
  float
      p-value reported by ``scipy.stats.f_oneway``.
  """
  from scipy.stats import f_oneway

  # The distinct levels are computed once (the previous loop called
  # df[col].unique() on every iteration and shadowed the builtin `len`).
  groups = [df.loc[df[col]==level,'price'] for level in df[col].unique()]

  return f_oneway(*groups)[1]

anova_one_way('condition')

In [None]:
def selected_anova(cols,df=data):
  """Print the one-way ANOVA p-value (price vs. column) for each name in ``cols``.

  ``cols`` must be a plain ``list``; ``df`` is forwarded to ``anova_one_way``.
  """
  assert type(cols) is list
  message = 'P_value for column "{}" equal {}'
  for name in cols:
    print(message.format(name, anova_one_way(col=name, df=df)))

selected_anova(
    cols=['condition','floors','waterfront','view','bedrooms','bathrooms','grade','lat'],
    df=data,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# Show the class objects, then the class name of a fresh forest instance.
for estimator_cls in (XGBClassifier, RandomForestClassifier):
  print(estimator_cls)
rf = RandomForestClassifier()
print(type(rf).__name__)

In [None]:
def selected_with_model(model,type_model='ensemble',data=data):
  """Fit ``model`` on all features and rank them by importance.

  Parameters
  ----------
  model : estimator
      Unfitted estimator. It is fitted in place on every column of ``data``
      except 'price', with 'price' as the target.
  type_model : {'ensemble', 'linear'}
      'ensemble' reads ``model.feature_importances_``; 'linear' uses the
      absolute value of ``model.coef_``.
  data : pandas.DataFrame
      Frame containing the features and a 'price' column. The default is
      bound to the global ``data`` that exists when this cell is executed.

  Returns
  -------
  pandas.DataFrame
      Columns 'Cols' and 'Importance', sorted by descending importance.
      The same frame is also drawn as a bar plot.

  Raises
  ------
  ValueError
      If ``type_model`` is not one of the supported values. Previously this
      case fitted the model and then failed with UnboundLocalError.
  """
  if type_model not in ('ensemble','linear'):
    raise ValueError("type_model must be 'ensemble' or 'linear', got {!r}".format(type_model))

  features = data.drop('price',axis=1)
  model.fit(features,data['price'])

  if type_model=='ensemble':
    importance = model.feature_importances_
  else:
    importance = np.abs(model.coef_)

  dataframe = pd.DataFrame(
      {'Cols':list(features.columns),'Importance':importance}
  ).sort_values(ascending=False,by='Importance',ignore_index=True)

  plt.figure(figsize=(20,10))
  sns.barplot(x='Cols',y='Importance',data=dataframe)

  return dataframe

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Rank features by the absolute size of ordinary-least-squares coefficients.
lr = LinearRegression()
selected_with_model(model=lr, type_model='linear')

In [None]:
# Fixed seed so the importance ranking is reproducible across
# Restart & Run All (an unseeded forest gives a different ranking each run).
rf = RandomForestRegressor(random_state=25)
selected_with_model(model=rf,type_model='ensemble')

In [None]:
from lightgbm import LGBMRegressor
# Feature ranking from a default LightGBM regressor.
lgb = LGBMRegressor()
selected_with_model(type_model='ensemble', model=lgb)

# Modeling

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='price'),
    data['price'],
    test_size=0.33,
    shuffle=True,
    random_state=25,
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error

lr = LinearRegression()

def rmse(prediction,y_true=None):
  """Root-mean-squared error of ``prediction``.

  ``y_true`` defaults to the notebook-level ``y_test`` so existing
  single-argument calls keep working.
  """
  if y_true is None:
    y_true = y_test
  return np.sqrt(mean_squared_error(y_true,prediction))

def progressive(model,X,y,X_test,flag = False,y_true=None):
  """Fit ``model``, predict on ``X_test`` and print timing, MAE and RMSE.

  Parameters
  ----------
  model : estimator
      Object exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
  X, y : array-like
      Training features and target.
  X_test : array-like
      Features to predict on.
  flag : bool
      When True the predictions are passed through ``np.expm1`` before
      scoring (for models trained on a ``log1p``-transformed target).
  y_true : array-like, optional
      Ground truth for ``X_test``. Defaults to the notebook-level ``y_test``.

  Returns
  -------
  None
      The metrics are printed only.
  """
  import time

  if y_true is None:
    y_true = y_test

  start = time.perf_counter()
  model.fit(X,y)
  pred = model.predict(X_test)
  if flag:
    pred = np.expm1(pred)
  print('Time: ' + str(time.perf_counter()-start))
  print('MAE: '+ str(mean_absolute_error(y_true,pred)))
  # The value is a root-mean-squared error; it used to be labelled "MSE".
  print('RMSE: ' + str(rmse(pred,y_true)))

progressive(lr,X_train,y_train,X_test=X_test)

In [None]:
# Linear regression on the log1p-transformed price; flag=True maps the
# predictions back with expm1 before scoring.
lr1 = LinearRegression()
y_train_log = np.log1p(y_train)

progressive(model=lr1, X=X_train, y=y_train_log, X_test=X_test, flag=True)

In [None]:
# Display the 'long' column of the training features.
X_train['long']

In [None]:
def deter_num_cols(df=X_train,min_unique=100,exclude=('long',)):
  """Return the names of the columns treated as continuous.

  A column qualifies when it has more than ``min_unique`` distinct values
  and is not listed in ``exclude``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect. The default is bound to the ``X_train`` object that
      exists when this cell is executed.
  min_unique : int
      Distinct-value threshold (strictly greater than).
  exclude : iterable of str
      Column names that are never returned.

  Returns
  -------
  list of str
  """
  # Iterate over df's own columns; the previous version always walked
  # X_train.columns, which breaks for any frame with a different schema.
  return [col for col in df.columns
          if df[col].nunique() > min_unique and col not in exclude]

In [None]:
def boxcox(dfs,lam=0.15):
  """Apply ``scipy.special.boxcox1p`` in place to the continuous columns.

  Parameters
  ----------
  dfs : list of pandas.DataFrame
      Frames to transform; each one is modified in place.
  lam : float
      Box-Cox lambda passed to ``boxcox1p``.

  Notes
  -----
  The column list comes from ``deter_num_cols()`` and is computed once,
  before any frame is modified, so every frame receives exactly the same
  set of transformed columns. The transform is not idempotent: running the
  cell twice applies it twice.
  """
  assert type(dfs) == list

  cols = deter_num_cols()
  for df in dfs:
    for col in cols:
      df[col] = boxcox1p(df[col],lam)

boxcox([X_train,X_test])

In [None]:
# After the transform: missing values in features and targets, then
# infinite values per feature column.
for frame in (X_train, y_train, y_train_log):
  print(frame.isnull().sum())
print(X_train.isin([np.inf, -np.inf]).sum())

In [None]:
# Same log-target linear model, now fitted on the Box-Cox-transformed features.
lr2 = LinearRegression()
y_train_log = np.log1p(y_train)

progressive(model=lr2, X=X_train, y=y_train_log, X_test=X_test, flag=True)

In [None]:
def detect_outliers(df,n):
  """Return index labels of rows that are IQR outliers in more than ``n`` columns.

  For every column returned by ``deter_num_cols()`` a row counts as an
  outlier when its value lies more than 1.5 * IQR below the 25th or above
  the 75th percentile of ``df``.
  """
  from collections import Counter

  hits = Counter()
  for name in deter_num_cols():
    lower_q = np.percentile(df[name],25)
    upper_q = np.percentile(df[name],75)
    margin = (upper_q - lower_q)*1.5

    is_outlier = (df[name] < lower_q - margin) | (df[name] > upper_q + margin)
    hits.update(df.loc[is_outlier,name].index)

  return [row for row,count in hits.items() if count > n]

In [None]:
# Rows that are outliers in more than two continuous columns. Computed once
# and reused (the scan was previously repeated three times on the same input).
outlier_rows = detect_outliers(X_train,2)
print(len(outlier_rows))
X_train_without_outliers = X_train.drop(outlier_rows,axis=0)
y_train_log_out = y_train_log.drop(outlier_rows,axis=0)

lr5 = LinearRegression()
progressive(lr5,X_train_without_outliers,y_train_log_out,flag=True,X_test=X_test)

In [None]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler
scaler_min_max = MinMaxScaler()
standard_scaler = StandardScaler()

# Each scaler learns its statistics from the training split only and is then
# applied to both splits.
X_train_min_max = scaler_min_max.fit(X_train).transform(X_train)
X_test_min_max = scaler_min_max.transform(X_test)

X_train_standard = standard_scaler.fit(X_train).transform(X_train)
X_test_standard = standard_scaler.transform(X_test)

In [None]:
lr3 = LinearRegression()
lr4 = LinearRegression()

# progressive() prints its own report and returns nothing, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
progressive(lr3,X_train_min_max,y_train_log,flag=True,X_test=X_test_min_max)
print()
progressive(lr4,X_train_standard,y_train_log,flag=True,X_test=X_test_standard)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seeded so the reported scores are reproducible between runs.
rf = RandomForestRegressor(random_state=25)

progressive(rf,X_train,y_train,flag=False,X_test=X_test)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Seeded so tie-breaking between equally good splits is reproducible.
tree = DecisionTreeRegressor(random_state=25)

progressive(tree,X_train,y_train_log,flag=True,X_test=X_test)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# 6-nearest-neighbours regression on the raw price.
knn = KNeighborsRegressor(n_neighbors=6)

progressive(model=knn, X=X_train, y=y_train, X_test=X_test, flag=False)

In [None]:
from xgboost import XGBRegressor

# `silent` and `nthread` are deprecated spellings in the scikit-learn wrapper;
# `verbosity=0` (no log output) and `n_jobs=-1` (all cores) are the
# current equivalents.
xgb = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                   learning_rate=0.05, max_depth=3,
                   min_child_weight=1.7817, n_estimators=2200,
                   reg_alpha=0.4640, reg_lambda=0.8571,
                   subsample=0.5213, verbosity=0,
                   random_state=7, n_jobs=-1)

progressive(xgb,X_train,y_train,flag=False,X_test=X_test)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with Huber loss on the raw price.
gb = GradientBoostingRegressor(
    loss='huber',
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    random_state=5,
)

progressive(model=gb, X=X_train, y=y_train, X_test=X_test, flag=False)

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor with small trees, row bagging and feature sub-sampling.
lgb = LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

progressive(model=lgb, X=X_train, y=y_train, X_test=X_test, flag=False)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import RobustScaler

# Robustly scaled ElasticNet fitted on the log1p-transformed price.
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3),
)

progressive(model=ENet, X=X_train, y=y_train_log, X_test=X_test, flag=True)

In [None]:
from sklearn.ensemble import StackingRegressor

# XGBoost + LightGBM as base learners, gradient boosting as the meta-learner.
stacked1 = StackingRegressor(
    estimators=[('xgb', xgb), ('lgb', lgb)],
    final_estimator=gb,
)

progressive(model=stacked1, X=X_train, y=y_train, X_test=X_test, flag=False)

In [None]:
# XGBoost + gradient boosting as base learners, gradient boosting on top.
stacked2 = StackingRegressor(
    estimators=[('xgb', xgb), ('gb', gb)],
    final_estimator=gb,
)

progressive(model=stacked2, X=X_train, y=y_train, X_test=X_test, flag=False)

In [None]:
from sklearn.ensemble import VotingRegressor

# Unweighted average of the three boosted models.
vote = VotingRegressor(
    estimators=[('xgb', xgb), ('lgb', lgb), ('gb', gb)],
)

progressive(model=vote, X=X_train, y=y_train, X_test=X_test, flag=False)


# Modeling by hand (custom ensemble estimators)

In [None]:
from sklearn.base import TransformerMixin,BaseEstimator,RegressorMixin,clone
from sklearn.model_selection import KFold

In [None]:
class AveragingRegressor(TransformerMixin,BaseEstimator,RegressorMixin):
  """Ensemble that predicts the unweighted mean of several regressors.

  Parameters
  ----------
  models : sequence of estimators
      Prototype regressors. They are cloned in ``fit`` and never modified.

  Attributes
  ----------
  models_ : list
      Fitted clones of ``models``; available after ``fit``.
  """
  # NOTE(review): scikit-learn's developer guide recommends listing mixins
  # before BaseEstimator; the base order is kept unchanged here - confirm
  # whether it should become (RegressorMixin, BaseEstimator).
  def __init__(self,models):
    self.models = models

  def fit(self,X,y):
    """Fit a fresh clone of every model on ``(X, y)`` and return ``self``."""
    self.models_ = [clone(model) for model in self.models]

    for model in self.models_:
      model.fit(X,y)

    # Returning self follows the scikit-learn estimator contract and lets
    # fit(...).predict(...) chaining and meta-estimators work.
    return self

  def predict(self,X):
    """Return the row-wise mean of the fitted models' predictions for ``X``."""
    predictions = np.column_stack([model.predict(X) for model in self.models_])
    return np.mean(predictions,axis=1)

In [None]:
# Simple average of gradient boosting, XGBoost and LightGBM.
averaged = AveragingRegressor(models=(gb, xgb, lgb))
progressive(model=averaged, X=X_train, y=y_train, X_test=X_test, flag=False)

In [None]:
class StackingAveragingRegressor(TransformerMixin,BaseEstimator,RegressorMixin):
  """Stacked ensemble trained on out-of-fold predictions.

  Each base model is cloned and fitted once per K-fold split. Its
  predictions on the held-out folds form one column of the meta-model's
  training matrix. At prediction time the per-fold clones of every base
  model are averaged to produce that model's meta-feature.

  Parameters
  ----------
  base_models : sequence of estimators
      Prototype base regressors (cloned, never modified).
  meta_model : estimator
      Prototype regressor fitted on the out-of-fold predictions.
  n_folds : int
      Number of K-fold splits.

  Attributes
  ----------
  base_models_ : list of list
      ``base_models_[i]`` holds the ``n_folds`` fitted clones of base model i.
  meta_model_ : estimator
      Fitted clone of ``meta_model``.
  """
  def __init__(self,base_models,meta_model,n_folds=5):
    self.base_models = base_models
    self.meta_model  = meta_model
    self.n_folds = n_folds

  def fit(self,X,y):
    """Fit the base models fold by fold, then the meta-model; return ``self``."""
    # KFold yields positional indices. Converting to arrays makes X[train]
    # positional for DataFrame/Series input too (label-based indexing on a
    # shuffled index would fail or select the wrong rows).
    X = np.asarray(X)
    y = np.asarray(y)

    self.base_models_ = [list() for _ in self.base_models]
    self.meta_model_  = clone(self.meta_model)
    kfold = KFold(n_splits=self.n_folds,shuffle=True,random_state=42)

    out_of_fold_predictions = np.zeros((X.shape[0],len(self.base_models)))
    for i,model in enumerate(self.base_models):
      for train,validation in kfold.split(X,y):
        instance = clone(model)
        instance.fit(X[train],y[train])
        self.base_models_[i].append(instance)
        out_of_fold_predictions[validation,i] = instance.predict(X[validation])

    self.meta_model_.fit(out_of_fold_predictions,y)
    # Returning self follows the scikit-learn estimator contract.
    return self

  def predict(self,X):
    """Predict by feeding fold-averaged base predictions to the meta-model."""
    # Same array conversion as in fit, so the base models see the input type
    # they were trained on.
    X = np.asarray(X)
    meta_features = np.column_stack([
        np.column_stack([model.predict(X) for model in fold_models]).mean(axis=1)
        for fold_models in self.base_models_
    ])
    return self.meta_model_.predict(meta_features)


In [None]:
# Hand-written stacking: XGBoost + LightGBM below, gradient boosting on top.
# Plain arrays are passed because the class indexes rows positionally.
stacked = StackingAveragingRegressor(meta_model=gb, base_models=(xgb, lgb))
progressive(
    model=stacked,
    X=X_train.to_numpy(),
    y=y_train.to_numpy(),
    X_test=X_test.to_numpy(),
    flag=False,
)