In [None]:
import numpy as np;
import pandas as pd;
import seaborn as sns;
import matplotlib.pyplot as plt

# Show every column and every row when a DataFrame is displayed
# (None removes the display limit). Uses the public pd.set_option entry point
# rather than going through the incidental pd.pandas attribute.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
train.head(10)

In [None]:
test.head(10)

In [None]:
# Keep both frames in one list so the same step can be applied to each in a loop.
# The list stores references, so in-place operations on dataset[i] modify
# train / test themselves.
dataset = [train, test]

In [None]:
train.shape

In [None]:
test.shape

In [None]:
# Visualise where the missing values sit in each frame (one heatmap per frame).
for data in dataset:
    # The figure must exist BEFORE plotting so the heatmap is drawn on it.
    # Creating it afterwards left the heatmap at the default size and displayed
    # an additional empty figure. A moderate size is used instead of 100x100
    # inches, which is impractically large to render.
    plt.figure(figsize=(12, 8))
    sns.heatmap(data.isnull(), cbar=False)
    plt.show()

In [None]:
def columns_with_null(data, threshold=0.6):
    """Split the columns of ``data`` that contain nulls by their null fraction.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : float, default 0.6
        Null fraction (0..1) at or above which a column counts as "mostly
        null". The default keeps the cut-off this function has always used.

    Returns
    -------
    list
        ``[high, low]`` where ``high`` lists the columns whose null fraction is
        ``>= threshold`` and ``low`` lists the columns whose null fraction is
        ``> 0`` but below ``threshold``. Columns without nulls appear in
        neither list. Each column in ``high`` is also reported via ``print``.
    """
    features_at_or_above_threshold = []
    features_below_threshold = []
    for feature in data.columns:
        # Compute the null fraction once per column instead of once per use.
        null_fraction = data[feature].isnull().mean()
        if null_fraction >= threshold:
            features_at_or_above_threshold.append(feature)
            print(feature, ' has ', null_fraction*100, '% of null data')
        elif null_fraction > 0:
            features_below_threshold.append(feature)
    return [features_at_or_above_threshold, features_below_threshold]

In [None]:
train_with_null_features = columns_with_null(train)

In [None]:
test_with_null_features = columns_with_null(test)

In [None]:
null_features = [train_with_null_features, test_with_null_features]

In [None]:
# Print how many columns are in each list of null_features:
# for train then test, the high-null list first and the lower-null list second.
for frame_idx in range(2):
    for group_idx in range(2):
        print(len(null_features[frame_idx][group_idx]))

### Dropping columns with at least 60% null values

In [None]:
# Remove, in place, the high-null columns (first list of each null_features
# entry) from the matching frame: train first, then test.
for frame, frame_nulls in zip(dataset, null_features):
    frame.drop(columns=frame_nulls[0], inplace=True)

In [None]:
# Re-plot the missing-value heatmaps after the high-null columns were dropped.
for data in dataset:
    # Create the figure first so the heatmap is drawn on it; calling
    # plt.figure() after the plot only produced an extra empty figure.
    # A moderate size replaces the impractically large 100x100 inches.
    plt.figure(figsize=(12, 8))
    sns.heatmap(data.isnull(), cbar=False)
    plt.show()

In [None]:
sns.heatmap(test[null_features[1][1]].isnull())

In [None]:
print(len(null_features[0][1]))

In [None]:
print(len(null_features[1][1]))

In [None]:
final_null_value = [null_features[0][1], null_features[1][1]]

In [None]:
def get_Object_null_columns(data, index, table):
    """Return the names from ``data[index]`` whose column in ``table`` is object-typed.

    ``data[index]`` is iterated to obtain column names (a list of names, or a
    DataFrame, which iterates over its column labels). A name is kept when
    ``table[name].dtypes == 'O'``. Order follows ``data[index]``.
    """
    return [name for name in data[index] if table[name].dtypes == 'O']

def get_numerical_null_columns(data, index, table):
    """Return the names from ``data[index]`` whose column in ``table`` is not object-typed.

    ``data[index]`` is iterated to obtain column names. A name is kept when
    ``table[name].dtypes != 'O'``. Order follows ``data[index]``.
    """
    return [name for name in data[index] if table[name].dtypes != 'O']

In [None]:
# Split each frame's remaining null columns (below the drop threshold) into
# object-typed and non-object-typed groups.
train_Object_null_values = get_Object_null_columns(final_null_value, 0, train)
train_numerical_null_values = get_numerical_null_columns(final_null_value, 0, train)

# The test column lists are classified against the test frame's own dtypes.
# Looking them up in train (as before) can misclassify a column, or raise a
# KeyError when a column exists only in test.
test_Object_null_values = get_Object_null_columns(final_null_value, 1, test)
test_numerical_null_values = get_numerical_null_columns(final_null_value, 1, test)

In [None]:
print(len(train_numerical_null_values)+len(train_Object_null_values), ' ', len(test_Object_null_values)+len(test_numerical_null_values))

In [None]:
# Object-typed gaps in train become the explicit label 'Unknown'.
for column in train_Object_null_values:
    train[column] = train[column].fillna('Unknown')

# Non-object gaps in train are replaced by that column's mean.
for column in train_numerical_null_values:
    column_mean = train[column].mean()
    train[column] = train[column].fillna(column_mean)

In [None]:
# Object-typed gaps in test become the explicit label 'Unknown'.
for column in test_Object_null_values:
    test[column] = test[column].fillna('Unknown')

# Non-object gaps in test are replaced by that column's mean (computed on test).
for column in test_numerical_null_values:
    column_mean = test[column].mean()
    test[column] = test[column].fillna(column_mean)

In [None]:
train_Object_null_values

### Fixing the categorical data

In [None]:
get_Object_null_columns(dataset, 0, train)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
test.head()

In [None]:
encoder = LabelEncoder()

In [None]:
# Label-encode the object-typed columns of train and test.
# Columns present in both frames are encoded with ONE fit over the combined
# values, so a category maps to the same integer in train and in test.
# Fitting separately on each frame (as before) can assign different codes to
# the same category, which makes the model's predictions on test meaningless.
train_categorical = [col for col in train.columns if train[col].dtypes == 'O']
test_categorical = [col for col in test.columns if test[col].dtypes == 'O']

for feature in train_categorical:
    if feature in test_categorical:
        encoder.fit(pd.concat([train[feature], test[feature]]))
        train[feature] = encoder.transform(train[feature])
        test[feature] = encoder.transform(test[feature])
    else:
        # Column is categorical only in train: encode it on its own.
        train[feature] = encoder.fit_transform(train[feature])

# Columns that are categorical only in test are encoded on their own.
# The lists were computed before any encoding, so membership is still valid.
for feature in test_categorical:
    if feature not in train_categorical:
        test[feature] = encoder.fit_transform(test[feature])

In [None]:
train.head()

In [None]:
test.head()

In [None]:
def calc_years(data):
    """Return the column names of ``data`` that contain 'Yr' or 'Year'.

    The names are read from ``data.columns`` (the argument), in column order,
    rather than from a module-level frame.
    """
    return [column for column in data.columns if 'Yr' in column or 'Year' in column]

In [None]:
# Replace each year column by its distance from the reference year.
# Each frame uses its OWN year columns on both sides of the assignment, so the
# assigned-to and read-from column sets always match.
# NOTE: this cell is not idempotent. Running it twice restores the original years.
REFERENCE_YEAR = 2020
train_years = calc_years(train)
test_years = calc_years(test)
train[train_years] = REFERENCE_YEAR - train[train_years]
test[test_years] = REFERENCE_YEAR - test[test_years]

In [None]:
train.shape

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# train = pd.DataFrame(scaler.fit_transform(train.loc[:, train.columns != ('id','SalesPrice')]))
# test = pd.DataFrame(scaler.fit_transform(test.loc[:, test.columns != ('id')]))
# train['SalesPrice', 'id'] = 

In [None]:
train.head()

In [None]:
# Features are every column of train except the last; the target is the last column.
x, y = train.iloc[:, :-1], train.iloc[:, -1]

In [None]:
y.head()

## Modelling

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import xgboost

#### Cross Validation

In [None]:
from sklearn.model_selection import KFold;
from sklearn.model_selection import cross_val_score;
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
regressor = xgboost.XGBRegressor()

In [None]:
regressor.fit(x,y)

In [None]:
regressor.predict(test)

In [None]:
# Hyper-parameter optimisation: candidate values for each tunable parameter.
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster = ['gbtree', 'gblinear']
learning_rate = [0.05, 0.1, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]
base_score = [0.25, 0.5, 0.75, 1]

# Search space: parameter name -> list of candidate values.
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
    base_score=base_score,
)

In [None]:
# Set up the random search with 5-fold cross validation (cv=5):
# 50 sampled parameter combinations (n_iter=50), scored by negative mean
# absolute error, run on 4 parallel jobs; random_state fixes the sampling.
clf = RandomizedSearchCV(estimator=regressor,
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=50,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

In [None]:
# scoring="accuracy"
# score=cross_val_score(clf, x, y, cv=k_fold, n_jobs=2, )
# print(np.mean(score))

In [None]:
clf.fit(x,y)

In [None]:
prediction = clf.predict(test)

In [None]:
check_data = pd.read_csv('sample_submission.csv')

In [None]:
check_data['SalePrice'].head()

In [None]:
# Distribution of the difference between the predictions and the 'SalePrice'
# column loaded from sample_submission.csv.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(prediction-check_data['SalePrice'])

In [None]:
check_data.head()

In [None]:
# root_mean_squared_error(check_data['SalePrice'], prediction)

# Scatter of the sample-submission 'SalePrice' values against the predictions.
# x and y are passed by keyword: recent seaborn releases reject positional
# data vectors here, while the keyword form works on old and new versions.
sns.scatterplot(x=check_data['SalePrice'], y=prediction)

In [None]:
# Build the submission file: the 'Id' column of sample_submission.csv placed
# next to the predictions, written without the index.
sub_df = pd.read_csv('sample_submission.csv')
prediction = pd.DataFrame(prediction)
datasets = pd.concat([sub_df['Id'], prediction], axis='columns')
datasets.columns = ['Id', 'SalePrice']
datasets.to_csv('sample_submission1.csv', index=False)

In [None]:
from sklearn.metrics import mean_squared_error
import math

In [None]:
# Root-mean-squared difference between the predictions and the 'SalePrice'
# column read from sample_submission.csv.
# NOTE(review): presumably that file holds placeholder values rather than true
# labels — confirm before reading this number as model error.
mse = mean_squared_error(y_true=check_data['SalePrice'], y_pred=prediction)
rmse = math.sqrt(mse)

In [None]:
print(rmse)