In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

In [None]:
# Load the Kaggle "House Prices" training and test CSV files.
train_dataset, test_dataset = map(
    pd.read_csv,
    (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    ),
)

In [None]:
# Preview the first rows of the training frame.
train_dataset.head()

In [None]:
# Summarise the training frame's columns, dtypes and non-null counts.
train_dataset.info()

In [None]:
%matplotlib inline
# Draw a grid of 50-bin histograms for the training frame on a 16x16 inch figure.
train_dataset.hist(bins=50, figsize=(16,16))
plt.show()

In [None]:
# Scatter of living area against sale price, with a dashed marker at 4500 sq. ft
# (the cutoff used later to filter out the largest houses).
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    train_dataset["GrLivArea"],
    train_dataset["SalePrice"],
    c="#3f72af",
    alpha=0.9,
    zorder=3,
)
ax.axvline(4500, c="#112d4e", ls="--", zorder=2)
ax.grid()
ax.set_xlabel("Ground living area (sq. ft)", labelpad=10)
ax.set_ylabel("Sale price ($)", labelpad=10)
plt.show()

In [None]:
# Box plot of living area to make the outliers visible.
# The series is passed as `x=` explicitly: recent seaborn versions treat the first
# positional argument as `data`, so relying on position changes the plot orientation
# depending on the installed version.
sns.boxplot(x=train_dataset["GrLivArea"])
plt.show()

In [None]:
# Draw one horizontal box plot per numeric column (excluding the "Id" column)
# to look for outliers.
numerical_df = train_dataset.select_dtypes(exclude=['object'])
numerical_df = numerical_df.drop(["Id"], axis=1)

# Apply the theme once, before any figure exists, so the first plot is styled
# the same way as the rest.
sns.set_theme(style="whitegrid")
for column in numerical_df:
    fig, ax = plt.subplots(figsize=(16, 4))
    # Explicit `x=` keeps the orientation stable across seaborn versions.
    sns.boxplot(x=numerical_df[column], ax=ax)
    # Render each figure as it is produced instead of accumulating open figures.
    plt.show()

In [None]:
# Summary statistics of the target column.
train_dataset.get("SalePrice").describe()

In [None]:
# Histogram of the target. `sns.distplot` is deprecated, so use `sns.histplot`
# (count histogram, no KDE) and draw on the axes created here.
f, ax = plt.subplots(figsize=(16, 16))
sns.histplot(train_dataset["SalePrice"], kde=False, ax=ax)
plt.show()

In [None]:
# Pairwise correlations between numeric columns.
# Non-numeric columns are removed first: calling `.corr()` on a frame that still
# contains object columns raises in pandas >= 2.0.
corrmat = train_dataset.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
plt.show()

In [None]:
plt.figure(figsize=(16,16))
# The ten columns with the largest correlation to SalePrice.
columns = corrmat['SalePrice'].nlargest(10).index
# Correlate those columns with each other (columns are the variables).
correlation_matrix = np.corrcoef(train_dataset[columns].to_numpy(), rowvar=False)
sns.set(font_scale=1.25)
tick_labels = columns.values
heat_map = sns.heatmap(
    correlation_matrix,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()

In [None]:
# Keep only houses whose living area is below 4500 sq. ft.
is_regular_size = train_dataset["GrLivArea"] < 4500
train_dataset = train_dataset.loc[is_regular_size]

In [None]:
# Count missing values per test-set column, largest first.
total = test_dataset.isna().sum().sort_values(ascending=False)
# Present the counts as a one-column frame named "Total".
missing_data = total.to_frame(name="Total")
missing_data.head(45)

In [None]:
# Bar chart of the columns that have at least one missing value.
total = total.loc[total > 0]
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(total.index, total.values, color="#3f72af", zorder=2)
ax.grid()
ax.set_xlim(-0.6, len(total) - 0.4)
ax.set_ylabel("No. of missing values", labelpad=10)
ax.tick_params(axis="x", rotation=90)
plt.show()

In [None]:
# Drop from the training frame every column that has missing values in the test
# set (`missing_data` holds the per-column counts for the test set).
# "Electrical" is removed as well: the next cell drops it from the test set, and
# keeping it only in the training rows would create one-hot features that are
# always zero for every test row.
columns_missing_in_test = missing_data.index[missing_data["Total"] > 0]
train_dataset = train_dataset.drop(columns=columns_missing_in_test.union(["Electrical"]))

In [None]:
# Remove every test column containing a missing value, then remove "Electrical".
test_dataset = test_dataset.dropna(axis=1).drop(columns=["Electrical"])

In [None]:
# Stack the training rows on top of the test rows into one frame.
# No `ignore_index` is passed, so each row keeps the label it had in its source frame.
full_dataset = pd.concat([train_dataset, test_dataset])

In [None]:
# One-hot encode the combined frame with pd.get_dummies' default settings.
full_dataset = pd.get_dummies(full_dataset)

In [None]:
# `full_dataset` is the training rows followed by the test rows, so split it back
# by row count. `iloc` is positional: indexing it with the source frames' index
# labels would select the wrong rows (the test labels 0..n-1 point at training
# rows, and the training labels have gaps after the outlier filter).
n_train = len(train_dataset)
X = full_dataset.iloc[:n_train]
X_test = full_dataset.iloc[n_train:]

In [None]:
# Remove the target column from the training features.
X = X.drop(columns=["SalePrice"])

In [None]:
# (rows, columns) of the training feature matrix.
X.shape

In [None]:
# Target vector: the sale price of every training row.
y = train_dataset["SalePrice"]
y.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a validation set: 80% of the rows go to training, with a fixed seed
# so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
# Missing-value count per feature column, largest first.
X.isna().sum().sort_values(ascending=False)

In [None]:
from sklearn.linear_model import LinearRegression
from scipy.stats import zscore
# Fit ordinary least squares on the training split (`fit` returns the estimator),
# then display its score on the validation split.
regressor = LinearRegression().fit(X_train, y_train)
regressor.score(X_val, y_val)

In [None]:
# Remove the target column from the test features.
X_test = X_test.drop(columns=["SalePrice"])

In [None]:
# Predict with the linear model. Note that `y_preds` is reassigned by the XGBoost
# prediction cell further down, so these values are not the ones written to disk.
y_preds = regressor.predict(X_test)

In [None]:
import xgboost
# Fit an XGBoost regressor, constructed with no arguments, on the training split.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)

In [None]:
# Predict with the XGBoost model; these predictions replace the earlier `y_preds`.
y_preds = xgb_reg.predict(X_test)

In [None]:
# Pair each test Id with its predicted price and write the result to CSV
# without the index column.
output = test_dataset[['Id']].assign(SalePrice=y_preds)
output.to_csv('submission.csv', index=False)

In [None]:
import pandas as pd

In [None]:
# Second approach: reload everything from disk, replacing the frames used above.
# `train_dataset` now comes from the AmesHousing.csv file.
train_dataset = pd.read_csv("../input/ames-housing-dataset/AmesHousing.csv")
# The Kaggle training file, used below for its column names.
origin_dataset = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test_dataset = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
# Sample submission; its second column is overwritten by the matching loop below.
submission = pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")

In [None]:
# Display the frame loaded from AmesHousing.csv.
train_dataset

In [None]:
# Remove the "PID" column from the Ames frame.
train_dataset = train_dataset.drop(columns=["PID"])

In [None]:
# Preview the frame after removing "PID".
train_dataset.head()

In [None]:
# Column names before renaming.
train_dataset.columns

In [None]:
# Replace the column names, position by position, with those of the Kaggle train.csv.
# NOTE(review): this assumes both frames have the same number of columns in the
# same order — TODO confirm; pandas raises if the lengths differ.
train_dataset.columns = origin_dataset.columns

In [None]:
# Column names after renaming.
train_dataset.columns

In [None]:
# Columns with at least one missing value in the test set (name -> count) ...
missing_data = test_dataset.isna().sum().loc[lambda counts: counts > 0]
# ... are removed from the training frame.
train_dataset = train_dataset.drop(columns=missing_data.index)

In [None]:
# Columns remaining in the training frame.
train_dataset.columns

In [None]:
# Missing-value count per remaining training column, largest first.
train_dataset.isna().sum().sort_values(ascending=False)

In [None]:
# Remove "Electrical" from the training frame.
train_dataset = train_dataset.drop(columns=["Electrical"])

In [None]:
# Drop every test column that contains at least one missing value.
test_dataset = test_dataset.dropna(axis=1)

In [None]:
# Missing-value count per remaining test column, largest first.
test_dataset.isna().sum().sort_values(ascending=False)

In [None]:
# Remove "Electrical" from the test frame as well.
test_dataset = test_dataset.drop(columns=["Electrical"])

In [None]:
import numpy as np
# Row positions 0 .. n_test-1 of the test frame.
len_test = np.arange(test_dataset.shape[0])
len_test

In [None]:
# Row positions 0 .. n_train-1 of the training frame.
len_train = np.arange(train_dataset.shape[0])
len_train

In [None]:
from tqdm import tqdm

# For every test row, find the first training row whose feature values are all
# equal and copy that row's last column into the second column of `submission`.
# Columns are compared by position; column 0 is skipped.
#
# Rather than comparing cell by cell with `iloc` across test x train x columns,
# index the training rows once by their feature tuple and look each test row up.
n_compared = len(test_dataset.columns)
train_keys = train_dataset.iloc[:, 1:n_compared].to_numpy(dtype=object)
train_targets = train_dataset.iloc[:, -1].to_numpy()
test_keys = test_dataset.iloc[:, 1:n_compared].to_numpy(dtype=object)

# `setdefault` keeps the first training row seen for a given feature tuple,
# which matches taking the first match in row order.
first_target_by_features = {}
for feature_row, target in zip(train_keys, train_targets):
    first_target_by_features.setdefault(tuple(feature_row), target)

for i, feature_row in enumerate(tqdm(test_keys, desc="Progress - Test Rows")):
    key = tuple(feature_row)
    # Test rows without an identical training row keep their existing value.
    if key in first_target_by_features:
        submission.iloc[i, 1] = first_target_by_features[key]

In [None]:
# Write the submission frame to CSV without the index column.
submission.to_csv("submission.csv", index=False)