In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!unzip /kaggle/input/sberbank-russian-housing-market/train.csv.zip

# Reading data

In [None]:
# Allow up to 300 columns in rendered frames, then load the extracted CSV
# and preview its first rows.
pd.set_option('display.max_columns', 300)
dataset = pd.read_csv(r'./train.csv')
dataset.head()

In [None]:
# Show the (rows, columns) dimensions of the freshly loaded frame.
dataset.shape

# Descriptive Statistics

In [None]:
# Allow up to 300 rows in rendered output, then count missing values per column.
pd.set_option('display.max_rows', 300)
dataset.isna().sum()

In [None]:
# List the dtype pandas inferred for every column.
dataset.dtypes

In [None]:
# Parse the timestamp column once, store it back, and derive the calendar year.
parsed_timestamp = pd.to_datetime(dataset['timestamp'])
dataset['timestamp'] = parsed_timestamp
dataset['year'] = parsed_timestamp.dt.year

In [None]:
# Remove the id, timestamp and sub_area columns from the frame in place.
dataset.drop(columns=['id', 'timestamp', 'sub_area'], inplace=True)

In [None]:
# Every column whose dtype is not object is treated as numerical.
numerical_feature = [
    column for column, dtype in dataset.dtypes.items() if dtype != "O"
]
numerical_feature


In [None]:
# Report the number of missing values for each numerical column.
numerical_null_counts = dataset[numerical_feature].isnull().sum()
for column, n_missing in numerical_null_counts.items():
    print("{} has null values :   {}".format(column, n_missing))

In [None]:
# Every object-dtype column is treated as categorical.
categorical_feature = [
    column for column, dtype in dataset.dtypes.items() if dtype == "O"
]
categorical_feature

In [None]:
# Report how many distinct values (NaN counted as one) each categorical column has.
for column in categorical_feature:
    n_levels = dataset[column].nunique(dropna=False)
    print("{} has categorical features :   {}".format(column, n_levels))

In [None]:
# Report the number of missing values for each categorical column.
categorical_null_counts = dataset[categorical_feature].isnull().sum()
for column, n_missing in categorical_null_counts.items():
    print("{} has null values :   {}".format(column, n_missing))

In [None]:
# Numerical columns with more than 40 distinct values are treated as continuous.
continuous_feature = [
    column for column in numerical_feature
    if dataset[column].nunique(dropna=False) > 40
]
continuous_feature

In [None]:
# Numerical columns with at most 40 distinct values are treated as discrete.
discrete_feature = [
    column for column in numerical_feature
    if dataset[column].nunique(dropna=False) <= 40
]
discrete_feature

In [None]:
# Report the number of missing values for each discrete column.
discrete_null_counts = dataset[discrete_feature].isnull().sum()
for column, n_missing in discrete_null_counts.items():
    print("{} feature has null values : {}".format(column, n_missing))

## Univariate/Bivariate Analysis

In [None]:
# One bar chart per categorical column: price_doc summarised per category level.
for column in categorical_feature:
    axis = sns.barplot(x=dataset[column], y=dataset['price_doc'])
    axis.set_xlabel(column)
    axis.set_ylabel('price_doc')
    plt.show()


In [None]:
# One bar chart per discrete column: price_doc summarised per distinct value.
for column in discrete_feature:
    axis = sns.barplot(x=dataset[column], y=dataset['price_doc'])
    axis.set_xlabel(column)
    axis.set_ylabel('price_doc')
    plt.show()


In [None]:
# One 30-bin histogram per continuous column.
for column in continuous_feature:
    axis = dataset[column].hist(bins=30)
    axis.set_xlabel(column)
    axis.set_ylabel('Count')
    plt.show()

In [None]:
# Names of all columns that contain at least one missing value.
missing_column = dataset.columns[dataset.isnull().any()].tolist()
missing_column

In [None]:
# Box plot of every column that has missing values, to inspect spread and outliers.
for feature in missing_column:
    # Pass the series as `x=` explicitly: seaborn >= 0.12 interprets the first
    # positional argument as `data`, which changes the orientation of the box
    # and leaves the x-axis label below on the wrong axis.
    sns.boxplot(x=dataset[feature])
    plt.xlabel(feature)
    plt.show()

In [None]:
# Inspect the raw build_year column.
dataset['build_year']

In [None]:
# Numerical columns that contain at least one missing value.
missing_column_numerical = [
    column for column in numerical_feature
    if dataset[column].isnull().any()
]
missing_column_numerical

In [None]:
# Discrete columns that contain at least one missing value.
# The list is built from `discrete_feature`; iterating `numerical_feature`
# here (as before) merely duplicated `missing_column_numerical`.
missing_column_discrete = [
    feature for feature in discrete_feature
    if dataset[feature].isnull().sum() > 0
]
missing_column_discrete

# Feature Engineering

In [None]:
# Fill missing values in each listed numerical column with that column's median.
for feature in missing_column_numerical:
    value = dataset[feature].median()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # dataset[feature]: the latter is chained assignment, which pandas warns
    # about and which leaves `dataset` unchanged when Copy-on-Write is active.
    dataset[feature] = dataset[feature].fillna(value)

In [None]:
# Fill missing values in each listed column with that column's most frequent value.
for feature in missing_column_discrete:
    modes = dataset[feature].mode()
    if modes.empty:
        # A column with no non-null values has no mode; nothing to fill with.
        continue
    # mode() returns a Series (there can be ties). Passing that Series to
    # fillna aligns on the index and only fills rows whose label matches, so
    # take the first mode as a scalar instead.
    value = modes.iloc[0]
    dataset[feature] = dataset[feature].fillna(value)

In [None]:
# Re-count missing values per column after the imputation cells above.
dataset.isna().sum()

In [None]:
# Log-transform every continuous column that is strictly positive and plot the
# transformed distribution.
# NOTE(review): `continuous_feature` is derived from all numerical columns, so
# the target price_doc is transformed here too if it qualifies - confirm that
# this is intended.
# NOTE: this cell is not idempotent; re-running it applies the log a second time.
for feature in continuous_feature:
    # np.log is undefined for zero and negative inputs (it yields -inf / NaN
    # rather than raising), so skip any column holding a non-positive value
    # instead of checking for zero only.
    if (dataset[feature] <= 0).any():
        continue
    dataset[feature] = np.log(dataset[feature])
    dataset[feature].hist(bins=30)
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.show()

In [None]:
# Summary statistics of the frame at this point in the notebook.
dataset.describe()

In [None]:
import matplotlib.pyplot as plt 
plt.rcParams["figure.figsize"] = (20,3)
# Restrict the correlation to numeric columns: the frame still contains
# object-dtype categorical columns at this point, and DataFrame.corr() raises
# on them in pandas >= 2.0 unless numeric_only=True is passed.
sns.heatmap(dataset.corr(numeric_only=True), annot=True, cmap='coolwarm')

In [None]:
# Feature-only view of the frame: everything except the price_doc target.
data = dataset.drop(columns="price_doc")

In [None]:
def correlation(dataset, threshold):
    """Return names of columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of the numeric columns of ``dataset`` is
    scanned below its diagonal. A column is reported when its correlation with
    any column positioned before it is greater than or equal to ``threshold``.
    The comparison is signed, so strongly negative correlations are not
    reported.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to analyse. Every numeric column takes part, so pass a frame
        without the prediction target if the target must never be reported.
    threshold : float
        Minimum correlation for a column to be reported.

    Returns
    -------
    set
        Names of the columns considered redundant.
    """
    # Use the frame that was passed in (the previous version read a global
    # named `data` and ignored this parameter). numeric_only=True keeps
    # object-dtype columns from making corr() raise on pandas >= 2.0.
    corr_matrix = dataset.corr(numeric_only=True)
    columns = corr_matrix.columns
    col_corr = set()
    for i in range(len(columns)):
        for j in range(i):
            if corr_matrix.iloc[i, j] >= threshold:
                col_corr.add(columns[i])
                break  # one hit is enough to flag column i
    return col_corr

In [None]:
# Analyse the feature-only frame `data` (price_doc already removed) so the
# target can never end up in the set of columns to drop.
correlation_features = correlation(data, 0.8)
correlation_features

In [None]:
# Remove the columns flagged as highly correlated from the frame in place.
dataset.drop(columns=list(correlation_features), inplace=True)

In [None]:
# Preview the frame after the correlated columns were dropped.
dataset.head()

In [None]:
# Dimensions of the frame after the correlated columns were dropped.
dataset.shape

In [None]:
# Recall which columns were classified as categorical before encoding them.
categorical_feature

In [None]:
# Pull the categorical columns out into their own frame for inspection.
feat = dataset.loc[:, categorical_feature]
feat

In [None]:
# Replace each two-level categorical column with a single indicator column.
# 'ecology' is skipped here and left as-is.
for feature in categorical_feature:
    if feature == 'ecology':
        continue
    dummies = pd.get_dummies(dataset[feature], drop_first=True)
    # With drop_first=True a two-level feature yields exactly one indicator
    # column. Any other count cannot be stored in a single column, so fail with
    # a clear message instead of relying on pandas' DataFrame-to-column
    # assignment error.
    if dummies.shape[1] != 1:
        raise ValueError(
            "{} produced {} indicator columns; expected exactly 1".format(
                feature, dummies.shape[1]
            )
        )
    dataset[feature] = dummies.iloc[:, 0]

In [None]:
# Display the frame after the categorical columns were encoded.
dataset

In [None]:
# Count missing values per column once more, now that encoding is done.
dataset.isna().sum()

In [None]:
# Pairwise correlation of the numeric (and boolean) columns. 'ecology' was not
# encoded and is still object dtype, so numeric_only=True is required for
# DataFrame.corr() not to raise on pandas >= 2.0.
cor = dataset.corr(numeric_only=True)
cor

In [None]:
# Absolute correlation of every column with price_doc, keeping only those above 0.5.
a = cor['price_doc'].abs()
result = a[a > 0.5]
result

In [None]:
# Collect (and print) every correlation value in `result` that exceeds 0.1.
# Iterate over the values directly: `result[i]` with an integer on a
# label-indexed Series relies on positional fallback, which pandas has
# deprecated.
imp_feature = []
for value in result:
    if value > 0.1:
        print(value)
        imp_feature.append(value)

In [None]:
# Number of correlation values collected in the previous cell.
len(imp_feature)

In [None]:
# Rebind `data` to an independent copy of the processed frame (it previously
# held the frame without price_doc).
data = dataset.copy()

In [None]:
# Separate the price_doc target from the feature columns.
Y = data['price_doc']
X = data.drop(columns='price_doc')

In [None]:
# 'ecology' was never encoded, so remove it from the feature matrix.
X.drop(columns='ecology', inplace=True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=50,
)

In [None]:
from xgboost import XGBRegressor, plot_importance

# Train a gradient-boosted regressor (n_estimators=100) on the training split.
xgb_model = XGBRegressor(n_estimators=100)
xgb_model.fit(X_train, Y_train)

importances = xgb_model.feature_importances_
print("Feature Importances : ", importances)

# Use a tall figure for the importance chart.
plt.rcParams["figure.figsize"] = (20, 50)
plot_importance(xgb_model)
plt.show()

In [None]:
# Column positions ordered from most to least important.
sorted_idx = np.flip(np.argsort(xgb_model.feature_importances_))

In [None]:
# Build (and print) [column name, importance] pairs in descending importance order.
temp = []
for index in sorted_idx:
    pair = [X_train.columns[index], xgb_model.feature_importances_[index]]
    print(pair)
    temp.append(pair)

In [None]:
# Show the ranked [column name, importance] pairs.
temp

In [None]:
# Everything ranked after the first 50 entries is treated as unimportant.
not_imp_feature = [entry[0] for entry in temp[50:]]

In [None]:
# Number of columns about to be removed from the train/test feature matrices.
len(not_imp_feature)

In [None]:
# Remove the unimportant columns from the training features in place.
X_train.drop(columns=not_imp_feature, inplace=True)

In [None]:
# Remove the same unimportant columns from the test features in place.
X_test.drop(columns=not_imp_feature, inplace=True)

In [None]:
# Dimensions of the test feature matrix after the column selection.
X_test.shape

In [None]:
# Write the selected train/test feature matrices to CSV in the working directory.
for frame, csv_path in ((X_train, 'train_data.csv'), (X_test, 'test_data.csv')):
    frame.to_csv(csv_path)

In [None]:
# Write the matching train/test targets to CSV in the working directory.
for target, csv_path in ((Y_train, 'ytrain_data.csv'), (Y_test, 'ytest_data.csv')):
    target.to_csv(csv_path)
