In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool
from bokeh.plotting import figure, output_file, show
from sklearn import metrics
from sklearn.feature_selection import f_regression

# Explanation Code:

### id - Unique ID for each home sold
### date - Date of the home sale
### price - Price of each home sold
### bedrooms - Number of bedrooms
### bathrooms - Number of bathrooms, where .5 accounts for a room with a toilet but no shower
### sqft_living - Square footage of the apartment's interior living space
### sqft_lot - Square footage of the land space
### floors - Number of floors
### waterfront - A dummy variable for whether the apartment was overlooking the waterfront or not
### view - An index from 0 to 4 of how good the view of the property was
### condition - An index from 1 to 5 on the condition of the apartment
### grade - An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design.
### sqft_above - The square footage of the interior housing space that is above ground level
### sqft_basement - The square footage of the interior housing space that is below ground level
### yr_built - The year the house was initially built
### yr_renovated - The year of the house’s last renovation
### zipcode - What zipcode area the house is in
### lat - Latitude
### long - Longitude
### sqft_living15 - The square footage of interior housing living space for the nearest 15 neighbors
### sqft_lot15 - The square footage of the land lots of the nearest 15 neighbors

In [None]:
# Load the raw King County house-sales table.
data = pd.read_csv('../input/kc-house-data/kc_house_data.csv')
print(len(data))
# A bare expression is only rendered when it is the last statement of a cell,
# so the preview has to come last to be displayed.
data.head()

# CLEAN:

## The first column (`id`) is not unique: 177 rows are duplicates

In [None]:
# Keep the first row for every id. The explicit .copy() makes data1 an independent
# frame, so the in-place edits and column assignments that follow do not operate
# on a slice of `data` (which triggers SettingWithCopyWarning).
data1 = data.drop_duplicates(subset='id', keep='first').copy()

In [None]:
# Number of distinct ids in the raw frame; compare with len(data) to see how many rows repeat an id.
data['id'].nunique()

In [None]:
# Discard every row that contains at least one missing value.
data1 = data1.dropna()

In [None]:
# Keep only the part of the raw string before 'T', then parse it as a datetime.
date_part = data1['date'].str.split('T').str[0]
data1['date'] = pd.to_datetime(date_part)

### Is `data1['price']` normally distributed?

In [None]:
from scipy import stats

In [None]:
# Reference sample: a normal distribution with the same mean, standard deviation
# and size as the observed prices. The draw is seeded so the histogram (and the
# divergence computed from it) is identical on every run.
ideal = stats.norm.rvs(
    loc=data1['price'].mean(),
    scale=data1['price'].std(),
    size=len(data1['price']),
    random_state=42,
)
icount, idevision = np.histogram(ideal, bins=100)

In [None]:
# Histogram of the observed prices. Uses data1 (the de-duplicated frame), i.e. the
# same data the reference normal sample was parameterised from.
count, devision = np.histogram(data1['price'], bins=100)

In [None]:
def kl(p, q):
    """Return sum(p * log(p / q)) over the bins where both p and q are non-zero.

    Parameters
    ----------
    p, q : array-like
        Bin weights of the two histograms (broadcast against each other).

    Returns
    -------
    float
        The summed divergence; 0.0 when no bin has both p and q non-zero.

    Bins where either weight is zero contribute nothing. The logarithm is only
    evaluated on the retained bins, so zero bins no longer raise divide-by-zero
    or invalid-value warnings.
    """
    p, q = np.broadcast_arrays(np.asarray(p, dtype=float), np.asarray(q, dtype=float))
    both_nonzero = (p != 0) & (q != 0)
    p_kept, q_kept = p[both_nonzero], q[both_nonzero]
    return np.sum(p_kept * np.log(p_kept / q_kept))

In [None]:
# Turn both histograms into probability vectors (each sums to 1) before comparing
# them; dividing by the histograms' own totals avoids a hard-coded sample size.
kl(count / count.sum(), icount / icount.sum())

In [None]:
# Distribution of sale prices, 100 bins.
data1['price'].plot.hist(bins=100, figsize=(20, 12))

In [None]:
# Histograms of data1's columns, 100 bins each.
data1.hist(bins=100, figsize=(20, 12))

## Select Just 95% Of Data1['price']: 

alpha here means the standard deviation (std)
#### If our data is normally distributed, about 68% lies within ±1 alpha of the mean, 95% within ±2 alpha, and 99.7% within ±3 alpha

In [None]:
def just_95(x, mean=None, std=None):
    """Return x if it lies within two standard deviations of the mean, else NaN.

    Parameters
    ----------
    x : number
        A single price.
    mean, std : number, optional
        Centre and spread of the band. When omitted they are computed from
        data1['price']; pass them explicitly to avoid recomputing them for
        every element.

    Returns
    -------
    x unchanged when mean - 2*std <= x <= mean + 2*std, otherwise np.nan.
    """
    if mean is None:
        mean = data1['price'].mean()
    if std is None:
        std = data1['price'].std()
    # The band must be centred on the mean; centring it on zero only removes
    # prices above 2*std and never removes anything at the low end.
    lower, upper = mean - 2 * std, mean + 2 * std
    if x < lower or x > upper:
        return np.nan
    return x

In [None]:
# Replace out-of-band prices by NaN, element by element.
data1['price'] = data1['price'].map(just_95)

In [None]:
# How many prices are NaN after the just_95 filter.
data1['price'].isna().sum()

In [None]:
# Remove the rows whose price was set to NaN by the filter.
data1 = data1.dropna()

In [None]:
# Same per-column histograms, now on the filtered frame.
data1.hist(bins=100, figsize=(20, 12))

In [None]:
# Distribution of the bedroom count, 25 bins.
data1['bedrooms'].hist(bins=25)

In [None]:
# Frequency of each distinct bathrooms value.
data1['bathrooms'].value_counts()

In [None]:
#### sqft_lot: square footage of the land space
### sqft_living: square footage of the apartment's interior living space

In [None]:
# Display the current state of data1.
data1

In [None]:
# The waterfront flag is not used further; remove the column.
data1 = data1.drop(columns='waterfront')

In [None]:
### waterfront: a dummy variable for whether the apartment overlooks the waterfront or not
###
###
### grade: an index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high-quality level of construction and design.

In [None]:
# Frequency of each view index value.
data1['view'].value_counts()

In [None]:
# Frequency of each grade value.
data1['grade'].value_counts()

In [None]:
# Frequency of each condition value.
data1['condition'].value_counts()

In [None]:
# Frequency of each basement area value.
data1['sqft_basement'].value_counts()

In [None]:
def renovated(x):
    """Map a renovation year to a flag: 1 for a positive value, 0 for zero or below.

    A value that is neither > 0 nor <= 0 (for example NaN) yields None.
    """
    if x > 0:
        return 1
    return 0 if x <= 0 else None
# Turn the renovation year into a 0/1 "was renovated" flag.
data1['yr_renovated'] = data1['yr_renovated'].map(renovated)

In [None]:
# How many homes fall into each yr_renovated value after the conversion.
data1['yr_renovated'].value_counts()

In [None]:
# Remove the id column from the working frame.
data1 = data1.drop(columns=['id'])

# VISUALIZE

In [None]:
# Display the frame that the plots below are built from.
data1

In [None]:
# Interactive map of the sales, written to test.html.
output_file('test.html')

# Bokeh reads `size` in screen pixels, so the raw price cannot be used directly;
# sqrt(price) / 50 keeps the markers small while still ordering them by price.
map_source = ColumnDataSource(
    data1[['lat', 'long', 'price']].assign(marker_size=np.sqrt(data1['price']) / 50)
)

p = figure(plot_width=400, plot_height=400)
# Longitude on x and latitude on y, so the plot is oriented like a map.
p.circle(x='long', y='lat', size='marker_size', fill_color='orange', source=map_source)
p.xaxis.axis_label = 'Longitude'
p.yaxis.axis_label = 'Latitude'
show(p)

In [None]:
# Static map of the first 1000 sales; marker area grows with the sale price.
plt.figure(figsize=(10, 10))
s = (np.sqrt(data1['price'][:1000]) / 50 - 5).round()
amount = data1['lat'][:1000].values   # latitude  -> y axis
volume = data1['long'][:1000].values  # longitude -> x axis

# cmap / vmin / vmax are omitted: without a `c` argument matplotlib ignores them
# (and warns), so they had no effect on the figure.
plt.scatter(volume, amount, s=s)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('First 1000 sales, marker size scaled by price')

In [None]:
# Independent copy of the first 1000 rows, used for the grouped bar chart.
data2 = data1.head(1000).copy()

In [None]:
# Number of sales for every (bathrooms, bedrooms, floors) combination.
group_keys = ['bathrooms', 'bedrooms', 'floors']
bath_bed_flor = data2.groupby(group_keys)['price'].size()

In [None]:
# Horizontal bars: one per (bathrooms, bedrooms, floors) combination.
bath_bed_flor.plot.barh(figsize=(20, 20))

In [None]:
# Display the current state of data1.
data1

### Return City Name:

In [None]:
from geopy.geocoders import Nominatim
# Nominatim client; retcityname() calls its reverse() method for coordinate lookups.
geolocator = Nominatim(user_agent="geoapiExercises")

In [None]:
# Coordinate columns handed to the reverse-geocoding helper.
lat, long = data1['lat'], data1['long']

In [None]:
## THIS IS JUST FOR TEST:
#for i in lat:
#    for j in long:
#        location = geolocator.reverse(str(i)+','+str(j)) 
#        address = location.raw['address']
#        state = address.get('state', '')
#        zipcode = address.get('postcode')
#    print(state)

In [None]:
def retcityname(lat, long, geocoder=None):
    """Reverse-geocode coordinate pairs and return the state of each one.

    Parameters
    ----------
    lat, long : iterable
        Latitudes and longitudes; element i of each forms one coordinate pair.
    geocoder : object, optional
        Anything with a ``reverse("lat,long")`` method whose result exposes
        ``raw['address']``. Defaults to the module-level ``geolocator``.

    Returns
    -------
    list of str
        One entry per coordinate pair, in input order; '' when the lookup
        returns nothing or the address has no 'state' field. The list can be
        assigned directly to a DataFrame column.
    """
    if geocoder is None:
        geocoder = geolocator

    states = []
    # Pair the coordinates row by row. Nesting the two loops would query every
    # latitude against every longitude (n * n requests) for points that do not
    # exist in the data.
    for latitude, longitude in zip(lat, long):
        location = geocoder.reverse(str(latitude) + ',' + str(longitude))
        if location is None:
            # No result for this coordinate.
            states.append('')
            continue
        address = location.raw['address']
        states.append(address.get('state', ''))
    return states

In [None]:
## This takes a long time because every iteration issues a new geocoding query
#data1['city_state'] = retcityname(lat, long) 

In [None]:
# Remove the location and date columns before modelling.
data1.drop(columns=['lat', 'long', 'date', 'zipcode'], inplace=True)

In [None]:
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import   train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Regression target: the sale price.
y = data1['price']

In [None]:
# Scaler instance; in the visible cells it is only referenced from commented-out code.
st = StandardScaler()

In [None]:
# Feature matrix: every column except the target. Dropping `price` by name keeps
# the target out of the features even if the column order changes.
x = data1.drop(columns='price')

In [None]:
# Pairwise Pearson correlations between the features, drawn as an annotated heatmap.
corr = x.corr(method='pearson')
sns.heatmap(data=corr, annot=True)

In [None]:
# Keep the 10 features with the strongest univariate relation to the price.
# f_regression scores features against a continuous target; chi2 is a
# classification score and would treat every distinct price as its own class.
best_fea = SelectKBest(f_regression, k=10)
best_fea.fit(x, y)
x.columns[best_fea.get_support()]

In [None]:
# Reduce x to the columns chosen by the fitted selector.
x_best = best_fea.transform(x)

### STANDARD SCALER

In [None]:
#x_best = st.fit_transform(x_best)
#print(x_best.shape)

#y = st.fit_transform(y.reshape(-1, 1))
#print(y.shape)

In [None]:
# Fixed seed so the train/test partition, and therefore every score below, is
# the same on each run.
x_train, x_test, y_train, y_test = train_test_split(x_best, y, random_state=42)

# Catboostregressor

In [None]:
#from catboost import CatBoostRegressor
#model = CatBoostRegressor(iterations=1000, 
                          #depth=8, 
                          #learning_rate=0.02)

#model.fit(x_train, y_train)

#preds = model.predict(x_test)
#print(preds)
#from sklearn import metrics
#print(metrics.mean_absolute_error(y_test, preds))
#print(metrics.r2_score(y_test, preds))

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest baseline. random_state makes the fit repeatable; n_jobs=-1 builds
# the 3000 trees on all cores without changing the result.
rn = RandomForestRegressor(n_estimators=3000, max_depth=10, random_state=42, n_jobs=-1)
rn.fit(x_train, y_train)
pre = rn.predict(x_test)
# `metrics` is sklearn.metrics, imported in the notebook's first cell.
print(metrics.mean_absolute_error(y_test, pre))
print(metrics.r2_score(y_test, pre))

In [None]:
import xgboost
# Gradient-boosted trees. subsample=0.5 draws a random half of the rows per tree,
# so random_state is needed for a repeatable fit. learning_rate is the
# scikit-learn-style name for eta.
xgb = xgboost.XGBRegressor(
    max_depth=10,
    subsample=0.5,
    learning_rate=0.1,
    random_state=42,
)
xgb.fit(x_train, y_train)
yp = xgb.predict(x_test)
# `metrics` is sklearn.metrics, imported in the notebook's first cell.
print(metrics.mean_absolute_error(y_test, yp))
print(metrics.r2_score(y_test, yp))