In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# NOTE(review): the working directory is switched to the input folder so the
# relative archive name below resolves; kept as-is in case other cells rely on it.
os.chdir('/kaggle/input/sberbank-russian-housing-market')
from zipfile import ZipFile

# Read every *.csv member of the archive into its own DataFrame, keyed by the
# member name. Context managers make sure the archive and each member handle
# are closed once the data is in memory.
with ZipFile('train.csv.zip') as zip_file:
    dfs = {}
    for text_file in zip_file.infolist():
        if text_file.filename.endswith('.csv'):
            with zip_file.open(text_file.filename) as member:
                dfs[text_file.filename] = pd.read_csv(member)

# The training table used by the rest of the notebook.
df = dfs['train.csv']
df.shape
# os.listdir()

In [None]:
# Read data_dictionary.txt into a single string.
# NOTE: the name `vals` is rebound to a list of score dicts further down.
with open ('/kaggle/input/sberbank-russian-housing-market/data_dictionary.txt', 'r') as dicter:
    vals = dicter.read()

In [None]:
# Show the raw data-dictionary text.
print(vals)

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from mpl_toolkits.mplot3d import Axes3D
from collections import Counter

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Tally how many columns there are of each dtype.
Counter(df.dtypes)

In [None]:
# Preview only the object-dtype columns.
df.select_dtypes(include='O').head()

In [None]:
# Number of non-null sub_area entries for each product_type.
df.groupby('product_type')['sub_area'].count()

In [None]:
# Group column names by the token before the first underscore and show the
# 20 most common prefixes.
majors = [name.split('_')[0] for name in df.columns]
major_counts = pd.DataFrame(Counter(majors), index=['val']).T
major_counts.sort_values('val', ascending=False).head(20)

In [None]:
# All columns whose name mentions "church".
church_cols = [name for name in df.columns if 'church' in name]
df[church_cols]

In [None]:
# Distance to the nearest big church against distance to the nearest mosque.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form works on older versions too.
sns.scatterplot(data=df, x='big_church_km', y='mosque_km', alpha=0.3);

In [None]:
# All columns whose name contains "cafe_avg_price".
cafe_price_cols = [name for name in df.columns if 'cafe_avg_price' in name]
df[cafe_price_cols]

In [None]:
# Alphabetical list of every column whose name contains "cafe".
sorted(name for name in df.columns if 'cafe' in name)

In [None]:
# Median cafe_avg_price_500 per sub_area, highest first.
(
    df.groupby('sub_area')['cafe_avg_price_500']
    .median()
    .reset_index()
    .sort_values('cafe_avg_price_500', ascending=False)
)

In [None]:
# Histogram of price_doc; the x-axis is clipped to 0 - 4e7.
df['price_doc'].hist(bins=300)
plt.xlim(0, 4e7);

In [None]:
# Price per square metre, stored as a new column on df.
# NOTE(review): the +1 in the denominator presumably guards against full_sq == 0;
# it also slightly lowers every value - confirm this is intended.
df['ppsm'] = df.price_doc / (df.full_sq + 1)

In [None]:
# Histogram of price per square metre, drawn with the pandas histogram used by
# the other distribution cells instead of the deprecated seaborn.distplot.
# Missing values are mapped to -1 so they fall outside the plotted x-range.
df.ppsm.fillna(-1).hist(bins=300)
plt.title('PRICE PER SQUARE METER')
plt.xlim(0, 0.4*1e6);

In [None]:
# NOTE(review): both numbers are hard-coded - 1.326463e+05 is presumably a ppsm
# median copied from an earlier output and 30 a RUB/USD rate; confirm and
# recompute from df if the data changes.
print(f'median price of 1 sq meter in Moscow in 2011 {int(1.326463e+05/30)} USD')

In [None]:
# The 20 sub_areas with the highest median price per square metre.
(
    df.groupby('sub_area')['ppsm']
    .median()
    .reset_index()
    .sort_values('ppsm', ascending=False)
    .head(20)
)

In [None]:
# The 20 sub_areas with the lowest median price per square metre.
(
    df.groupby('sub_area')['ppsm']
    .median()
    .reset_index()
    .sort_values('ppsm', ascending=False)
    .tail(20)
)

In [None]:
# df.full_sq.describe()

In [None]:
# Histogram of full_sq; the x-axis is clipped to 0 - 400.
df['full_sq'].hist(bins=300)
plt.xlim(0, 400);

In [None]:
# Histogram of life_sq; the x-axis is clipped to 0 - 400.
df['life_sq'].hist(bins=300)
plt.xlim(0, 400);

In [None]:
# life_sq against full_sq; both axes are clipped to 0 - 500.
sns.scatterplot(data = df, x = 'full_sq', y = 'life_sq')
plt.xlim(0,500)
plt.ylim(0,500);

In [None]:
# Median price per square metre, by product type, for the rows whose life_sq
# is larger than their full_sq.
life_exceeds_full = df['life_sq'] > df['full_sq']
df.loc[life_exceeds_full].groupby('product_type')['ppsm'].median()

In [None]:
# Histogram of the floor column; the x-axis is clipped to 0 - 50.
df['floor'].hist(bins=300)
plt.xlim(0, 50);

In [None]:
# Copy of df with missing values replaced by 0; reused by the first PCA cell.
ndf = df.fillna(0)

# 2-D t-SNE embedding of every column whose name contains "cafe".
# random_state is fixed so the embedding (and every plot built on it) is
# reproducible across kernel restarts.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
# adjust if the installed version warns about or rejects `n_iter`.
cafe_cols = [col for col in list(df) if 'cafe' in col]
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=0)
tsne_results = tsne.fit_transform(ndf[cafe_cols])

# Store the two embedding coordinates on df for the plotting cells.
df['tsne-2d-one'] = tsne_results[:,0]
df['tsne-2d-two'] = tsne_results[:,1]

In [None]:
# df.sub_area.nunique()

In [None]:
# rcParams is taken from matplotlib directly (same object as pylab's, without
# the discouraged pylab namespace).
from matplotlib import rcParams
rcParams['figure.figsize'] = 15, 15

# t-SNE embedding coloured by sub_area, one hue per distinct area.
# The legend is switched off explicitly with False, the documented value.
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="sub_area",
    palette=sns.color_palette("hls", df.sub_area.nunique()),
    data=df,
    legend=False,
    alpha=0.3);

In [None]:
# rcParams is taken from matplotlib directly (same object as pylab's, without
# the discouraged pylab namespace).
from matplotlib import rcParams
rcParams['figure.figsize'] = 20, 20

# t-SNE embedding coloured by product_type, one hue per distinct type.
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="product_type",
    palette=sns.color_palette("hls", df.product_type.nunique()),
    data=df,
    alpha=0.3);

In [None]:
# Scatter a 1000-row sample of the t-SNE embedding and label every point with
# its sub_area. The sample is seeded so the figure is reproducible.
sdf = df.sample(n=1000, random_state=0)
fig, ax = plt.subplots()
ax.scatter(sdf["tsne-2d-one"], sdf["tsne-2d-two"])

# Iterate over the three needed columns directly instead of building a Series
# per row with iterrows.
for area, x_pos, y_pos in zip(sdf["sub_area"], sdf["tsne-2d-one"], sdf["tsne-2d-two"]):
    ax.annotate(area, (x_pos, y_pos))

In [None]:
# Numeric feature columns, in frame order. The row id and the embedding / PCA
# columns this notebook adds to df are excluded by name rather than by
# position, so re-running the cell after later cells have added columns still
# gives the same list.
# NOTE(review): assumes 'id' is the first non-object column, which is what the
# original positional slice [1:-2] dropped - confirm.
non_feature_cols = {'id', 'tsne-2d-one', 'tsne-2d-two', 'pca-one', 'pca-two', 'pca-three'}
featcols = [col for col in df.select_dtypes(exclude='O').columns
            if col not in non_feature_cols]

In [None]:
# Number of selected feature columns.
len(featcols)

In [None]:
# 3-component PCA on the zero-filled feature matrix (no scaling applied here).
pca = PCA(n_components=3)
pca_result = pca.fit_transform(ndf[featcols].values)
# Store the component scores on df; a later cell overwrites these columns.
df['pca-one'] = pca_result[:,0]
df['pca-two'] = pca_result[:,1] 
df['pca-three'] = pca_result[:,2]
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
# Absolute loading of every feature on each principal component
# (features as rows, components as columns), top 20 by |PC-1|.
indices = ['PC-1','PC-2', 'PC-3']
pcs = pd.DataFrame(pca.components_, columns=featcols, index=indices).T.abs()
pcs.sort_values(indices[0], ascending = False).head(20)

In [None]:
# APPLYING NORMALIZATION AND REDOING STUFF
from sklearn.preprocessing import normalize
# Normalised copy of the feature columns, with missing values replaced by -1.
# NOTE(review): sklearn's normalize rescales each ROW to unit norm by default,
# not each column. The last two entries of featcols (presumably the price
# columns, sliced off as featcols[:-2] below) are still part of every row norm
# here - confirm this is intended.
nndf = pd.DataFrame(normalize(df[featcols].fillna(-1)), columns= featcols)

In [None]:
#DROPPING PRICE

In [None]:
# Repeat the 3-component PCA on the normalised frame, leaving out the last two
# feature columns.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(nndf[featcols[:-2]].values)
# Overwrites the pca-* columns written by the earlier PCA cell.
df['pca-one'] = pca_result[:,0]
df['pca-two'] = pca_result[:,1] 
df['pca-three'] = pca_result[:,2]
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
# Absolute loading of every feature on each principal component
# (features as rows, components as columns), top 20 by |PC-1|.
indices = ['PC-1','PC-2', 'PC-3']
pcs = pd.DataFrame(pca.components_, columns=featcols[:-2], index=indices).T.abs()
pcs.sort_values(indices[0], ascending = False).head(20)

In [None]:
from sklearn.preprocessing import RobustScaler
# Scaler instance; it is fitted where the scaled frame is built.
rb = RobustScaler()

In [None]:
# Rows whose product_type is anything other than 'Investment'.
idf = df[df.product_type != 'Investment']

# Robust-scaled features for those rows (missing values replaced by -1 first).
# The scaled frame reuses idf's index: without it the frame gets a fresh
# 0..n-1 index and the price assignment below, which aligns on index labels,
# would attach prices from the wrong rows (or NaN).
ridf = pd.DataFrame(rb.fit_transform(idf[featcols[:-2]].fillna(-1)),
                    columns=featcols[:-2], index=idf.index)
ridf['price'] = idf['price_doc']

In [None]:
# Summary statistics of full_sq before scaling.
idf.full_sq.describe()

In [None]:
# Summary statistics of full_sq after robust scaling.
ridf.full_sq.describe()

In [None]:
from sklearn.mixture import GaussianMixture as GM
from sklearn.cluster import DBSCAN
from sklearn.metrics import davies_bouldin_score, silhouette_score

In [None]:
# Fit Gaussian mixtures with 2..29 components on the scaled features and
# record, for each component count (IDX), the BIC, Davies-Bouldin score (BDS)
# and silhouette score (SS).
vals = []
ft= featcols[:-2]
gm_features = ridf[ft]  # loop-invariant input, selected once
for i in range(2,30):
    # random_state is fixed so the sweep is reproducible.
    gm = GM(n_components=i,n_init=10,tol=1e-3,max_iter=1000,random_state=0).fit(gm_features)
    # Predict once and reuse the labels for both clustering scores.
    gm_labels = gm.predict(gm_features)
    idx = {'IDX': i,
           'BIC': gm.bic(gm_features),
           'BDS': davies_bouldin_score(gm_features, gm_labels),
            'SS': silhouette_score(gm_features, gm_labels)}
    vals.append(idx)

In [None]:
# One row per fitted mixture, one column per recorded key.
scores = pd.DataFrame(vals)

In [None]:
# Silhouette and Davies-Bouldin scores against the number of mixture
# components. IDX is used for the x-axis so ticks show the component count
# rather than the row position in `scores`.
plt.plot(scores['IDX'], scores['SS'], color = 'red', label = 'SS')
plt.plot(scores['IDX'], scores['BDS'], color = 'orange', label = 'BDS')
plt.xlabel('n_components')
plt.legend();

In [None]:
# BIC against the number of mixture components (IDX on the x-axis so ticks
# show the component count rather than the row position in `scores`).
plt.plot(scores['IDX'], scores['BIC'], color = 'blue', label = 'BIC')
plt.xlabel('n_components')
plt.legend();

In [None]:
# vals = []
# Rebind ft to a small hand-picked set of four columns for the next experiments.
ft= ['life_sq', 'floor',
 'max_floor',
 'num_room',]

In [None]:
# Keep the ft columns plus product_type, dropping every row with a missing
# value in any of them.
nn = df[ft+['product_type']].dropna()

In [None]:
# 3-component PCA of the normalised ft columns.
# NOTE(review): sklearn's normalize rescales each row to unit norm by default -
# confirm row-wise (not per-feature) scaling is what is wanted here.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(normalize(nn[ft].values))
# Store the component scores on nn.
nn['pca-one'] = pca_result[:,0]
nn['pca-two'] = pca_result[:,1] 
nn['pca-three'] = pca_result[:,2]
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
# Names of the three PCA score columns stored on nn.
pcas = ['pca-one', 'pca-two', 'pca-three']

In [None]:
# Absolute loading of every ft column on each principal component
# (features as rows, components as columns), sorted by |PC-1|.
indices = ['PC-1','PC-2', 'PC-3']
pcs = pd.DataFrame(pca.components_, columns=ft, index=indices).T.abs()
pcs.sort_values(indices[0], ascending = False).head(20)

In [None]:
# Fit Gaussian mixtures with 2..7 components on the three PCA scores and
# record, per component count (IDX), the fitted model (GMM), BIC,
# Davies-Bouldin score (BDS) and silhouette score (SS).
vals = []
pca_scores = nn[pcas]  # loop-invariant input, selected once
for i in range(2,8):
    # random_state is fixed so the sweep is reproducible.
    gm = GM(n_components=i,n_init=10,tol=1e-3,max_iter=1000,random_state=0).fit(pca_scores)
    # Predict once and reuse the labels for both clustering scores.
    pca_labels = gm.predict(pca_scores)
    idx = {'IDX': i,
           'GMM':gm,
           'BIC': gm.bic(pca_scores),
           'BDS': davies_bouldin_score(pca_scores, pca_labels),
            'SS': silhouette_score(pca_scores, pca_labels)}
    vals.append(idx)
scores = pd.DataFrame(vals)

In [None]:
# Silhouette and Davies-Bouldin scores against the number of mixture
# components. IDX is used for the x-axis so ticks show the component count
# rather than the row position in `scores`.
plt.plot(scores['IDX'], scores['SS'], color = 'red', label = 'SS')
plt.plot(scores['IDX'], scores['BDS'], color = 'orange', label = 'BDS')
plt.xlabel('n_components')
plt.legend();

In [None]:
# Inspect the attributes of a default-constructed DBSCAN estimator.
# NOTE(review): get_params() is the public way to list estimator parameters.
dbsc = DBSCAN()
dbsc.__dict__

In [None]:
# Run DBSCAN for three eps values on the normalised ft columns and store each
# labelling on nn as dbc<eps>.
normalized_ft = normalize(nn[ft])
for ep in (0.1, 0.2, 0.3):
    dbsc = DBSCAN(eps=ep)
    dbsc.fit(normalized_ft)
    nn[f'dbc{ep}'] = dbsc.labels_

In [None]:
# Distinct labels produced with eps=0.3.
nn['dbc0.3'].unique()

In [None]:
# Report, for every DBSCAN labelling stored on nn, the number of distinct
# labels, the number of rows labelled -1, and the silhouette score.
# The score is computed on the same normalised matrix DBSCAN was fitted on,
# so distances match the space the clusters were found in.
# Note that rows labelled -1 are counted as a label of their own, as before.
dbscan_space = normalize(nn[ft])
for col in list(nn):
    if 'dbc' in col:
        n_labels = nn[col].nunique()
        n_noise = nn[nn[col] == -1].shape[0]
        # silhouette_score needs at least two labels and fewer labels than
        # samples; report NaN instead of raising otherwise.
        if 2 <= n_labels < len(nn):
            sil = silhouette_score(dbscan_space, nn[col])
        else:
            sil = float('nan')
        print(f' {col} number of clusters {n_labels} \n unclusterables {n_noise} \n silhouette score: {sil}\n\n')

In [None]:
# First two PCA scores coloured by the eps=0.1 DBSCAN labels.
# The palette is sized from the actual number of labels (as the other hue plots
# in this notebook do) instead of a hard-coded 7, which only matches when
# DBSCAN happens to return exactly seven labels.
sns.scatterplot(
    x="pca-one", y="pca-two",
    hue="dbc0.1",
    palette=sns.color_palette("hls", nn["dbc0.1"].nunique()),
    data=nn,
    legend='full',
    alpha=0.3
)

plt.show()

In [None]:
# Median of each ft column per eps=0.1 label, transposed so labels are columns.
nn.groupby('dbc0.1')[ft].agg(['median']).T

In [None]:
# Preview nn with the PCA and DBSCAN columns added.
nn.head()

In [None]:
# Distinct labels produced with eps=0.1.
nn['dbc0.1'].unique()

In [None]:
from time import time
import numpy as np
from scipy import ndimage
from matplotlib import pyplot as plt

from sklearn import manifold, datasets

# Load the scikit-learn digits dataset as a feature matrix X and label vector y.
X, y = datasets.load_digits(return_X_y=True)
n_samples, n_features = X.shape

# Seed NumPy's global RNG; nudge_images below draws from it.
np.random.seed(0)


In [None]:

def nudge_images(X, y):
    """Return the dataset extended with a randomly shifted copy of every image.

    Each row of ``X`` is viewed as an 8x8 image, shifted by an offset drawn
    from ``0.3 * N(0, 1)`` per axis (NumPy's global RNG), and flattened again.
    The shifted rows are appended after the originals and the labels are
    repeated to match.

    Parameters
    ----------
    X : array of shape (n, 64)
        Flattened 8x8 images.
    y : array of shape (n,)
        Labels for the rows of ``X``.

    Returns
    -------
    (X_out, y_out) : arrays with 2n rows / 2n labels, originals first.
    """
    def _jitter(flat_image):
        # Areas shifted in from outside the image use ndimage's 'constant' mode.
        offset = .3 * np.random.normal(size=2)
        return ndimage.shift(flat_image.reshape((8, 8)), offset, mode='constant').ravel()

    shifted = np.apply_along_axis(_jitter, 1, X)
    doubled_X = np.concatenate([X, shifted])
    doubled_y = np.concatenate([y, y], axis=0)
    return doubled_X, doubled_y


# Replace X and y with the doubled (original + shifted) dataset.
# NOTE: because X and y are rebound, re-running this cell doubles the data again.
X, y = nudge_images(X, y)




In [None]:
def plot_clustering(X_red, labels, title=None, targets=None):
    """Draw a 2-D embedding with one text glyph per sample.

    Parameters
    ----------
    X_red : array of shape (n, 2)
        Embedded coordinates; rescaled to [0, 1] per axis before plotting.
    labels : sequence of length n
        Cluster label per sample; selects the glyph colour.
    title : str, optional
        Figure title.
    targets : sequence of length n, optional
        Value drawn as the glyph text for each sample. When omitted, the
        module-level ``y`` is used, which is what this function always did.
    """
    if targets is None:
        targets = y  # backwards-compatible fallback to the notebook global

    x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)
    span = x_max - x_min
    # A constant axis would give 0/0; leave such an axis at 0 instead of NaN.
    span = np.where(span == 0, 1, span)
    X_red = (X_red - x_min) / span

    plt.figure(figsize=(6, 4))
    for i in range(X_red.shape[0]):
        # The colour scale assumes roughly ten clusters (label / 10).
        plt.text(X_red[i, 0], X_red[i, 1], str(targets[i]),
                 color=plt.cm.nipy_spectral(labels[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})

    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title, size=17)
    plt.axis('off')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])



In [None]:
# Reduce the digits data to two dimensions with a spectral embedding.
print("Computing embedding")
X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)
print("Done.")

from sklearn.cluster import AgglomerativeClustering

# Cluster the embedding into 10 groups with each linkage strategy, print the
# fit time, and draw the resulting labelling.
for linkage in ('ward', 'average', 'complete', 'single'):
    clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)
    t0 = time()
    clustering.fit(X_red)
    print("%s :\t%.2fs" % (linkage, time() - t0))

    plot_clustering(X_red, clustering.labels_, "%s linkage" % linkage)


plt.show()

In [None]:
import pandas as pd
# Plain scatter of the two embedding coordinates, without labels.
rrx = pd.DataFrame(X_red, columns = ['A', 'B'])
plt.scatter(rrx['A'], rrx['B'])

In [None]:
from sklearn.cluster import OPTICS, cluster_optics_dbscan

In [None]:
# Fit OPTICS on the embedding for three evenly spaced xi values between 0.0
# and 0.3 and draw each labelling.
for xier in np.linspace(0.0, 0.3, num=3):
    clust = OPTICS(min_samples=20, xi=xier, min_cluster_size=.05)
    clust.fit(X_red)
    print('\n\n')
    plot_clustering(X_red, clust.labels_, f"OPTICS {xier}")
    plt.show()

In [None]:
### DEFAULT

# OPTICS with all-default parameters on the same embedding, for comparison.
clust = OPTICS()
clust.fit(X_red)
plot_clustering(X_red, clust.labels_, "OPTICS DEFAULT")
plt.show()