>## <font color=darkcyan> EDA and Property Segmentation:</font>
>### Preliminary EDA and clustering for property data. Things I did not do:
 -  More feature engineering for k-means
 -  k-means results are sensitive to the order of observations, so it is worth running the algorithm several times, shuffling the data in between, averaging the resulting cluster centers, and running 
    the final evaluation with those averaged centers as starting points. I ran k-means only once.
 -  Advanced outlier removal (I relied on visual inspection)
 -  Tuning DBSCAN
 -  Final descriptive analysis of the clusters
 -  Advanced techniques for imputing missing values



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.display import display, HTML

# Load the raw property listings and take a first look at them.
home = pd.read_csv('../input/property-data/property.csv')
display(home.head())
print(home.shape)

# Number of missing values per column, restricted to columns that have any.
missing = home.isnull().sum()
missing = missing.loc[missing > 0]
display(missing)

# Bar chart of those per-column missing-value counts.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(12, 5))
plt.title('Missing Vals Dists')
missing.plot.bar(color='teal')


>## <font color=darkcyan> Agile EDA</font>

In [None]:
from scipy.stats import norm, beta
sns.set()

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so use whichever spelling this
# installation actually ships instead of hard-coding the legacy one.
poster_style = 'seaborn-poster'
if poster_style not in plt.style.available:
    poster_style = 'seaborn-v0_8-poster'
plt.style.use(poster_style)

# Histogram + KDE of monthly rent, with a fitted normal curve for reference.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here because its `fit=` overlay has no one-line replacement.
plt.figure(figsize=(10, 4))
sns.distplot(home['monthly_rent'] , bins=80, kde=True, hist=True, fit=norm, color = 'teal');

In [None]:


# Histogram + KDE of unit area, with a fitted normal curve for reference.
hist_style = dict(bins=80, kde=True, hist=True, fit=norm, color='teal')
plt.figure(figsize=(10, 4))
sns.distplot(home['unit_area'], **hist_style);

In [None]:


# Histogram + KDE of the deposit, with a fitted normal curve for reference.
hist_style = dict(bins=80, kde=True, hist=True, fit=norm, color='teal')
plt.figure(figsize=(10, 5))
sns.distplot(home['deposit'], **hist_style);

In [None]:

# Histogram + KDE of property age, with a fitted normal curve for reference.
hist_style = dict(bins=80, kde=True, hist=True, fit=norm, color='teal')
plt.figure(figsize=(10, 5))
sns.distplot(home['property_age'], **hist_style);

In [None]:
# Summary statistics of the data as loaded (before any rows are removed).
home.describe()

### Removing Noise

In [None]:
# Treat very small units (area below 20) and very old properties (age above
# 55) as noise. Rows where the value is missing compare False and are kept.
too_small = home['unit_area'] < 20
too_old = home['property_age'] > 55
noisy_rows = home.index[(too_small | too_old).to_numpy()]

home.drop(index=noisy_rows, inplace=True)
home.reset_index(drop=True, inplace=True)
home.shape

In [None]:
# Categorical columns that are replaced by integer codes.
cats = ['district_uuid', 'has_elevator', 'has_storage_area']

from sklearn.preprocessing import *

# Each column gets its own encoder; values are cast to str first so the
# encoder sees a single, sortable type.
for col in cats:
    le = LabelEncoder()
    encoded = le.fit_transform(home[col].astype('str'))
    home[col] = encoded



In [None]:
# Every column except the row identifier is used as a feature.
feats = [name for name in home.columns if name != 'item_id']
feats

In [None]:
# Keep a pre-scaling copy (it still has item_id) to attach cluster labels to.
home_orig = home.copy()

# Features only: drop the identifier, then fill gaps with the column mean.
home = home.drop(columns='item_id')
home = home.fillna(home.mean())

# Standardize every feature and rebuild a frame with the feature names.
rb = StandardScaler()
home = pd.DataFrame(rb.fit_transform(home), columns=feats)
home.head()

>## <font color=SkyBlue> DBSCAN</font>


In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the standardized features.
clustering = DBSCAN(eps=3, min_samples=2)
clustering.fit(home)
clustering.labels_

In [None]:
# Store each row's DBSCAN label next to its features.
dbscan_labels = pd.Series(clustering.labels_)
home['ClusterDBSCAN'] = dbscan_labels

# Number of distinct labels found (the -1 label counts as one of them).
print(len(home['ClusterDBSCAN'].unique()))

# Rows labelled -1.
home.loc[home['ClusterDBSCAN'] == -1]

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Project the feature columns only. `home` also carries the ClusterDBSCAN
# label column at this point; feeding it to t-SNE/PCA would let the labels
# shape the very projection that is used to judge them.
X = home.drop(columns=['ClusterDBSCAN'])

# 2-D t-SNE embedding, tagged with the DBSCAN label of each row.
Xtsne = TSNE(n_components=2).fit_transform(X)
dftsne = pd.DataFrame(Xtsne)
dftsne['cluster'] = clustering.labels_
dftsne.columns = ['x1','x2','cluster']

# 2-D PCA projection of the same features, tagged the same way.
pca2 = PCA(n_components=2)
skillsPCA2 = pca2.fit_transform(X)
dfskillsPCA2 = pd.DataFrame(skillsPCA2)
dfskillsPCA2['cluster'] = clustering.labels_
dfskillsPCA2.columns = ['x1','x2','cluster']

# Side-by-side scatter plots: t-SNE on the left, PCA on the right.
fig, ax = plt.subplots(1, 2, figsize=(30,15))
sns.scatterplot(data=dftsne,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5,palette="Set1", ax=ax[0])
ax[0].set_title('Visualized on TSNE 2D')
sns.scatterplot(data=dfskillsPCA2,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5,palette="Set1",ax=ax[1])
ax[1].set_title('Visualized on PCA 2D')
fig.suptitle('Comparing clustering result when visualized using TSNE2D vs. PCA2D')
#display(fig)

In [None]:
# Shapes of the scaled frame and of the copy taken before scaling.
home.shape, home_orig.shape

In [None]:
# Preview the scaled features (now including the ClusterDBSCAN column).
home.head()

In [None]:
# Preview the copy taken before scaling, for comparison.
home_orig.head()

In [None]:
# Step 1: remove the rows DBSCAN labelled -1 from both frames, then renumber
# both so their row positions keep matching.
lr = home.loc[home['ClusterDBSCAN'] == -1].index
home.drop(index=lr, inplace=True)
home_orig.drop(index=lr, inplace=True)
for frame in (home, home_orig):
    frame.reset_index(drop=True, inplace=True)

# The label column has served its purpose; keep only features in `home`.
del home['ClusterDBSCAN']

# Step 2: remove rows whose standardized monthly_rent exceeds 15, again from
# both frames (the indices are not renumbered after this step).
lr2 = home.loc[home['monthly_rent'] > 15].index
home.drop(index=lr2, inplace=True)
home_orig.drop(index=lr2, inplace=True)

home.to_csv('home_noiseRem.csv', index=False)
home.shape, home_orig.shape

In [None]:
import matplotlib.style as style
style.use('ggplot')
sns.set_style('whitegrid')
plt.subplots(figsize = (14,10))

# Pairwise feature correlations, computed once and reused for both the mask
# and the heatmap.
corr = home.corr()

# Generate a mask for the upper triangle (taken from seaborn example gallery).
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True


sns.heatmap(corr, cmap=sns.diverging_palette(20, 220, n=200), annot=True, mask=mask, center = 0, );
## Give title. 
plt.title("Heatmap of all the Features", fontsize = 25);


>## <font color=teal> GMM</font>


>#### **Note:**
>#### Though GMM is often categorized as a clustering algorithm, fundamentally it is an algorithm for density estimation. 
>#### That is to say, the result of a GMM fit to some data is technically not a clustering model, but a generative probabilistic
>#### model describing the distribution of the data. Here, the GMM component assignment is used as an additional input feature for k-means!


In [None]:
# Last rows of the scaled frame; the index may show gaps because it was not
# reset after the monthly_rent rows were dropped.
home.tail()

In [None]:
# Last rows of the pre-scaling copy, to check it still lines up with `home`.
home_orig.tail()

In [None]:
from sklearn.mixture import GaussianMixture as GMM

# Fit one full-covariance mixture for every component count in 40..59.
n_components = np.arange(40, 60)
models = [
    GMM(n, covariance_type='full', random_state=0).fit(home)
    for n in n_components
]

# Information criteria of each fitted mixture on the training data.
bic_scores = [model.bic(home) for model in models]
aic_scores = [model.aic(home) for model in models]

plt.style.use('ggplot')
plt.figure(figsize=(10, 5))
plt.plot(n_components, bic_scores, label='BIC')
plt.plot(n_components, aic_scores, label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components');

In [None]:

# Final mixture: 51 diagonal-covariance components.
gmm = GMM(
    n_components=51,
    covariance_type='diag',
    max_iter=300,
    random_state=42,
)
gmm.fit(home)

# Most likely component for every row.
labels = gmm.predict(home)

# Attach the labels by position to the pre-scaling copy.
home_orig['Cluster_GMM'] = labels.tolist()
home_orig.head()

In [None]:
# GMM component index predicted for each row.
labels

In [None]:
# Number of items (non-missing item_id values) in each GMM cluster.
by_gmm_cluster = home_orig.groupby('Cluster_GMM')
cCs = by_gmm_cluster['item_id'].count()
cCs

In [None]:
# Last rows of the pre-scaling copy, now with the Cluster_GMM column.
home_orig.tail()

In [None]:
# Shapes of the scaled frame and of the pre-scaling copy.
home.shape, home_orig.shape

In [None]:
# Add the GMM component index to the scaled features as an extra column.
gmm_label_list = labels.tolist()
home['Cluster_GMM'] = gmm_label_list
home.head()

In [None]:
# Persist the scaled features together with the Cluster_GMM column; this file
# is read back in as the input for the k-means section.
home.to_csv('home_w_gmmcluster.csv', index=False)

In [None]:
# Remove the label column again so `home` holds only the scaled features.
del home['Cluster_GMM']

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# 2-D t-SNE embedding of the scaled features, tagged with the GMM labels.
X = home.copy()
Xtsne = TSNE(n_components=2).fit_transform(X)
dftsne = pd.DataFrame(Xtsne)
dftsne['cluster'] = labels
dftsne.columns = ['x1','x2','cluster']

# 2-D PCA projection of the same features, tagged with the GMM labels.
pca2 = PCA(n_components=2)
skPCA2 = pca2.fit_transform(home)
dfskPCA2 = pd.DataFrame(skPCA2)
dfskPCA2['cluster'] = labels
dfskPCA2.columns = ['x1','x2','cluster']

# Side-by-side scatter plots: t-SNE on the left, PCA on the right.
fig, ax = plt.subplots(1, 2, figsize=(35,22))
sns.scatterplot(data=dftsne,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5,palette="Set2", ax=ax[0])
ax[0].set_title('Visualized on TSNE 2D')
# Plot the GMM projection built above. The earlier code passed dfskillsPCA2,
# the frame left over from the DBSCAN cell, which still carries the DBSCAN
# labels and the rows that were dropped afterwards.
sns.scatterplot(data=dfskPCA2,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5,palette="Set2",ax=ax[1])
ax[1].set_title('Visualized on PCA 2D')
fig.suptitle('Comparing clustering result when visualized using TSNE2D vs. PCA2D')
#display(fig)


>## <font color=darkcyan> KMeans (with density estimation feature!)</font>


>#### **Note:**
**k-means results are sensitive to the order of observations, so it is worth running the algorithm several times, shuffling the data in between, averaging 
the resulting cluster centers, and running the final evaluation with those averaged centers as starting points. 
I ran k-means only once.**





In [None]:
# Reload the scaled features saved earlier; this version includes the
# Cluster_GMM column, so the GMM label is part of the k-means input.
home =pd.read_csv('home_w_gmmcluster.csv')
home.head()

In [None]:
from sklearn.decomposition import PCA

# Fit a 4-component PCA once. The earlier code re-fitted the same model on
# the same data right after transforming, which only repeated the work and
# meant the variance printed below came from a different fit than the one
# that produced pca_samples.
# NOTE(review): Cluster_GMM is an integer label while the other columns were
# standardized, so it probably dominates the leading component - confirm that
# this is intended.
pca = PCA(n_components=4).fit(home)
pca_samples = pca.transform(home)


#cumulative explained variance
print (np.cumsum(pca.explained_variance_ratio_))

In [None]:
# Reduce the k-means input to its first four principal components.
pca = PCA(n_components=4)
pca.fit(home)
reduced_data = pca.transform(home)



In [None]:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

random_state = 7

# Candidate cluster counts, largest first. Defined once so that the sweep and
# the x-axis of the plot cannot drift apart.
cluster_counts = range(35, 17, -1)

# Mean silhouette coefficient for each candidate count, in sweep order.
sil_scores = []

for i in cluster_counts:
    clusterer = KMeans(i, random_state=random_state).fit(reduced_data)

    # Cluster assignment of every row under this model.
    preds = clusterer.predict(reduced_data)

    # Mean silhouette coefficient over all rows for this cluster count.
    score = silhouette_score(reduced_data, preds)
    sil_scores.append(score)
    print(i, 'clusters:', score.round(5))

# plot the scores
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 7))
plt.style.use("fivethirtyeight")
_ = plt.plot(list(cluster_counts), sil_scores, '-o')

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn import preprocessing

# Final k-means model with 21 clusters on the PCA-reduced data.
# With init='random' and a single initialisation the result depends entirely
# on the random draw, so the seed is pinned (same value as the silhouette
# sweep) to make the exported cluster ids reproducible between runs.
km = KMeans(init='random', n_clusters=21, verbose=0,
            n_init=1, max_iter=200, random_state=7)
km.fit(reduced_data)

# Cluster index assigned to each row.
l = km.labels_

print("LABLES")
print(l)



In [None]:
# Attach the k-means labels by position to the pre-scaling copy.
km_label_list = pd.Series(l).tolist()
home_orig['Cluster_Km'] = km_label_list
home_orig.head(20)

In [None]:
# Number of items (non-missing item_id values) in each k-means cluster.
by_km_cluster = home_orig.groupby('Cluster_Km')
cCs = by_km_cluster['item_id'].count()
cCs

In [None]:
# Preview the pre-scaling copy with both cluster columns attached.
home_orig.head()

In [None]:
# Export the pre-scaling data with its cluster columns. The frame's index is
# written as the first column because `index=False` is not passed.
home_orig.to_csv('home_orig_clusters_full.csv')

In [None]:
from sklearn.manifold import TSNE

# Embed the frame re-read from home_w_gmmcluster.csv into 2-D with t-SNE.
X = home.copy()
Xtsne = TSNE(n_components=2).fit_transform(X)

# One row per point: embedding coordinates plus the k-means label.
dftsne = pd.DataFrame(Xtsne, columns=['x1', 'x2'])
dftsne['cluster'] = km.labels_

plt.figure(figsize=(15,10))
plt.title('KMeans Result')
sns.scatterplot(data=dftsne, x='x1', y='x2', hue='cluster', legend="full", alpha=0.5)

# Free the temporary copy.
del X

## Birch


In [None]:
from sklearn.cluster import Birch

# Birch clustering into 20 clusters on the same frame used for k-means.
birch = Birch(threshold=.5, branching_factor=50, n_clusters=20)
birch.fit(home)

# Cluster index assigned to each row.
birch_labels = birch.predict(home)

In [None]:
# Birch cluster index predicted for each row.
birch_labels

In [None]:
from sklearn.manifold import TSNE

# Embed the clustered frame into 2-D with t-SNE.
X = home.copy()
Xtsne = TSNE(n_components=2).fit_transform(X)

# One row per point: embedding coordinates plus the Birch label.
dftsne = pd.DataFrame(Xtsne, columns=['x1', 'x2'])
dftsne['cluster'] = birch_labels

plt.figure(figsize=(15,10))
plt.title('Birch Result')
sns.scatterplot(data=dftsne, x='x1', y='x2', hue='cluster', legend="full", palette='bone', alpha=0.5)

# Free the temporary copy.
del X