In [None]:
import csv
import os
import importlib

import pandas as pd
import numpy as np
import datetime
import matplotlib.pylab as plt
import seaborn as sns; sns.set()

import utils 
import my_transformers
import const

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

In [None]:
# Reload the local helper module so edits to utils.py are picked up without restarting the kernel.
importlib.reload(utils)

# PrepareData exposes the prepared frame on its `.df` attribute; how the data is loaded is defined in utils.
df = utils.PrepareData().df
df.head()

In [None]:
# Reload the transformer module, then re-import the class names so they bind to the freshly loaded definitions.
importlib.reload(my_transformers)
from my_transformers import DropColumns, YearTransformer, ColumnToDateFormat, Drop33Rooms

# Cleaning pipeline built from the project's custom transformers (their behaviour is defined in my_transformers).
transform_pipeline = Pipeline([
        ('yr_built_transformer', YearTransformer(column='yr_built')),
        ('33_bedrooms_row_drop', Drop33Rooms()),
        ('clean_flaw', DropColumns(columns=['id', 'date', 'yr_renovated'])),
        # Disabled alternatives kept for experimentation:
        # ('clean_flaw', DropColumns(columns=['bathrooms'])),
        # ('clean_geodata', DropColumns(columns=['zipcode', 'lat', 'long'])),
        # NOTE(review): 'price' is dropped while later cells use 'price_bin' as the target --
        # presumably price_bin is derived from price upstream; confirm in utils.PrepareData.
        ('clean_duplicate', DropColumns(columns=['price'])),
     ])
# transform() is called without a prior fit() -- presumably every step is stateless; confirm in my_transformers.
ndf = transform_pipeline.transform(df)
ndf.head()

In [None]:
# Column dtypes and non-null counts of the transformed frame.
ndf.info()

In [None]:
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

ndf['price_bin'].hist(ax=ax[0])
ax[0].set_title('all data')

sdf['price_bin'].hist(ax=ax[1])
ax[1].set_title('filtered data')

plt.show()


In [None]:
sdf = utils.get_equal_samples(ndf, column='price_bin', mult_coef=1)
sdf['price_bin'].value_counts()

In [None]:
# Class counts in the full (unsampled) frame, for comparison with the counts of the sampled frame `sdf`.
ndf['price_bin'].value_counts()

In [None]:
# Distribution of `waterfront` for all rows vs. rows with price_bin == 1,
# shown for the full frame (left) and the sampled frame (right).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
f.suptitle('waterfront distribution')

ndf['waterfront'].hist(ax=ax[0], label='all sales')
ndf.loc[ndf['price_bin'] == 1, 'waterfront'].hist(ax=ax[0], label='over 1m sales')
ax[0].set_title('all data')
ax[0].legend()

sdf['waterfront'].hist(ax=ax[1], label='all sales')
# The mask must be built from sdf itself: a mask taken from ndf has a different
# index/length and either gets silently re-aligned or raises an indexing error.
sdf.loc[sdf['price_bin'] == 1, 'waterfront'].hist(ax=ax[1], label='over 1m sales')
ax[1].set_title('filtered data')
ax[1].legend()

plt.show()

In [None]:
# Distribution of `view` for all rows vs. rows with price_bin == 1,
# shown for the full frame (left) and the sampled frame (right).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
f.suptitle('view distribution')

ndf['view'].hist(ax=ax[0], label='all sales')
ndf.loc[ndf['price_bin'] == 1, 'view'].hist(ax=ax[0], label='over 1m sales')
ax[0].set_title('all data')
ax[0].legend()

sdf['view'].hist(ax=ax[1], label='all sales')
# The mask must be built from sdf itself: a mask taken from ndf has a different
# index/length and either gets silently re-aligned or raises an indexing error.
sdf.loc[sdf['price_bin'] == 1, 'view'].hist(ax=ax[1], label='over 1m sales')
ax[1].set_title('filtered data')
ax[1].legend()

plt.show()

In [None]:
# Distribution of `condition` for all rows vs. rows with price_bin == 1,
# shown for the full frame (left) and the sampled frame (right).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
f.suptitle('condition value distribution')

ndf['condition'].hist(ax=ax[0], label='all sales')
ndf.loc[ndf['price_bin'] == 1, 'condition'].hist(ax=ax[0], label='over 1m sales')
ax[0].set_title('all data')
ax[0].legend()

sdf['condition'].hist(ax=ax[1], label='all sales')
# The mask must be built from sdf itself: a mask taken from ndf has a different
# index/length and either gets silently re-aligned or raises an indexing error.
sdf.loc[sdf['price_bin'] == 1, 'condition'].hist(ax=ax[1], label='over 1m sales')
ax[1].set_title('filtered data')
ax[1].legend()

plt.show()

In [None]:
# Split the sampled frame into train/test parts: `price_bin` is the target,
# every other column is a feature. Split size and seed come from const.
train_X, test_X, train_y, test_y = train_test_split(
    sdf.drop(columns=['price_bin']),
    sdf['price_bin'],
    random_state=const.RANDOM_STATE,
    test_size=const.TEST_SIZE,
)

# Show the four resulting shapes on one line as a quick sanity check.
' - '.join(str(part.shape) for part in (train_X, test_X, train_y, test_y))

# Logistic Regression

In [None]:
# Baseline classifier: multinomial logistic regression fitted with L-BFGS.
# C is the inverse regularisation strength, so C=8 regularises less than the default.
lr = LogisticRegression(C=8, solver='lbfgs', multi_class='multinomial')
lr.fit(train_X, train_y)

In [None]:
# Confusion matrix of the fitted logistic regression on the held-out test split.
test_y_pred = lr.predict(test_X)
confusion_matrix(y_true=test_y, y_pred=test_y_pred)

In [None]:
# Out-of-fold predictions on the training split (2 folds), summarised as a confusion matrix.
train_y_pred = cross_val_predict(estimator=lr, X=train_X, y=train_y, cv=2)
confusion_matrix(y_true=train_y, y_pred=train_y_pred)

In [None]:
# Hyper-parameter search for the logistic regression.
# The grid is split in two because the 'liblinear' solver cannot fit a
# multinomial model; listing that combination only yields failed fits and NaN scores.
param_grid = [
    {
        'multi_class': ['auto', 'ovr'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'C': [3, 5, 8, 10, 12],
    },
    {
        'multi_class': ['multinomial'],
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'C': [3, 5, 8, 10, 12],
    },
]
# Score with accuracy: 'r2' is a regression metric and is not meaningful for
# class labels. Accuracy also matches the metric used for the SGD classifier.
grid = GridSearchCV(estimator=lr,
                    param_grid=param_grid,
                    scoring='accuracy',
                    verbose=1,
                    n_jobs=-1)

grid_result = grid.fit(train_X, train_y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

In [None]:
# SGDClassifier with its default settings; the seed from const keeps runs reproducible.
sgd = SGDClassifier(random_state=const.RANDOM_STATE)
sgd.fit(train_X, train_y)

In [None]:
# 4-fold cross-validated accuracy of the SGD classifier on the training split.
cross_val_score(estimator=sgd, X=train_X, y=train_y, scoring='accuracy', cv=4)

In [None]:
# Precision/recall as a function of the decision threshold for the SGD classifier.
# precision_recall_curve needs continuous scores: with hard class predictions it
# only sees a couple of distinct values and the curve degenerates. Request the
# out-of-fold decision_function values instead.
# NOTE(review): assumes price_bin is binary (precision_recall_curve requires it).
train_y_scores = cross_val_predict(sgd, train_X, train_y, cv=5,
                                   method='decision_function')
precision, recall, threshold = precision_recall_curve(train_y, train_y_scores)

# precision/recall have one more element than threshold, hence the [:-1].
plt.plot(threshold, precision[:-1], 'b', label='precision')
plt.plot(threshold, recall[:-1], 'g', label='recall')
plt.xlabel('threshold')
plt.ylim(0, 1)
plt.legend()  # the line labels above are only shown once a legend is drawn
plt.show()


# KNN

In [None]:
from sklearn import neighbors, metrics, svm
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [None]:
# k-nearest-neighbours baseline (k=2, unweighted votes), scored on the test split.
knn = neighbors.KNeighborsClassifier(weights='uniform', n_neighbors=2)
knn.fit(train_X, train_y)

predict = knn.predict(test_X)
accuracy = metrics.accuracy_score(y_true=test_y, y_pred=predict)
accuracy

In [None]:
# Confusion matrix for the KNN predictions on the test split.
conf_mat = confusion_matrix(y_true=test_y, y_pred=predict)
conf_mat

# SVM

In [None]:
# Support vector classifier with default settings, scored on the test split.
_svm = svm.SVC()
_svm.fit(train_X, train_y)

prediction = _svm.predict(test_X)
accuracy = metrics.accuracy_score(y_true=test_y, y_pred=prediction)

print("predictions:", prediction)
print("accuracy: ", accuracy)

In [None]:
# Confusion matrix for the SVM on the test split.
# Uses `prediction` (the SVM output); `predict` holds the earlier KNN predictions.
conf_mat = confusion_matrix(test_y, prediction)
conf_mat

# PCA

In [None]:
# Columns of the sampled frame; every column except price_bin is passed to PCA below.
sdf.columns

In [None]:
# Learn a 2-component PCA on the training features only, then project every
# sampled row (train and test) onto those two components.
model = PCA(n_components=2)
X_2D = model.fit(train_X).transform(sdf.drop(columns=['price_bin']))

In [None]:
# Attach the two principal components to a copy of the sampled frame and plot
# them coloured by class.
df_pca = sdf.copy()
df_pca['PCA1'] = X_2D[:, 0]
df_pca['PCA2'] = X_2D[:, 1]
# x/y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 on. The keyword form works in both.
sns.lmplot(x="PCA1", y="PCA2", hue='price_bin', data=df_pca, fit_reg=True)

In [None]:
# Same projection with three components; the columns are added to the existing
# df_pca frame under PCA_1..PCA_3 (distinct from the 2D columns PCA1/PCA2).
model = PCA(n_components=3)
X_3D = model.fit(train_X).transform(sdf.drop(columns=['price_bin']))

df_pca['PCA_1'] = X_3D[:, 0]
df_pca['PCA_2'] = X_3D[:, 1]
df_pca['PCA_3'] = X_3D[:, 2]


In [None]:
# 3D scatter of the three principal components, coloured by price_bin.
# The import is kept because older Matplotlib releases only register the
# '3d' projection once mpl_toolkits.mplot3d has been imported.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

fig = plt.figure(figsize=(6, 6))

# Create the axes through the figure: constructing Axes3D(fig) directly no
# longer attaches the axes to the figure in current Matplotlib, which leaves
# an empty plot.
ax = fig.add_subplot(111, projection='3d')

ax.scatter(df_pca['PCA_1'],
           df_pca['PCA_2'],
           df_pca['PCA_3'],
           c=df_pca['price_bin'],
           marker='o',
           cmap='jet')

ax.set_xlabel('PCA1 Label')
ax.set_ylabel('PCA2 Label')
ax.set_zlabel('PCA3 Label')

plt.show()