In [1]:
import warnings
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
import reverse_geocoder as rg
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# Read the four input tables from the working directory.
features = pd.read_csv('features.csv')
# The price table is ';'-separated; its coordinate columns are renamed to
# 'lat'/'lon' so every frame exposes the same coordinate column names.
prices = (
    pd.read_csv('input_data.csv', sep=';')
    .rename(columns={'geo_lat': "lat", 'geo_lon': "lon"})
)
# The 'id' column is removed from train only; test keeps all of its columns.
train = pd.read_csv('train.csv').drop(columns='id')
test = pd.read_csv('test.csv')

Используя координаты, найдём ближайшие города (в столбец `city` записывается регион `admin1` ближайшего населённого пункта)

In [4]:
def _nearest_admin1(frame):
    """Reverse-geocode every row of ``frame`` and return its ``admin1`` values.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'lat' and 'lon' columns.

    Returns
    -------
    numpy.ndarray
        One ``admin1`` entry per row of ``frame``, in row order
        (assumes ``rg.search`` returns results in input order - TODO confirm).
    """
    points = [tuple(pair) for pair in frame[['lat', 'lon']].to_numpy()]
    matches = pd.DataFrame(rg.search(points))
    # Return a plain array so the caller assigns by position; assigning a
    # Series would align on index labels and silently produce NaN whenever
    # the frame's index is not 0..n-1.
    return matches['admin1'].to_numpy()


# NOTE(review): the column is called 'city' but stores the geocoder's 'admin1'
# field (presumably the first-level region, not the city name - confirm).
train['city'] = _nearest_admin1(train)
test['city'] = _nearest_admin1(test)

Loading formatted geocoded file...


Кластеризуем данные по координатам, чтобы после этого найти среднее по остальным признакам и применить к train и test.

In [5]:
# Cluster the rows of `features` into 110 groups using only their coordinates.
# random_state is fixed so the clustering is reproducible between runs.
kmeans = KMeans(n_clusters=110, n_init=10, random_state=42).fit(features[['lat', 'lon']])

# Tag every feature row with the cluster it was assigned to during fitting.
labels = kmeans.labels_
features['cluster'] = labels

In [6]:
# Collapse `features` to one row per cluster (mean of every column); the
# averaged coordinates are dropped. The result is indexed by 'cluster'.
# Note: this rebinds `features`, so the cell cannot be re-run on its own output.
features = features.groupby('cluster').mean().drop(columns=['lat', 'lon'])

In [7]:
# Assign every row of each table to its cluster, using the KMeans model
# fitted on the feature coordinates.
for frame in (train, test, prices):
    frame['cluster'] = kmeans.predict(frame[['lat', 'lon']])

In [8]:
# Mean price per cluster; `prices` is rebound to this aggregated table.
prices = prices[['cluster', 'price']].groupby('cluster').mean()

# Attach the mean price to the per-cluster feature table. The assignment
# aligns on the index, so a cluster missing from `prices` ends up with NaN.
features['price'] = prices['price']

In [9]:
# Give every train/test row the aggregated values of its cluster (left join
# keeps all original rows), then discard the join key.
train = train.merge(features, how='left', on='cluster').drop(columns='cluster')
test = test.merge(features, how='left', on='cluster').drop(columns='cluster')

Создадим ещё несколько фич, основываясь на координатах

In [10]:
def add_coordinate_features(frame, pca):
    """Return a copy of ``frame`` with PCA and rotated-coordinate columns added.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'lat' and 'lon' columns.
    pca : fitted transformer
        Object whose ``transform`` maps an (n, 2) coordinate array to an
        array with at least two columns (here: PCA fitted on train coordinates).

    Returns
    -------
    pandas.DataFrame
        ``frame`` plus the columns pca1, pca2, rot45_1, rot45_2, rot30_1 and
        rot30_2, appended in that order.
    """
    lat, lon = frame['lat'], frame['lon']
    # Transform once and slice both components from the same result.
    projected = pca.transform(frame[['lat', 'lon']].to_numpy())
    # 0.707 ~ cos(45 deg) = sin(45 deg); 0.866 ~ cos(30 deg); 0.5 = sin(30 deg).
    # The second component of each rotation uses a minus sign. With a plus
    # sign rot45_2 would be an exact duplicate of rot45_1.
    return frame.assign(
        pca1=projected[:, 0],
        pca2=projected[:, 1],
        rot45_1=(0.707 * lat) + (0.707 * lon),
        rot45_2=(0.707 * lon) - (0.707 * lat),
        rot30_1=(0.866 * lat) + (0.5 * lon),
        rot30_2=(0.866 * lon) - (0.5 * lat),
    )


# Fit the PCA on the train coordinates only and reuse the same fitted object
# for test, so both frames are projected onto identical axes.
coordinates = train[['lat', 'lon']].values
pca_obj = PCA().fit(coordinates)

train = add_coordinate_features(train, pca_obj)
test = add_coordinate_features(test, pca_obj)

In [12]:
# The raw location columns are not exported; only the features derived from
# them remain in the frames.
location_columns = ['city', 'lat', 'lon']
train = train.drop(columns=location_columns)
test = test.drop(columns=location_columns)

In [13]:
# Scale the columns that train and test have in common. Columns present in
# only one frame are left untouched (presumably the target in train and 'id'
# in test - verify against the CSV schemas).
shared_columns = [col for col in train.columns if col in test.columns]

scaler = MinMaxScaler()
if shared_columns:
    # fit_transform/transform return new arrays and do not modify the frame,
    # so the results must be written back; otherwise this cell is a no-op.
    train[shared_columns] = scaler.fit_transform(train[shared_columns])
    # Reuse the ranges learned on train so both frames share one scale;
    # test values may therefore fall slightly outside [0, 1].
    test[shared_columns] = scaler.transform(test[shared_columns])

In [14]:
# Persist the engineered tables without the index column.
for filename, frame in (('new_test.csv', test), ('new_train.csv', train)):
    frame.to_csv(filename, index=False)