# Python для Data Science: Итоговый проект

In [139]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime 
from sklearn import ensemble
from matplotlib import style
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score

style.use('fivethirtyeight')
%matplotlib inline

TEST_DATASET_PATH = 'test.csv'
TRAIN_DATASET_PATH = 'train.csv'

test_data = pd.read_csv(TEST_DATASET_PATH)
train_data = pd.read_csv(TRAIN_DATASET_PATH)

In [140]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


## Заполняем пропуски

In [141]:
train_data = train_data.sort_values('Price')
test_data = test_data.sort_values('DistrictId')
train_data = train_data.fillna(method='pad')
test_data = test_data.fillna(method='pad')

## Создаем и обучаем модель

In [142]:
X_train = pd.get_dummies(train_data)
X_train.drop("Price", axis=1, inplace=True)
X_train.drop("Id", axis=1, inplace=True)
y_train = train_data.Price

model1 = ensemble.GradientBoostingRegressor(n_estimators=442, max_depth=5, min_samples_split=3,
                                           learning_rate=0.1, loss='ls', random_state=45)
model2 = RandomForestRegressor(n_estimators=1500, max_depth=20, random_state=42, max_features=7)
model3 = LGBMRegressor(max_depth=7,
                             min_samples_leaf=10,
                             n_estimators=400,
                             random_state=42)

model = VotingRegressor([('model1', model1), ('model2', model2), ('model3', model3)])


In [143]:
cv_score = cross_val_score(model, X_train, y_train, 
                           scoring='r2', 
                           cv=KFold(n_splits=5, shuffle=True, random_state=42))
# cv_score
mean = cv_score.mean()
std = cv_score.std()

print('R2: {:.3f} +- {:.3f}'.format(mean, std))

R2: 0.759 +- 0.019


In [144]:
model.fit(X_train, y_train)

VotingRegressor(estimators=[('model1',
                             GradientBoostingRegressor(alpha=0.9,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=5,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=3,
                                                       min_weight_fraction_leaf=0.0,
                                               

## Предсказываем цены и выгружаем в файл

In [145]:
X_test = pd.get_dummies(test_data)
X_test.drop("Id", axis=1, inplace=True)
test_data["Price"] = model.predict(X_test)
test_data.loc[:, ['Id', 'Price']].to_csv('final_project_SS.csv', index=False)