In [None]:
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

warnings.filterwarnings('ignore')
%matplotlib inline

## Models used for prediction
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## Dimensionality reduction
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb

## Hyperparameter search and evaluation
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split, RepeatedKFold

from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.options.display.max_columns = None

# Read in data

In [None]:
# Both input files are read as space-delimited text.
Train_data = pd.read_csv('used_car_train_20200313.csv', delimiter=' ')
Test_data = pd.read_csv('used_car_testB_20200421.csv', delimiter=' ')

In [None]:
# Preview the first rows with columns laid out vertically (one column per row).
Train_data.head().transpose()

# Data Preprocessing

In [None]:
# '-' is used as a placeholder in this column; turn it into a real missing value.
Train_data['notRepairedDamage'] = Train_data['notRepairedDamage'].replace({'-': np.nan})
# Frequency of the remaining values (NaN is excluded by value_counts by default).
Train_data['notRepairedDamage'].value_counts()

In [None]:
# Features: everything except the target and the 'SaleID' / 'name' columns.
X = Train_data.drop(columns=['price', 'SaleID', 'name'])
# Target: the sale price.
Y = Train_data['price']

In [None]:
# Cast the cleaned column to float so it is numeric for the models.
X['notRepairedDamage'] = X['notRepairedDamage'].astype('float')

In [None]:
# Hold out 20% of the rows as a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=100, shuffle=True
)
# Cross-validation splitter: 10 folds, repeated 3 times, with a fixed seed.
cv = RepeatedKFold(n_repeats=3, n_splits=10, random_state=1)

# Model Building
1. LGBM model
2. XGB model
3. Weighted average (blend) of the 2 models

In [None]:
# LightGBM regressor with library-default hyperparameters, fitted on the training split.
model_1 = LGBMRegressor()
model_1.fit(X=X_train, y=y_train)

In [None]:
# Train data
# In-sample predictions are kept because the blending cell reuses them.
y_train_1 = model_1.predict(X_train)
# Cross-validate on the training split only. scikit-learn reports the
# 'neg_mean_absolute_error' scorer as a negated value (higher is better),
# so flip the sign to print a real, positive MAE.
n_scores = -cross_val_score(model_1, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE for train: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))


# Test data
# Score the model that was fitted on the training split against the held-out
# rows. Running cross_val_score on (X_test, y_test) would clone and refit the
# estimator on folds of the test split, which never evaluates model_1 itself.
y_test_1 = model_1.predict(X_test)
print('MAE for test: %.3f' % mean_absolute_error(y_test, y_test_1))

In [None]:
# XGBoost regressor with library-default hyperparameters, fitted on the training split.
model_2 = xgb.XGBRegressor()
model_2.fit(X=X_train, y=y_train)

In [None]:
# Train data
# In-sample predictions are kept because the blending cell reuses them.
y_train_2 = model_2.predict(X_train)
# Use the same repeated K-fold splitter as the LGBM evaluation so the two
# cross-validated numbers are comparable. scikit-learn reports the
# 'neg_mean_absolute_error' scorer as a negated value, so flip the sign to
# print a real, positive MAE.
n_scores = -cross_val_score(model_2, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE for train: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))


# Test data
# Score the model that was fitted on the training split against the held-out
# rows. Running cross_val_score on (X_test, y_test) would clone and refit the
# estimator on folds of the test split, which never evaluates model_2 itself.
y_test_2 = model_2.predict(X_test)
print('MAE for test: %.3f' % mean_absolute_error(y_test, y_test_2))

In [None]:
def Weighted_method(test_pre1,test_pre2,w=(1/2,1/2)):
    """Blend two prediction vectors with a positional weighted sum.

    Parameters
    ----------
    test_pre1, test_pre2 : array-like
        One-dimensional predictions of equal length. Elements are paired by
        position, never by index label.
    w : sequence of float, optional
        ``w[0]`` weights ``test_pre1`` and ``w[1]`` weights ``test_pre2``.
        Defaults to an equal-weight average.

    Returns
    -------
    pandas.Series
        ``w[0] * test_pre1 + w[1] * test_pre2``. If ``test_pre1`` is a Series
        its index is kept; otherwise a default integer index is used.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    first = np.asarray(test_pre1, dtype=float)
    second = np.asarray(test_pre2, dtype=float)
    # Adding two pandas Series aligns them by index label, so inputs with
    # different lengths or indices would silently produce NaN or mis-paired
    # rows. Work on plain arrays and fail loudly on a shape mismatch instead.
    if first.shape != second.shape:
        raise ValueError(
            'Prediction shapes differ: %s vs %s' % (first.shape, second.shape)
        )
    index = test_pre1.index if isinstance(test_pre1, pd.Series) else None
    Weighted_result = pd.Series(w[0] * first + w[1] * second, index=index)
    return Weighted_result

In [None]:
# Blend the two models with an equal-weight average and report the MAE.
# Note: the train figure is computed on in-sample predictions.
print('MAE for train data...')
val_pre_train = Weighted_method(test_pre1=y_train_1, test_pre2=y_train_2)
MAE_Weighted = mean_absolute_error(y_true=y_train, y_pred=val_pre_train)
print('MAE of Weighted of val:', MAE_Weighted)

# Same blend, scored on the held-out split.
print('MAE for test data...')
val_pre_test = Weighted_method(test_pre1=y_test_1, test_pre2=y_test_2)
MAE_Weighted = mean_absolute_error(y_true=y_test, y_pred=val_pre_test)
print('MAE of Weighted of val:', MAE_Weighted)

# Create submission data

In [None]:
# Build the submission feature matrix with the same preparation as the training
# features: drop 'SaleID' and 'name', turn the '-' placeholder into NaN, cast to float.
X_test_sub = Test_data.drop(columns=['SaleID', 'name'])
X_test_sub['notRepairedDamage'] = (
    X_test_sub['notRepairedDamage']
    .replace('-', np.nan)
    .astype('float')
)

In [None]:
# Predict the submission rows with each fitted model (LGBM first, then XGB).
y_sub_1, y_sub_2 = (
    fitted_model.predict(X_test_sub) for fitted_model in (model_1, model_2)
)

In [None]:
# Equal-weight blend of the two models' submission predictions.
sub_pred = Weighted_method(y_sub_1,y_sub_2)
# Take the IDs from the 'SaleID' column of the test file. X_test_sub.index is
# only the row position, which matches the real IDs only if SaleID happens to
# be 0..n-1. Both columns are passed as plain arrays so rows pair by position.
sub = pd.DataFrame({
    'SaleID': Test_data['SaleID'].to_numpy(),
    'price': np.asarray(sub_pred),
})
sub.to_csv('./sub_Weighted_baseline.csv',index=False)