In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.impute import KNNImputer
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.feature_selection import mutual_info_regression, f_regression
from sklearn import preprocessing
from sklearn import decomposition
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF,DotProduct, ConstantKernel,WhiteKernel,Matern,RationalQuadratic

# Preprocessing
## 1. Read data from csv file
Use pandas to read the CSV files, then discard the unnecessary id column. At the end of the cell, preview the data to check that it was read correctly.

In [2]:
# Load the training features, training targets and test features.
# The first column of every file is the id column, so it is dropped
# positionally right after reading (DataFrame.drop() would work as well).
x_train, y_train, x_test = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in ("X_train.csv", "Y_train.csv", "X_test.csv")
)

# Preview the first rows to check that the data was read correctly.
x_train.head(7)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x822,x823,x824,x825,x826,x827,x828,x829,x830,x831
0,10.891876,832442.812375,20585.544083,1028.369495,1163780.0,9.199135,597900.477629,,1144294.0,785176.201298,...,1024198.0,-855.549602,12176.073427,10.647729,10.916371,1220.065443,8.566724,1036263.0,85338.558539,103088.66421
1,11.512994,832442.898114,,1012.624877,1028911.0,10.906408,597900.458612,8127.016078,1099166.0,785176.258299,...,1086806.0,-787.397942,10493.09566,10.586492,9.463962,917.094909,10.231822,1007163.0,95695.020645,105161.109422
2,11.052185,832442.896307,20585.512844,1003.953827,923175.6,9.212979,597900.426764,10738.092422,1027863.0,785176.223468,...,1018533.0,-906.997242,10959.516944,10.769287,10.34216,637.027802,10.705461,1019955.0,80253.299882,104177.051666
3,11.642076,,,1004.672084,945946.1,9.55342,597900.450367,13524.096973,1168144.0,785176.254867,...,1047017.0,-1011.742516,16845.309819,10.48383,10.594941,1114.06959,10.321063,1085442.0,,102746.51692
4,10.407121,832442.831424,20585.557007,,995718.2,8.419164,597900.423639,12894.065081,1063199.0,785176.19088,...,1031009.0,-1025.223865,18348.46004,,,1230.088215,10.250096,1024812.0,101815.745499,105163.749149
5,9.144461,832442.882921,20585.548624,1042.982264,1079217.0,10.29093,597900.434742,9730.044411,975833.0,785176.248738,...,1020202.0,-790.713766,12838.791399,10.56176,10.386269,1230.071653,11.034434,,90435.964659,109082.14524
6,9.895803,832442.816841,20585.544187,1001.48382,993522.6,10.276043,597900.436187,12080.006126,953770.6,785176.270535,...,1038143.0,-927.126335,14383.974419,10.427384,10.442063,1076.013807,10.228285,1054731.0,100889.137964,106036.091106


## 2. Filling the missing values
After reading the data, we can see that the dataset contains many missing values (`NaN`). Before any further processing, we need to choose a suitable method to fill them in.

The mean or the median of each column could be used to fill the gaps, but here we use a KNN-based method to obtain more accurate estimates.
  
  Use *KNNImputer* from *sklearn.impute* to fill the missing values('NAN').

In [3]:
# Fill each missing value with the uniform average of that feature over the
# 10 nearest rows, where distance is a Euclidean metric that ignores NaNs.
imputer = KNNImputer(n_neighbors=10, weights='uniform', metric='nan_euclidean')

# Fit the imputer on the training data only and reuse the fitted imputer for
# the test data. Re-fitting on x_test would fill test rows from test
# neighbours (train and test would be preprocessed inconsistently) and could
# drop a different set of all-NaN columns than the one dropped for x_train.
x_train = imputer.fit_transform(x_train)
x_train = pd.DataFrame(x_train)
x_test = imputer.transform(x_test)
x_test = pd.DataFrame(x_test)

# Simpler alternatives that were considered: fill every column with the
# training-set column mean or median, e.g. DataFrame.fillna(train_mean).

## 3. Scaling the data
The value ranges of the columns differ a lot. To give every feature the same weight (we do not yet know which columns are useful), we scale the data by mapping each column to a comparable, small range.  
  
  MinMaxScaler and StandardScaler from sklearn.preprocessing can both do this. After trying both, we chose StandardScaler for our code.

In [4]:
# Select the scaling method. StandardScaler was chosen; the alternative is
# preprocessing.MinMaxScaler(feature_range=(0, 1)).
scaler = preprocessing.StandardScaler()

# Learn the per-column scaling statistics from the training data only ...
x_train_scaled = scaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train_scaled, columns=x_train.columns)

# ... and apply those same statistics to the test data. Calling
# fit_transform here would rescale x_test with its own statistics, so train
# and test features would no longer be on the same scale for the model.
x_test_scaled = scaler.transform(x_test)
x_test = pd.DataFrame(x_test_scaled, columns=x_test.columns)

# Preview the scaled training data to check its correctness.
x_train.head(7)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,822,823,824,825,826,827,828,829,830,831
0,0.942493,-1.700031,0.677888,-0.776661,1.763823,-0.92342,1.750336,-0.359669,1.477087,-0.900275,...,-0.926136,0.138346,-0.555133,0.357323,0.936976,0.673121,-1.477605,-0.516184,-1.576968,-0.702403
1,1.60823,1.448678,0.039286,-1.353328,0.327574,0.896183,1.04603,-1.397727,1.018262,1.147418,...,1.350061,0.56711,-1.252698,0.133718,-0.58065,-0.712429,0.221062,-1.592495,-0.457268,0.082128
2,1.114318,1.382298,-0.44016,-1.670916,-0.798419,-0.908666,-0.133529,0.223024,0.293307,-0.103843,...,-1.132115,-0.185328,-1.059374,0.801187,0.336981,-1.993236,0.704249,-1.119355,-2.126766,-0.29039
3,1.746586,-0.178166,0.320567,-1.644609,-0.555932,-0.545825,0.740655,1.952357,1.719573,1.024132,...,-0.096514,-0.844314,1.380182,-0.241149,0.601113,0.188379,0.312102,1.302758,0.221515,-0.831924
4,0.422913,-1.000453,1.14045,0.039348,-0.025899,-1.754709,-0.249274,1.561282,0.652576,-1.274522,...,-0.678502,-0.929129,2.003211,-0.41235,-0.481637,0.718957,0.239704,-0.939721,0.20448,0.083127
5,-0.930454,0.890729,0.84042,-0.24145,0.863298,0.24021,0.16194,-0.402693,-0.235687,0.803951,...,-1.071423,0.546249,-0.280449,0.043411,0.383071,0.718881,1.039854,-0.043952,-1.025857,1.566448
6,-0.125137,-1.536023,0.6816,-1.761383,-0.04928,0.224343,0.21546,1.055979,-0.459999,1.58699,...,-0.419166,-0.311966,0.360002,-0.447258,0.44137,0.014342,0.217453,0.166864,0.104299,0.413355


## 4. Feature Selection
  
  Each sample has 832 features (columns), and some of them are useless or redundant. Too many features can cause problems such as low efficiency or overfitting, so it is necessary to reduce the number of features.   
  Here are several methods to select features:
  - Filter
  - Wrapper
  - Embedded 
  
<br>We first tried the SelectKBest method from sklearn, but it did not work well: it could not tell which features are actually useful, because it may select related (redundant) features and lead to overfitting. In our code we therefore use Lasso to select the features.
<br><br>Lasso suits problems like this one, with few samples but relatively many features. L1 regularization adds the L1 norm of the coefficient vector W to the loss function as a penalty term. Because this penalty is not differentiable at zero, minimizing the loss drives the coefficients of weak features to exactly zero. L1 regularization therefore tends to produce a sparse model (many entries of W are 0), which is exactly what we need for feature selection.

In [5]:
# Fit an L1-regularised linear model on the training data.
# alpha is the weight of the L1 penalty term.
coefficients_train = Lasso(alpha=0.5).fit(x_train, y_train)

# Boolean mask of the columns whose Lasso coefficient is non-zero.
choose_features = np.not_equal(coefficients_train.coef_, 0)

# Keep only the selected columns, in both the training and the test frame.
x_train, x_test = (
    frame.loc[:, choose_features] for frame in (x_train, x_test)
)

# Preview the reduced training frame to check which features were kept.
x_train.head()

Unnamed: 0,35,85,129,131,155,184,187,214,227,302,...,575,579,668,685,687,726,761,779,785,789
0,0.764399,-0.387242,0.128673,-1.352998,1.556735,-0.011368,0.180149,0.836654,-1.609875,-0.640106,...,0.616306,-1.539756,1.590178,0.000903,-1.076245,-0.584159,-0.27096,1.778407,1.427256,-0.709121
1,-0.227643,1.930197,-0.078653,0.203304,-3.323633,-1.555959,-1.03632,-0.357919,0.488278,0.642164,...,-0.029395,1.993414,1.521426,-2.408154,-1.248679,0.581506,1.976307,0.170829,1.18966,-1.406099
2,-0.827556,-0.473649,0.161924,1.012361,1.04236,0.032609,0.138262,-0.653667,-0.36063,-0.601899,...,-0.307966,-1.664253,0.809962,-0.178326,1.532533,-0.488529,0.193061,-1.228298,-0.020423,1.200787
3,0.986452,-0.674918,-0.172366,1.725593,3.296079,2.14855,0.203023,-0.692671,2.014332,-0.758568,...,1.206968,0.918562,-1.233334,2.080282,0.281289,5.266438,1.033473,2.903278,-0.884301,0.678749
4,0.738774,-0.58078,0.343361,-0.288739,0.197806,0.736512,1.73688,0.856669,-0.070456,0.103454,...,2.104688,-0.560072,1.743183,0.984172,-0.823479,0.578459,0.374598,0.207123,0.509101,0.553017


## 5.Outlier detection
<br>Use IsolationForest from sklearn to detect outliers with the isolation forest method, then remove them from the training data.

In [6]:
# Create the IsolationForest outlier detector.
# Constructor signature, for reference:
#   IsolationForest(*, n_estimators=100, max_samples='auto',
#                   contamination='auto', max_features=1.0, bootstrap=False,
#                   n_jobs=None, random_state=None, verbose=0,
#                   warm_start=False)
#
# - random_state is fixed so that the same rows are flagged on every run;
#   without it the forest is rebuilt randomly and the filtered training set
#   (and every score computed from it) changes between runs.
# - max_features is capped at the number of available columns, because an
#   integer larger than the column count is rejected by the estimator.
isofore = IsolationForest(n_estimators=600,
                          max_features=min(29, x_train.shape[1]),
                          contamination=0.015,
                          random_state=48)

# fit_predict labels every training row; rows labelled -1 are the outliers.
x_train_isoforest = isofore.fit_predict(x_train)

# Remove the outliers from features and targets with one shared mask so the
# two frames stay aligned row by row.
# NOTE: this cell overwrites x_train / y_train, so re-running it without
# re-running the cells above removes a further batch of rows.
inlier_mask = x_train_isoforest != -1
x_train = x_train[inlier_mask]
y_train = y_train[inlier_mask]

# Regression
<br> Use the processed data to train a model and predict the targets for 'X_test.csv'.

In [7]:
# Model fit: XGBoost regressor, evaluated with repeated 5-fold cross validation.
regression = xgb.XGBRegressor(objective="reg:squarederror", random_state=48)

# Repeat a shuffled 5-fold cross validation 10 times to see the performance
# and tune the parameters. Each repetition gets its own fixed seed, so the
# repetitions use different splits but the reported numbers are reproducible.
# NOTE(review): feature selection and outlier removal above were fitted on the
# full training set, so these scores are likely somewhat optimistic.
cv_means = []
cv_stds = []
for repetition in range(10):
    folds = KFold(n_splits=5, shuffle=True, random_state=repetition)
    scores = cross_val_score(estimator=regression,
                             X=x_train,
                             y=y_train,
                             scoring='r2',
                             cv=folds)
    cv_means.append(np.mean(scores))
    cv_stds.append(np.std(scores))

print("Average of R2 scores:", np.mean(cv_means))
# This is the mean of the per-repetition standard deviations across folds,
# not the standard deviation of the 10 repetition means.
print("Standard deviation of R2 scores:", np.mean(cv_stds))

Average of R2 scores: 0.5271123378999729
Standard deviation of R2 scores: 0.04866950493930913


# Prediction and Write into csv file

In [9]:
# Train the final model on the whole processed training set and predict the
# targets of the processed test set.
regression = xgb.XGBRegressor(objective="reg:squarederror", random_state=48)
regression.fit(x_train, y_train)

y_pred = regression.predict(x_test)

# Build the submission table: an "id" column followed by the prediction "y".
# NOTE(review): ids are generated as 0..n-1 because the original id column of
# X_test.csv was dropped when loading; confirm this matches the ids in the file.
prediction_results = pd.DataFrame(data=y_pred, columns=['y'])
index = list(range(len(prediction_results)))
prediction_results.insert(0, "id", index)

# Write the result file first, then end the cell with a preview: only the
# last expression of a cell is displayed, so a bare expression placed before
# to_csv() would never be shown.
prediction_results.to_csv('result_xgb.csv', index=False)
prediction_results.head()