# First analysis of the Mercedes training data set

In [None]:
# Importing main packages and settings
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
pd.set_option('display.max_columns', 50)

In [None]:
# Read the competition's train and test CSV files into DataFrames
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Peek at the first five rows of the training data
df_train.head(n=5)

In [None]:
# Additional information about the training data set.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df_train.info()
print(df_train.dtypes)

In [None]:
# Analysis of the object-dtype features of the training data set.
# The 'object' string selector is used instead of the `np.object` alias, which
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
object_features = df_train.select_dtypes(include=['object'])
object_features.describe()

In [None]:
# Analysis of the object-dtype features of the test data set.
# Note the different number of unique values compared to the training set.
# The 'object' string selector is used instead of the `np.object` alias, which
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
object_features_test = df_test.select_dtypes(include=['object'])
object_features_test.describe()

In [None]:
# Summary statistics for every numeric column of the training data
numeric_features = df_train.select_dtypes(include=['number'])
numeric_features.describe()

In [None]:
# One-hot encode the object columns (dropping the first level of each), then
# remove the columns that are not predictors: ID in both frames, and the
# target y in the training frame
df_train_dummies = pd.get_dummies(df_train, drop_first=True).drop(['ID', 'y'], axis=1)
df_test_dummies = pd.get_dummies(df_test, drop_first=True).drop('ID', axis=1)

# Report the shape of each encoded frame
print("Clean Train DataFrame With Dummy Variables: {}".format(df_train_dummies.shape))
print("Clean Test DataFrame With Dummy Variables: {}".format(df_test_dummies.shape))

In [None]:
# Stack the train frame on top of the test frame, keeping only the columns
# the two frames share (join='inner'), then cut the stack back into its two
# parts by row position: the first len(df_train) rows are the training rows
df_temp = pd.concat([df_train_dummies, df_test_dummies], join='inner')
df_temp_train = df_temp.iloc[:len(df_train)]
df_temp_test = df_temp.iloc[len(df_train):]

# Sanity check: shapes of the combined frame and of the two parts
print(df_temp.shape, df_temp_train.shape, df_temp_test.shape, sep='\n')

In [None]:
# Feature matrices for train and test, plus the training target column
X, test_X = df_temp_train, df_temp_test
y = df_train.loc[:, 'y']

In [None]:
# Import the relevant sklearn packages
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_selection import SelectFromModel, VarianceThreshold, SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LassoCV, RidgeCV, ElasticNetCV
from sklearn.metrics import mean_squared_error

# First GBR tests on full data set

In [None]:
# Build a single-step pipeline around a default gradient boosting regressor
gbr = GradientBoostingRegressor()
steps = [('GradientBoostingRegressor', gbr)]
pipe = Pipeline(steps=steps)

# Hold out 30% of the training rows for evaluation (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training part, then predict the hold-out part
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Report R^2 and the mean squared error on the hold-out part
mse = mean_squared_error(y_test, y_pred)
print("R^2: {}".format(pipe.score(X_test, y_test)))
print("Mean Squared Error: {}".format(mse))

In [None]:
# Score the current pipeline with 5-fold cross-validation on the full
# training features
cv_scores_dummies = cross_val_score(pipe, X, y, cv=5)

# Show the per-fold scores followed by their mean
print(cv_scores_dummies)
print("Average 5-Fold CV Score: {}".format(cv_scores_dummies.mean()))

# Removing features with a low amount of variation

In [None]:
# Fitting a feature selector
def feature_selection(data, p=.98):
    """Fit a VarianceThreshold selector on ``data`` and return it.

    The variance threshold is ``p * (1 - p)``, which is the variance of a
    0/1 column whose more frequent value occurs in a fraction ``p`` of the
    rows. Columns with less variance than that are the ones the selector
    filters out.

    Parameters
    ----------
    data : array-like
        Feature matrix the selector is fitted on.
    p : float, default .98
        Dominance fraction used to derive the variance threshold. The
        default reproduces the previously hard-coded value.

    Returns
    -------
    VarianceThreshold
        The fitted selector; call ``.transform`` on it to filter columns.
    """
    selector = VarianceThreshold(p * (1 - p))
    selector.fit(data)
    return selector
 
# Fit the selector on the training features only
fs = feature_selection(X)

# Apply the fitted selector to both the train and the test features
X_transformed, test_X_transformed = fs.transform(X), fs.transform(test_X)

# Show the resulting shapes
print(X_transformed.shape, test_X_transformed.shape, sep='\n')

In [None]:
# Build a single-step pipeline around a default gradient boosting regressor
gbr = GradientBoostingRegressor()
steps = [('GradientBoostingRegressor', gbr)]
pipe = Pipeline(steps=steps)

# Hold out 30% of the variance-filtered training rows for evaluation
# (fixed seed for the split)
X_transformed_train, X_transformed_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.3, random_state=42
)

# Fit on the training part, then predict the hold-out part
y_pred = pipe.fit(X_transformed_train, y_train).predict(X_transformed_test)

# Report R^2 and the mean squared error on the hold-out part
mse = mean_squared_error(y_test, y_pred)
print("R^2: {}".format(pipe.score(X_transformed_test, y_test)))
print("Mean Squared Error: {}".format(mse))

In [None]:
# Score the current pipeline with 5-fold cross-validation on the
# variance-filtered training features
cv_scores_dummies = cross_val_score(pipe, X_transformed, y, cv=5)

# Show the per-fold scores followed by their mean
print(cv_scores_dummies)
print("Average 5-Fold CV Score: {}".format(cv_scores_dummies.mean()))

# Selecting only kBest features
This is not working yet, although it did work before the object features were converted to dummy variables. Needs further investigation.

In [None]:
# Selector that keeps the 50 best-scoring features according to f_regression
skb = SelectKBest(score_func=f_regression, k=50)

# Learn the feature scores from the training set only
skb.fit(X, y)

# Reduce both the train and the test features to the selected columns
X_transformed_kbest, test_X_transformed_kbest = skb.transform(X), skb.transform(test_X)

# Show the resulting shapes
print(X_transformed_kbest.shape, test_X_transformed_kbest.shape, sep='\n')

## Lasso and LassoCV

In [None]:
# Build a single-step pipeline around a Lasso regressor with alpha=0.1
las = Lasso(alpha=0.1)
steps = [('Lasso', las)]
pipe = Pipeline(steps=steps)

# Hold out 30% of the training rows for evaluation (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training part, then predict the hold-out part
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Report R^2 and the mean squared error on the hold-out part
mse = mean_squared_error(y_test, y_pred)
print("R^2: {}".format(pipe.score(X_test, y_test)))
print("Mean Squared Error: {}".format(mse))

In [None]:
# Build a single-step pipeline around a default LassoCV regressor
lscv = LassoCV()
steps = [('LassoCV', lscv)]
pipe = Pipeline(steps=steps)

# Hold out 30% of the training rows for evaluation (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training part, then predict the hold-out part
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Report R^2 and the mean squared error on the hold-out part
mse = mean_squared_error(y_test, y_pred)
print("R^2: {}".format(pipe.score(X_test, y_test)))
print("Mean Squared Error: {}".format(mse))

## Ridge and RidgeCV

In [None]:
# Build a single-step pipeline around a default Ridge regressor
rid = Ridge()
steps = [('Ridge', rid)]
pipe = Pipeline(steps=steps)

# Hold out 30% of the training rows for evaluation (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training part, then predict the hold-out part
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Report R^2 and the mean squared error on the hold-out part
mse = mean_squared_error(y_test, y_pred)
print("R^2: {}".format(pipe.score(X_test, y_test)))
print("Mean Squared Error: {}".format(mse))

In [None]:
# Build a single-step pipeline around a default RidgeCV regressor
rcv = RidgeCV()
steps = [('RidgeCV', rcv)]
pipe = Pipeline(steps=steps)

# Hold out 30% of the training rows for evaluation (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training part, then predict the hold-out part
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Report R^2 and the mean squared error on the hold-out part
mse = mean_squared_error(y_test, y_pred)
print("R^2: {}".format(pipe.score(X_test, y_test)))
print("Mean Squared Error: {}".format(mse))

## ElasticNet and ElasticNet CV

In [None]:
# Build a single-step pipeline around a default ElasticNet regressor
els = ElasticNet()
steps = [('ElasticNet', els)]
pipe = Pipeline(steps=steps)

# Hold out 30% of the training rows for evaluation (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training part, then predict the hold-out part
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Report R^2 and the mean squared error on the hold-out part
mse = mean_squared_error(y_test, y_pred)
print("R^2: {}".format(pipe.score(X_test, y_test)))
print("Mean Squared Error: {}".format(mse))

In [None]:
# Build a single-step pipeline around a default ElasticNetCV regressor
elcv = ElasticNetCV()
steps = [('ElasticNetCV', elcv)]
pipe = Pipeline(steps=steps)

# Hold out 30% of the training rows for evaluation (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training part, then predict the hold-out part
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Report R^2 and the mean squared error on the hold-out part
mse = mean_squared_error(y_test, y_pred)
print("R^2: {}".format(pipe.score(X_test, y_test)))
print("Mean Squared Error: {}".format(mse))

In [None]:
# The model below is fitted on X (the dummy-encoded features), so lasso.coef_
# has one entry per column of X. The axis labels therefore have to come from
# X.columns; df_train.columns has a different length (raw columns incl. ID
# and y) and would make plt.plot fail on mismatched x/y sizes.
df_columns = X.columns

# Instantiate a lasso regressor: lasso
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2. On newer versions, scale the features explicitly
# instead -- confirm which scikit-learn version this notebook targets.
lasso = Lasso(alpha=0.4, normalize=True)

# Fit the regressor to the data
lasso.fit(X, y)

# Extract the fitted coefficients (one per feature column of X)
lasso_coef = lasso.coef_

# Plot the coefficients against their feature names
plt.plot(range(len(df_columns)), lasso_coef)
plt.xticks(range(len(df_columns)), df_columns.values, rotation=60)
plt.margins(0.02)
plt.show()

# Regularized regression tests

# Everything below is older testing work on feature selection

In [None]:
# Numeric predictors of the raw training data: drop ID and the target first,
# then keep only the numeric columns. The dtype selection has to run on X1
# itself; selecting from X (the dummy-encoded frame) would discard the line
# above and give X1 different columns than X1_test.
X1 = df_train.drop(['ID', 'y'], axis=1)
X1 = X1.select_dtypes(include=[np.number])

In [None]:
# Numeric predictors of the raw test data (ID column removed)
X1_test = df_test.drop(['ID'], axis=1).select_dtypes(include=[np.number])

In [None]:
# Training target column
y = df_train.loc[:, 'y']

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [None]:
# Fitting a feature selector
def feature_selection(data, p=.95):
    """Fit a VarianceThreshold selector on ``data`` and return it.

    The variance threshold is ``p * (1 - p)``, which is the variance of a
    0/1 column whose more frequent value occurs in a fraction ``p`` of the
    rows. Columns with less variance than that are the ones the selector
    filters out.

    Parameters
    ----------
    data : array-like
        Feature matrix the selector is fitted on.
    p : float, default .95
        Dominance fraction used to derive the variance threshold. The
        default reproduces the previously hard-coded value.

    Returns
    -------
    VarianceThreshold
        The fitted selector; call ``.transform`` on it to filter columns.
    """
    selector = VarianceThreshold(p * (1 - p))
    selector.fit(data)
    return selector
 
# Fit the selector on the numeric training features only
fs = feature_selection(X1)

# Apply the fitted selector to both the train and the test features
X1_transformed, X1_test_transformed = fs.transform(X1), fs.transform(X1_test)

# Show the resulting shapes
print(X1_transformed.shape, X1_test_transformed.shape, sep='\n')

In [None]:
# Fitting a feature selector
def feature_selection(data, p=.95):
    """Fit a VarianceThreshold selector on ``data`` and return it.

    The variance threshold is ``p * (1 - p)``, which is the variance of a
    0/1 column whose more frequent value occurs in a fraction ``p`` of the
    rows. Columns with less variance than that are the ones the selector
    filters out.

    Parameters
    ----------
    data : array-like
        Feature matrix the selector is fitted on.
    p : float, default .95
        Dominance fraction used to derive the variance threshold. The
        default reproduces the previously hard-coded value.

    Returns
    -------
    VarianceThreshold
        The fitted selector; call ``.transform`` on it to filter columns.
    """
    selector = VarianceThreshold(p * (1 - p))
    selector.fit(data)
    return selector
 
# Fit the selector on the numeric training features only
fs = feature_selection(X1)

# Apply the fitted selector to both the train and the test features
X1_transformed, X1_test_transformed = fs.transform(X1), fs.transform(X1_test)

# Show the resulting shapes
print(X1_transformed.shape, X1_test_transformed.shape, sep='\n')

In [None]:
# Selector that keeps the 30 best-scoring features according to f_regression
skb = SelectKBest(score_func=f_regression, k=30)

# Learn the feature scores from the variance-filtered training features
skb.fit(X1_transformed, y)

# Reduce both the train and the test features to the selected columns
X1_transformed_kbest, X1_test_transformed_kbest = (
    skb.transform(X1_transformed),
    skb.transform(X1_test_transformed),
)

# Show the resulting shapes
print(X1_transformed_kbest.shape, X1_test_transformed_kbest.shape, sep='\n')