<a href="https://colab.research.google.com/github/smithj27/Tech-Fundamentals-Project/blob/master/Code-For-Project/Machine_Learning_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#Importing all libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression
from sklearn.linear_model import Lasso

In [0]:
#Import files into code
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [0]:
#Check for constant columns
toremove = []
for items in train.columns:
  if items != "ID" and items != "target":
    if train[items].std() == 0:
      toremove.append(items)

#Remove constant columns from datasets
train.drop(toremove, axis=1, inplace=True)
test.drop(toremove, axis=1, inplace=True)

In [0]:
#Check for duplicate columns
def duplicates(df):
  x = df.columns.to_series().groupby(df.dtypes).groups
  duplicate = []
  
  for a, b in x.items():
    y = df[b].columns
    z = df[b]
    length = len(y)
    
    for i in range(length):
      g = z.iloc[:, i].values
      for h in range(i+1, length):
        j = z.iloc[:, h].values
        if np.array_equal(g, j):
          duplicate.append(y[i])
          break
  
  return duplicate

toremove = duplicates(train)

In [0]:
#Removing duplicate columns from datasets
train.drop(toremove, axis=1, inplace=True)
test.drop(toremove, axis=1, inplace=True)

In [0]:
#Removes sparse data from datasets
def sparse_remove(df1, df2):
  list1 = [x for x in df1.columns if not x in ["ID", "target"]]
  for items in list1:
    if len(np.unique(df1[items])) < 2:
      df1.drop(items, axis=1, inplace=True)
      df2.drop(items, axis=1, inplace=True)
  return df1, df2

train, test = sparse_remove(train, test)

In [0]:
#Preparing data for modeling
X_train = train.drop(["ID", "target"], axis=1)
Y_train = np.log1p(train["target"].values)
x_test = test.drop(["ID"], axis=1)

In [0]:
devel_X, val_X, devel_y, val_y = train_test_split(X_train, Y_train, test_size = 0.2, random_state=44)

In [0]:
#Preparing LightGBM Model
def run_lgb(train_X, train_y, val_X, val_y, test_X):
  params = {
      "objective": "regression",
      "metric": "rmse",
      "num_leaves": 42,
      "learning_rate": .01,
      "bagging_fraction": .75,
      "feature_fraction": .75,
      "bagging_frequency": 8,
      "bagging_seed": 42,
      "verbosity": -1,
      "seed": 42
  }
  
  lgtrain = lgb.Dataset(train_X, label=train_y)
  lgval = lgb.Dataset(val_X, label=val_y)
  eval_result = {}
  model = lgb.train(params, lgtrain, 5000, valid_sets=[lgtrain, lgval],
                   early_stopping_rounds = 120, verbose_eval = 150, 
                   evals_result = eval_result)
  pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
  return pred_test_y, model, eval_result

In [0]:
#Run LightGBM Training
pred_test, model, eval_result = run_lgb(devel_X, devel_y, val_X, val_y, x_test)

Training until validation scores don't improve for 120 rounds.
[150]	training's rmse: 1.27164	valid_1's rmse: 1.44967
[300]	training's rmse: 1.06447	valid_1's rmse: 1.40813
[450]	training's rmse: 0.945947	valid_1's rmse: 1.40421
Early stopping, best iteration is:
[467]	training's rmse: 0.935491	valid_1's rmse: 1.40417


In [0]:
#Preparing XGBoost Model
def run_xgb(train_X, train_y, val_X, val_y, test_X):
  params = {
      'objective': 'reg:linear',
      'eval_metric': 'rmse',
      'eta': .01,
      'max_depth': 8,
      'subsample': .8,
      'alpha': 0.0001,
      'random_state': 42
  }
  
  train_data = xgb.DMatrix(train_X, train_y)
  valid_data = xgb.DMatrix(val_X, val_y)
  
  watch = [(train_data, 'train'), (valid_data, 'valid')]
  
  model_xgb = xgb.train(params, train_data, 3000, watch, maximize=False,
                       early_stopping_rounds = 120, verbose_eval = 150)
  
  dtest = xgb.DMatrix(test_X)
  xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
  
  return xgb_pred_y, model_xgb

In [0]:
#Running XGBoost Training
pred_test_xgb, model_xgb = run_xgb(devel_X, devel_y, val_X, val_y, x_test)

[0]	train-rmse:13.9387	valid-rmse:14.0484
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 120 rounds.
[150]	train-rmse:3.39192	valid-rmse:3.522
[300]	train-rmse:1.37615	valid-rmse:1.64762
[450]	train-rmse:1.09179	valid-rmse:1.45526
[600]	train-rmse:1.01767	valid-rmse:1.43373
[750]	train-rmse:0.968155	valid-rmse:1.42756
[900]	train-rmse:0.921351	valid-rmse:1.42411
[1050]	train-rmse:0.887643	valid-rmse:1.42331
[1200]	train-rmse:0.853641	valid-rmse:1.42353
Stopping. Best iteration:
[1105]	train-rmse:0.873109	valid-rmse:1.42255



In [0]:
#Preparing data for Lasso
feature = SelectKBest(mutual_info_regression, k=300)
xtrain = feature.fit_transform(X_train, Y_train)
xtest = feature.transform(x_test)

In [0]:
#Preparing Lasso Model
train_data = scaler.fit_transform(xtrain)
test_data = scaler.fit_transform(xtest)
test3 = Lasso(alpha=.0001, max_iter = 12000)

In [0]:
#Running Lasso Training
test3.fit(train_data, Y_train)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=12000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [0]:
prediction = test3.predict(test_data)

In [0]:
#Prepare output for submission
sub = pd.read_csv("sample_submission.csv")

sub_lgb = pd.DataFrame()
sub_lgb["target"] = pred_test

sub_xgb = pd.DataFrame()
sub_xgb["target"] = pred_test_xgb

#Adding output to submission file
sub["target"] = (sub_lgb["target"] * 0.5 + sub_xgb["target"] * 0.49 + 
                 np.expm1(prediction) * 0.01)

In [0]:
sub.to_csv('sub_lgb_ xgb_lasso3.csv', index=False)