In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from statsmodels.tools import categorical
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

import xgboost
from xgboost import XGBRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mmf-data-science-2022/sample_submission.csv
/kaggle/input/mmf-data-science-2022/train.csv
/kaggle/input/mmf-data-science-2022/test.csv


In [7]:
df_train = pd.read_csv('../input/mmf-data-science-2022/train.csv', index_col=0)
df_train = pd.get_dummies(df_train)

X = df_train.drop('price', axis=1)
y = df_train['price']

df_test = pd.read_csv('../input/mmf-data-science-2022/test.csv', index_col='id')
df_test = pd.get_dummies(df_test)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .75)

In [11]:
lin_model = sm.OLS(y_train, X_train).fit()
print(lin_model.summary())

vif_summary = pd.DataFrame()
vif_summary["predictor"] = X_train.columns
vif_summary["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(len(X_train.columns))]
  
print(vif_summary)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.920
Method:                 Least Squares   F-statistic:                 1.614e+04
Date:                Fri, 25 Nov 2022   Prob (F-statistic):               0.00
Time:                        07:49:26   Log-Likelihood:            -2.7342e+05
No. Observations:               32365   AIC:                         5.469e+05
Df Residuals:                   32341   BIC:                         5.471e+05
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
carat          1.136e+04     63.168    179.817

  vif = 1. / (1. - r_squared_i)


        predictor        VIF
0           carat  22.645690
1           depth   2.500656
2           table   1.781998
3               x  87.796379
4               y  13.786220
5               z  72.195249
6        cut_Fair        inf
7        cut_Good        inf
8       cut_Ideal        inf
9     cut_Premium        inf
10  cut_Very Good        inf
11        color_D        inf
12        color_E        inf
13        color_F        inf
14        color_G        inf
15        color_H        inf
16        color_I        inf
17        color_J        inf
18     clarity_I1        inf
19     clarity_IF        inf
20    clarity_SI1        inf
21    clarity_SI2        inf
22    clarity_VS1        inf
23    clarity_VS2        inf
24   clarity_VVS1        inf
25   clarity_VVS2        inf


In [13]:
y_pred = lin_model.predict(X_test)
print(mean_squared_error(y_train, lin_model.predict(X_train)))
print(mean_squared_error(y_pred, y_test))

1274991.4538794998
1348035.1360985942


In [4]:
def train_model(X, y, n_estimators=330, max_depth=9, eta=0.05, subsample=0.7, colsample_bytree=0.8):
    model = XGBRegressor()
    model = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, eta=eta, subsample=subsample, colsample_bytree=colsample_bytree)
    model.fit(X, y)
    return model

model = train_model(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred)

print(mean_squared_error(y_train, model.predict(X_train)))
print(mean_squared_error(y_pred, y_test))

[1785.1399  543.3673  885.4657 ...  428.0165  751.5482 9522.319 ]
69672.18630930065
288771.3206776527


In [5]:
# K-Fold cross validation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
print(scores)

# within a loop
est_range = np.arange(100, 1000, 100)
print(est_range)

scoress = []
for est in est_range:
    model = train_model(X_train, y_train, n_estimator=est)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    scoress.append(scores)

print(scoress)

[-259.34341034 -255.42492117 -279.25893889 -272.93072227 -276.66135452
 -276.51728552 -261.00141639 -266.05908697 -265.44402116 -276.42644669
 -250.70892408 -271.88447065 -277.00800338 -265.8290423  -264.66106111
 -273.23961151 -278.89760647 -272.04635194 -270.00804904 -268.80424843
 -266.04429463 -259.36418677 -278.63502405 -265.42998727 -266.8768582
 -263.15496397 -271.81159373 -274.34522189 -266.04877967 -272.86599622]
[100 200 300 400 500 600 700 800 900]


TypeError: train_model() got an unexpected keyword argument 'n_estimator'

In [None]:
# real test
comp_model = train_model(X, y)

y_pred = comp_model.predict(df_test)
prediction = pd.DataFrame(y_pred, columns=['predictions']).to_csv('/kaggle/working/prediction.csv')