In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from math import sqrt
from xgboost import XGBRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Load the competition files. The training frame is indexed by 'id'; the test
# frame keeps 'id' as an ordinary column.
train_data = pd.read_csv('../input/30-days-of-ml/train.csv', index_col='id')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

# Feature names: cat0..cat9 followed by cont0..cont13.
feature_columns = [f'cat{k}' for k in range(10)] + [f'cont{k}' for k in range(14)]
len_features = len(feature_columns)

target_cloumn = 'target'
X_data = train_data[feature_columns]
y_data = train_data[target_cloumn]
X_test = test_data[feature_columns]

# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_data,
    y_data,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Partition the columns by dtype: object columns are treated as categorical,
# everything else as numerical.
is_object = X_train.dtypes == 'object'
numerical_cols = list(X_train.columns[~is_object.to_numpy()])
categorical_cols = list(X_train.columns[is_object.to_numpy()])

# Split the categorical columns on their number of distinct training values
# (threshold: 10).
cardinality = X_train[categorical_cols].nunique()
high_cardinality_cols = [c for c in categorical_cols if cardinality[c] >= 10]
low_cardinality_cols = [c for c in categorical_cols if cardinality[c] < 10]


In [1]:
# Number of rows in the hold-out split.
X_valid.shape[0]

In [1]:
# Show which categorical columns fell on each side of the cardinality threshold:
# the high-cardinality list is printed, the low-cardinality list is the cell's
# displayed value.
print(high_cardinality_cols)
low_cardinality_cols

In [1]:
i = 0
# Names of the training columns that contain at least one missing value.
null_mask = X_train.isnull().any()
a = null_mask[null_mask].index.tolist()
print(a)

**Handling high-cardinality columns (the cells below ordinal-encode them instead of dropping them)**

In [1]:
# Accumulator lists. The random-forest cell later appends RMSE to `y` and MSE
# to `z`; `x` is initialised here but not used in the visible notebook.
y, x, z = [], [], []

# Numerical columns: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Low-cardinality categoricals: most-frequent imputation, then one-hot
# encoding that ignores categories unseen during fit.
categorical_transformer1 = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# High-cardinality categoricals: most-frequent imputation, then ordinal codes.
categorical_transformer2 = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordi', OrdinalEncoder()),
    ]
)

# Route each column group to its transformer.
column_steps = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer1, low_cardinality_cols),
    ('ordi', categorical_transformer2, high_cardinality_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Preprocessing-only pipeline; a later cell rebinds `my_pipeline` with a model step.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])



In [1]:
# High-cardinality columns are kept and ordinal-encoded here (an earlier
# variant that dropped them was left commented out by the author).
label_X_train, label_X_valid = X_train.copy(), X_valid.copy()

# Ordinal codes are learned on the training split and reused for validation.
ordinal_encoder = OrdinalEncoder()
label_X_train[high_cardinality_cols] = ordinal_encoder.fit_transform(X_train[high_cardinality_cols])
label_X_valid[high_cardinality_cols] = ordinal_encoder.transform(X_valid[high_cardinality_cols])

# One-hot encode the low-cardinality columns into dense frames that share the
# row index of their source split, so the later concat aligns row for row.
# NOTE(review): newer scikit-learn releases rename `sparse=` to
# `sparse_output=` - confirm against the installed version.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(label_X_train[low_cardinality_cols]),
    index=label_X_train.index,
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(label_X_valid[low_cardinality_cols]),
    index=label_X_valid.index,
)

# Remove the original low-cardinality columns; their one-hot expansion replaces them.
num_X_train = label_X_train.drop(columns=low_cardinality_cols)
num_X_valid = label_X_valid.drop(columns=low_cardinality_cols)

# Final model inputs: remaining columns followed by the one-hot columns.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)




In [1]:
# Gradient-boosted regressor trained on the manually encoded training frame.
xgb_params = dict(n_estimators=1000, learning_rate=0.05, n_jobs=4)
my_model = XGBRegressor(**xgb_params)
my_model.fit(OH_X_train, y_train)

In [1]:

# Predict on the encoded hold-out frame with the fitted XGBoost model and
# print the resulting array.
valid_preds = my_model.predict(OH_X_valid)
print(valid_preds)


In [1]:

# Hold-out metrics for the XGBoost predictions: MAE, MSE and RMSE.
mae = mean_absolute_error(y_valid, valid_preds)
score = mean_squared_error(y_valid, valid_preds)
rscore = sqrt(score)

print(mae)
print('MSE:', score)
print('RMSE:', rscore)

In [1]:
# Build the submission from the XGBoost model fitted above.
#
# The original cell read `preds`, which is only assigned in a later cell, so a
# top-to-bottom run failed with NameError. The test predictions are computed
# here instead, using the encoders and model already fitted on the training
# split, so this cell depends only on cells above it.

# Encode the test features exactly as the train/validation frames were encoded:
# ordinal codes for high-cardinality columns, one-hot for low-cardinality ones.
label_X_test = X_test.copy()
label_X_test[high_cardinality_cols] = ordinal_encoder.transform(X_test[high_cardinality_cols])

OH_cols_test = pd.DataFrame(
    OH_encoder.transform(label_X_test[low_cardinality_cols]),
    index=label_X_test.index,
)
# Same column layout as OH_X_train: remaining columns, then the one-hot columns.
OH_X_test = pd.concat(
    [label_X_test.drop(columns=low_cardinality_cols), OH_cols_test],
    axis=1,
)

test_preds = my_model.predict(OH_X_test)
# One prediction per test row, otherwise the submission would be malformed.
assert len(test_preds) == len(test_data), "prediction count does not match test rows"

output = pd.DataFrame({'id': test_data.id, 'target': test_preds})
output.to_csv('submission_1.csv', index=False)

In [1]:
# Recorded output, kept for reference only (not executable code). The estimator
# repr and scores presumably come from the XGBoost cells above.
#
# XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
#              importance_type='gain', interaction_constraints='',
#              learning_rate=0.05, max_delta_step=0, max_depth=6,
#              min_child_weight=1, missing=nan, monotone_constraints='()',
#              n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=0,
#              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
#              tree_method='exact', validate_parameters=1, verbosity=None)
#
# MSE: 0.5219331100061471
# RMSE: 0.7224493823141848
    
#drop high cardinality columns

In [1]:
# Random-forest baseline run through the shared preprocessing pipeline.
model = RandomForestRegressor(n_estimators=150, random_state=0)
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit preprocessing + model on the training split, then predict the hold-out split.
preds = my_pipeline.fit(X_train, y_train).predict(X_valid)

# Evaluate the hold-out predictions.
score = mean_squared_error(y_valid, preds)
rscore = sqrt(score)
print('MSE:', score)
print('RMSE:', rscore)

# Record this run in the accumulator lists (y collects RMSE, z collects MSE).
z.append(score)
y.append(rscore)

# Original author's notes, kept verbatim: "7.36" and
# "drop cat9 col..with high cardinality" (meaning not established by this cell).
# `preds` is now rebound to the predictions for the test set.
preds = my_pipeline.predict(X_test)