# Importing Packages

In [None]:
import pandas as pd
import numpy as np
from column_encoder import *
from sklearn.feature_selection import SelectFromModel
from catboost import CatBoostRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
# Light-to-orange sequential colormap; calc_vif uses it to shade the VIF table.
cm = sns.light_palette("orange", as_cmap=True)

# Importing the data

In [None]:
# Read the competition's training and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2021/train.csv',
        '../input/tabular-playground-series-feb-2021/test.csv',
    )
)

# Transforming the categorical data to numeric

In [None]:
# Fit the encoder on the columns at positions 1..10 of `train`, then apply it to both frames.
# NOTE(review): fit/transform presumably come from `from column_encoder import *` — confirm.
# NOTE(review): positions 1..10 are presumably the categorical columns (position 0 being 'id') — confirm.
# NOTE(review): the return values are not assigned, so transform() presumably encodes in place — confirm.
fit(train,list(train.columns[1:11]))
transform(train)
transform(test)

In [None]:
# Show the first rows of `train` after the transform step as a quick sanity check.
train.head()

# Feature Selection with SelectFromModel

In [None]:
# Remove the 'id' column so it is not used as a feature (presumably a row identifier).
train.drop(columns='id', inplace=True)
# Separate the regression target from the remaining columns, both as NumPy arrays.
y = train['target'].values
X = train.drop(columns=['target']).values

In [None]:
# Model-based feature selection: a default CatBoost regressor is fitted on (X, y)
# inside SelectFromModel, which then exposes the kept features via get_support().
selector = SelectFromModel(estimator=CatBoostRegressor())
selector = selector.fit(X, y)  # fit() hands back the fitted selector

In [None]:
# Display the selector's feature mask; it is indexed by column position of X
# (the code below treats a truthy entry as "feature kept").
selector.get_support()

In [None]:
# Keep only the columns the selector marked as important.
# The mask from get_support() is aligned with the columns of X, i.e. with every
# column of `train` except 'target', so it can index that column list directly.
# The mask is fetched once instead of once per column.
candidate_columns = train.drop('target',axis=1).columns
featured_columns = list(candidate_columns[selector.get_support()])

# NOTE: `train` loses its 'target' column here; the target values remain in `y`.
train=train[featured_columns]
test=test[featured_columns]

# Checking for Multicollinearity

In [None]:
# Multicollinearity check: helper that tabulates the variance inflation factor per column
def calc_vif(X):
    """Return a styled table with the variance inflation factor of each column of X.

    X is a DataFrame; the result is a pandas Styler over a two-column frame
    ("variables", "VIF") with a background gradient drawn from the module-level
    colormap `cm`.
    """
    design_matrix = X.values
    vif_scores = [
        variance_inflation_factor(design_matrix, column_position)
        for column_position in range(X.shape[1])
    ]
    vif_table = pd.DataFrame({"variables": X.columns, "VIF": vif_scores})
    return vif_table.style.background_gradient(cmap=cm)

In [None]:
# Display the VIF table for the currently selected feature columns of `train`
# (at this point `train` holds only the selected features, without 'target').
calc_vif(train)

In [None]:
# Drop a fixed list of columns (presumably those with a high VIF in the table shown
# by calc_vif(train) — confirm). The list is hard-coded while the set of surviving
# columns depends on the fitted selector, so a name may be absent; errors='ignore'
# skips missing names instead of raising KeyError and keeps the cell re-runnable.
train = train.drop(columns=['cont0',"cont5",'cont9',"cont11"], errors='ignore')

In [None]:
# Give `test` exactly the columns (and column order) that `train` currently has.
test = test.loc[:, list(train.columns)]

In [None]:
# Re-attach the target values saved in `y`; the column was lost when `train` was
# reduced to the selected feature columns.
train['target']=y

# Removing Outliers

In [None]:
# Keep only the rows in which every column (the 'target' column included) has an
# absolute z-score below 3.
within_three_sigma = (np.abs(zscore(train)) < 3).all(axis=1)
train = train[within_three_sigma]

# Using the CatBoost Model

In [None]:
# Default-configured regressor. This binding is replaced by
# CatBoostRegressor(**params) in a later cell before it is ever fitted.
model=CatBoostRegressor()

In [None]:
# Explicit CatBoost configuration passed to CatBoostRegressor(**params).
# NOTE(review): this looks like a full parameter dump from a fitted model rather
# than a hand-tuned subset — confirm before trimming any entry.
params = {
    "nan_mode": "Min",
    "eval_metric": "RMSE",
    "iterations": 300,
    "sampling_frequency": "PerTree",
    "leaf_estimation_method": "Newton",
    "grow_policy": "SymmetricTree",
    "penalties_coefficient": 1,
    "boosting_type": "Plain",
    "model_shrink_mode": "Constant",
    "feature_border_type": "GreedyLogSum",
    "l2_leaf_reg": 7,
    "random_strength": 0.4000000059604645,
    "rsm": 1,
    "boost_from_average": True,
    "model_size_reg": 0.5,
    "subsample": 0.800000011920929,
    "use_best_model": False,
    "random_seed": 3042,
    "depth": 11,
    "posterior_sampling": False,
    "border_count": 254,
    "sparse_features_conflict_fraction": 0,
    "leaf_estimation_backtracking": "AnyImprovement",
    "best_model_min_trees": 1,
    "model_shrink_rate": 0,
    "min_data_in_leaf": 1,
    "loss_function": "RMSE",
    "learning_rate": 0.029999999329447743,
    "score_function": "Cosine",
    "task_type": "CPU",
    "leaf_estimation_iterations": 1,
    "bootstrap_type": "MVS",
    "max_leaves": 2048,
    "verbose": False,
}

In [None]:
# Regressor configured with the explicit `params` dictionary; this is the model
# fitted inside the cross-validation loop.
model=CatBoostRegressor(**params)

# Using K-Fold Cross-Validation

In [None]:
# 10-fold cross-validation; the model from the fold with the lowest validation MSE
# is kept as `main_model` and used for the final predictions.
#
# The training matrix is rebuilt from the *processed* `train` frame. The arrays
# X / y created right after dropping 'id' still contain every original column and
# every row, whereas `test` has been reduced to the selected columns — fitting on
# those stale arrays would give the model a different feature set than the one it
# is asked to predict on, and would ignore the feature selection, the column drop
# and the outlier filter.
X_cv = train.drop(columns='target').values
y_cv = train['target'].values

skf = KFold(n_splits=10,random_state=None,shuffle=False)
l=[]  # validation MSE of each fold, in fold order
for fold, (train_index, test_index) in enumerate(skf.split(X_cv, y_cv)):
    X_train, X_test = X_cv[train_index], X_cv[test_index]
    y_train, y_test = y_cv[train_index], y_cv[test_index]

    # A fresh regressor per fold, so that a kept `main_model` is never overwritten
    # by a later fit and the best fold does not have to be trained a second time.
    model = CatBoostRegressor(**params)
    model.fit(X_train,y_train)
    print("TRAIN:", train_index, "TEST:", test_index)

    # Predict once per fold and reuse the score.
    fold_mse = mean_squared_error(y_test, model.predict(X_test))
    print('mean squred error is ',fold_mse)
    l.append(fold_mse)

    # Best (or tied-best) fold so far: remember its model.
    if fold_mse==min(l):
        main_model = model
        print('new model trained on fold ',fold)


In [None]:
# Re-read the raw test file: `test` was reduced to the selected feature columns and
# no longer carries the 'id' column needed for the submission.
test2=pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')

In [None]:
# Start the submission frame from the test ids and materialise the processed test
# features as a NumPy array for prediction.
sample = test2[['id']].copy()
X_test = test.values

# Doing Predictions and saving the file

In [None]:
# Predict the test targets with `main_model`, the model kept from the best
# cross-validation fold.
predictions=main_model.predict(X_test)

In [None]:
# Add the predictions as the 'target' column next to the ids.
sample['target']=predictions

In [None]:
# Write the submission file (columns: id, target) without the DataFrame index.
sample.to_csv('submission.csv',index=False)

# Thank you