In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
!pip install dataprep
from dataprep.eda import *
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set_theme(style = "darkgrid")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
conda install -c conda-forge swifter

In [None]:
# read data
path = "../input/tabular-playground-series-dec-2021/train.csv"
test_path = "../input/tabular-playground-series-dec-2021/test.csv"
psuedo_labels_path = '../input/tps12-pseudolabels/tps12-pseudolabels_v2.csv'

data = pd.read_csv(path)
psuedo_labels = pd.read_csv(psuedo_labels_path)

In [None]:
# combine given and psuedo labels
data = pd.concat([data, psuedo_labels], axis=0)

In [None]:
# reset index
data.reset_index(drop=True, inplace = True)

In [None]:
# display top 10 rows
data.head(10)

In [None]:
# number of rows and columns in dataset
rows = data.shape[0]
columns = data.shape[1]
print("Data has {} rows, {} columns".format(rows, columns))

In [None]:
data = data.drop(['Id'], axis = 1)

## TRAIN - TEST SPLIT

In [None]:
from sklearn.model_selection import train_test_split

# set of independent variables
X = data.drop(['Cover_Type'], axis = 1)
# dependent variable
y = data['Cover_Type']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state=15)

## OPTIMISE MEMORY

In [None]:
del X
del y
del data

In [None]:
def reduce_mem_usage(df, verbose=True):
    '''
    Downcasts numeric columns to the narrowest dtype that can hold their values.

    Integer columns are narrowed to the first of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to float32 when their range fits, otherwise float64.
    Columns are replaced on the DataFrame that is passed in, and the same
    object is returned. Note that float64 -> float32 reduces precision.

    Args:
        df (Pandas DataFrame) : The dataset to optimise
        verbose (bool) : Print the final memory usage and relative reduction

    Returns:
        df : Pandas DataFrame
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # iinfo.min / iinfo.max are themselves representable, so the
                # comparison must be inclusive (e.g. 127 fits in int8)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN or infinite extrema fail this check and fall back to float64
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # avoid dividing by zero when the frame reports no memory at all
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# reduce memory usage
X_train = reduce_mem_usage(X_train)
X_val = reduce_mem_usage(X_val)

### COMBINING BINARY VALUED SOIL TYPES TO 40 BIT INTEGERS

In [None]:
X_train["soiltype_label"] = 0
X_val["soiltype_label"] = 0

X_train["soiltype_label"] = X_train["soiltype_label"].astype(np.int64)
X_val["soiltype_label"] = X_val["soiltype_label"].astype(np.int64)

soil_columns = [x for x in X_train.columns if x.startswith("Soil_Type")]

In [None]:
def make_40_bit_int_from_soiltype(row, columns=None):
    '''
    Packs the binary soil-type indicators of one row into a single integer.

    The first column ends up in the most significant bit and the last column
    in bit 0, so N columns yield an N-bit value (40 bits for 40 soil types).

    Args:
        row (Series) : row holding a 0/1 value for every soil-type column
        columns (list, optional) : columns to pack, in bit order (MSB first);
            defaults to the module-level ``soil_columns``

    Returns:
        int : the packed bit field
    '''
    if columns is None:
        columns = soil_columns
    value = 0
    for column in columns:
        # shift BEFORE or-ing in the next bit; shifting afterwards leaves a
        # spurious trailing zero and pushes the first column past bit N-1
        value = (value << 1) | int(row[column])
    return value

In [None]:
import swifter
X_train["soiltype_label"] = X_train.swifter.apply(make_40_bit_int_from_soiltype, axis=1)

In [None]:
X_val["soiltype_label"] = X_val.swifter.apply(make_40_bit_int_from_soiltype, axis=1)

In [None]:
def make_5_8_bit_ints_from_soiltype(row):
    '''
    Splits the packed soil-type integer of a row into five 8-bit integers.

    Args:
        row (Series) : row containing the packed value under "soiltype_label"

    Returns:
        tuple : (integer1, ..., integer5), from the most significant byte
                (bits 32-39) down to the least significant byte (bits 0-7)
    '''
    label = np.int64(row["soiltype_label"])
    # each mask selects one byte; the shift must equal the position of that
    # byte's lowest bit (32 for the top byte, not 30) to yield a 0-255 value
    integer1 = (label & 0xFF00000000) >> 32
    integer2 = (label & 0x00FF000000) >> 24
    integer3 = (label & 0x0000FF0000) >> 16
    integer4 = (label & 0x000000FF00) >> 8
    integer5 = (label & 0x00000000FF)
    return integer1, integer2, integer3, integer4, integer5

In [None]:
X_train[["soiltype_int1", "soiltype_int2", "soiltype_int3", "soiltype_int4", "soiltype_int5"]] = X_train.swifter.apply(make_5_8_bit_ints_from_soiltype, axis=1, result_type="expand")
X_val[["soiltype_int1", "soiltype_int2", "soiltype_int3", "soiltype_int4", "soiltype_int5"]] = X_val.swifter.apply(make_5_8_bit_ints_from_soiltype, axis=1, result_type="expand")

In [None]:
X_train = X_train.drop(['soiltype_label'], axis = 1)

X_val = X_val.drop(['soiltype_label'], axis=1)

# **EDA**

In [None]:
pd.set_option('display.max_columns', 100)
# display descriptive stats
X_train.describe()

## UNIVARIATE ANALYSIS

**ELEVATION**

Distribution : About Normal

Outlier : Yes

Null : No

Incorrect Values : No

In [None]:
plot(X_train, col1 = 'Elevation')

**ASPECT**

Distribution : About Skewed

Outlier : No

Null : No

Incorrect Values : Yes

In [None]:
plot(X_train, col1 = 'Aspect')

Aspect is measured in degrees azimuth and therefore ranges from 0 to 360. Values less than 0 are treated as 360 + x and values greater than 360 as x - 360, i.e. every out-of-range value is wrapped around with modulo 360.

In [None]:
# select all rows with incorrect values
incorrect_aspect_train = ((X_train['Aspect']<0) | (X_train['Aspect']>360))
incorrect_aspect_val = ((X_val['Aspect']<0) | (X_val['Aspect']>360))

# correct values
X_train.loc[incorrect_aspect_train, 'Aspect'] = X_train[incorrect_aspect_train]['Aspect']%360
X_val.loc[incorrect_aspect_val, 'Aspect'] = X_val[incorrect_aspect_val]['Aspect']%360

del incorrect_aspect_train
del incorrect_aspect_val

In [None]:
plot(X_train, col1 = "Aspect")

**SLOPE**

Distribution : Positively Skewed

Outlier : Yes

Null : No

Incorrect Values : No (Assuming negative values denote a down slope)

In [None]:
plot(X_train, col1 = 'Slope')

**Horizontal_Distance_To_Hydrology**

Distribution : Positively Skewed

Outlier : Yes

Null : No

Incorrect Values : Yes

In [None]:
plot(X_train, col1 = 'Horizontal_Distance_To_Hydrology')

A distance is a scalar magnitude and cannot be negative, so negative values are replaced with their absolute value.

In [None]:
# correct negative values
X_train['Horizontal_Distance_To_Hydrology'] = abs(X_train['Horizontal_Distance_To_Hydrology'])
X_val['Horizontal_Distance_To_Hydrology'] = abs(X_val['Horizontal_Distance_To_Hydrology'])

In [None]:
plot(X_train, col1 = 'Horizontal_Distance_To_Hydrology')

**Vertical_Distance_To_Hydrology**

Distribution : Positively Skewed

Outlier : Yes

Null : No

Incorrect Values : Yes

In [None]:
plot(X_train, col1 = 'Vertical_Distance_To_Hydrology')

Correcting negative values

In [None]:
# correct negative values
X_train['Vertical_Distance_To_Hydrology'] = abs(X_train['Vertical_Distance_To_Hydrology'])
X_val['Vertical_Distance_To_Hydrology'] = abs(X_val['Vertical_Distance_To_Hydrology'])

In [None]:
plot(X_train, col1 = 'Vertical_Distance_To_Hydrology')

**Horizontal_Distance_To_Roadways**

Distribution : Positively Skewed

Outlier : Yes

Null : No

Incorrect Values : Yes

In [None]:
plot(X_train, col1 = 'Horizontal_Distance_To_Roadways')

In [None]:
# correct negative values
X_train['Horizontal_Distance_To_Roadways'] = abs(X_train['Horizontal_Distance_To_Roadways'])
X_val['Horizontal_Distance_To_Roadways'] = abs(X_val['Horizontal_Distance_To_Roadways'])

In [None]:
plot(X_train, col1 = 'Horizontal_Distance_To_Roadways')

**Hillshade_9am**

Distribution : Negatively Skewed

Outlier : Yes

Null : No

Incorrect Values : Yes

In [None]:
plot(X_train, col1 = 'Hillshade_9am')

Values in the Hillshade columns must lie between 0 and 255, so out-of-range values are corrected wherever necessary.

In [None]:
# correct negative values
X_train['Hillshade_9am'] = abs(X_train['Hillshade_9am'])
X_val['Hillshade_9am'] = abs(X_val['Hillshade_9am'])

In [None]:
# select rows whose Hillshade_9am exceeds the upper bound of 255
# (negative values were already replaced by their absolute value above)
incorrect_hillshade_9am_train = (X_train['Hillshade_9am']>255)
incorrect_hillshade_9am_val = (X_val['Hillshade_9am']>255)

# wrap the selected values with modulo 255
# NOTE(review): the valid range 0..255 has 256 distinct values, so 256 maps to 1
# and 510 maps to 0 here; confirm whether % 256 or clipping was intended
X_train.loc[incorrect_hillshade_9am_train, 'Hillshade_9am'] = X_train[incorrect_hillshade_9am_train]['Hillshade_9am']%255
X_val.loc[incorrect_hillshade_9am_val, 'Hillshade_9am'] = X_val[incorrect_hillshade_9am_val]['Hillshade_9am']%255

# the boolean masks are no longer needed; free them
del incorrect_hillshade_9am_train
del incorrect_hillshade_9am_val

In [None]:
plot(X_train, col1 = 'Hillshade_9am')

**Hillshade_Noon**

Distribution : Negatively Skewed

Outlier : Yes

Null : No

Incorrect Values : Yes

In [None]:
plot(X_train, col1 = 'Hillshade_Noon')

In [None]:
# correct negative values
X_train['Hillshade_Noon'] = abs(X_train['Hillshade_Noon'])
X_val['Hillshade_Noon'] = abs(X_val['Hillshade_Noon'])

In [None]:
# select rows whose Hillshade_Noon exceeds the upper bound of 255
# (negative values were already replaced by their absolute value above)
incorrect_hillshade_noon_train = (X_train['Hillshade_Noon']>255)
incorrect_hillshade_noon_val = (X_val['Hillshade_Noon']>255)

# wrap the selected values with modulo 255
# NOTE(review): the valid range 0..255 has 256 distinct values, so 256 maps to 1
# and 510 maps to 0 here; confirm whether % 256 or clipping was intended
X_train.loc[incorrect_hillshade_noon_train, 'Hillshade_Noon'] = X_train[incorrect_hillshade_noon_train]['Hillshade_Noon']%255
X_val.loc[incorrect_hillshade_noon_val, 'Hillshade_Noon'] = X_val[incorrect_hillshade_noon_val]['Hillshade_Noon']%255

# the boolean masks are no longer needed; free them
del incorrect_hillshade_noon_train
del incorrect_hillshade_noon_val

In [None]:
plot(X_train, col1='Hillshade_Noon')

**Hillshade_3pm**

Distribution : About Normal

Outlier : Yes

Null : No

Incorrect Values : Yes

In [None]:
plot(X_train, col1 = 'Hillshade_3pm')

In [None]:
# correct negative values
X_train['Hillshade_3pm'] = abs(X_train['Hillshade_3pm'])
X_val['Hillshade_3pm'] = abs(X_val['Hillshade_3pm'])

In [None]:
# select rows whose Hillshade_3pm exceeds the upper bound of 255
# (negative values were already replaced by their absolute value above)
incorrect_hillshade_3pm_train = (X_train['Hillshade_3pm']>255)
incorrect_hillshade_3pm_val = (X_val['Hillshade_3pm']>255)

# wrap the selected values with modulo 255
# NOTE(review): the valid range 0..255 has 256 distinct values, so 256 maps to 1
# and 510 maps to 0 here; confirm whether % 256 or clipping was intended
X_train.loc[incorrect_hillshade_3pm_train, 'Hillshade_3pm'] = X_train[incorrect_hillshade_3pm_train]['Hillshade_3pm']%255
X_val.loc[incorrect_hillshade_3pm_val, 'Hillshade_3pm'] = X_val[incorrect_hillshade_3pm_val]['Hillshade_3pm']%255

# the boolean masks are no longer needed; free them
del incorrect_hillshade_3pm_train
del incorrect_hillshade_3pm_val

In [None]:
plot(X_train, col1='Hillshade_3pm')

**Horizontal_Distance_To_Fire_Points**

Distribution : Positively Skewed

Outlier : Yes

Null : No

Incorrect Values : Yes

In [None]:
plot(X_train, col1 = 'Horizontal_Distance_To_Fire_Points')

In [None]:
# correct negative values
X_train['Horizontal_Distance_To_Fire_Points'] = abs(X_train['Horizontal_Distance_To_Fire_Points'])
X_val['Horizontal_Distance_To_Fire_Points'] = abs(X_val['Horizontal_Distance_To_Fire_Points'])

In [None]:
plot(X_train, col1 = 'Horizontal_Distance_To_Fire_Points')

**BINARY COLUMNS**

In [None]:
# every column from position 10 onward is treated as a binary indicator column
# NOTE(review): this positional slice also picks up the soiltype_int1..5 columns
# appended to X_train earlier, which hold byte values rather than 0/1. Because
# bin_columns is reused later to exclude columns from outlier handling, confirm
# that including them is intended.
bin_columns = X_train.columns[10:]
# draw one dataprep report per selected column
for i in bin_columns:
    plot(X_train, col1=i).show()

## BIVARIATE ANALYSIS

In [None]:
train = X_train.copy()
train['cover'] = y_train

In [None]:
train_sample = train.sample(n=100000)
plot_correlation(train_sample, config = {'height': 800, 'width': 800, })

- Wilderness Area 1 and 3 are strongly correlated
- Elevation is a good indicator of Cover_Type, followed by Wilderness_Area4, Wilderness_Area1, Wilderness_Area3, Soil_Type39 and Horizontal_Distance_To_Roadways
- Wilderness_Area1 and 3 are correlated to Soil_Type29
- Wilderness_Area4 is correlated to Soil_Type6
- Wilderness_Area3 and 4 are correlated
- Elevation and Wilderness_Area4 are correlated
- Among the soil types, Soil_Type39 has the highest correlation with Cover_Type
- Horizontal_Distance_To_Roadways is correlated to Elevation, Wilderness_Area4
- Wilderness_Area4 is correlated to Horizontal_Distance_To_Fire_Points
- Wilderness_Area1, 3 and 4 are correlated to Soil_Type10
- Soil_Type3 and Wilderness_Area4 are correlated

In [None]:
del train
del train_sample

# **DATA PREPROCESSING**

In [None]:
X_train = reduce_mem_usage(X_train)
X_val = reduce_mem_usage(X_val)

## HANDLE SKEWNESS

In [None]:
def removeSkew(X, skew_index):
    '''
    Drops the columns listed in "skew_index" from "X"

    Args:
        X (Numpy Array) : The dataset; it is wrapped in a DataFrame, so columns
                          of a plain array are addressed by integer position
        skew_index (Numpy Array) : Labels of the columns to drop from "X"

    Returns:
        Numpy Array : The data without the listed columns
    '''
    frame = pd.DataFrame(X)
    remaining = frame.drop(columns=skew_index)
    return np.array(remaining)

In [None]:
def reportSkewness(X, columns_to_drop):
    '''
    Finds the columns of X whose skewness lies outside [-0.5, 0.5]

    Args:
        X (DataFrame) : The dataset to check skewness in
        columns_to_drop (list) : columns excluded from the analysis

    Returns:
        Numpy Array : names of the columns with a skewed distribution
    '''
    skewness = X.drop(columns=columns_to_drop).skew()

    # a column counts as symmetric when -0.5 <= skew <= 0.5 (bounds inclusive)
    roughly_symmetric = skewness.between(-0.5, 0.5)
    skew_index = np.array(skewness[~roughly_symmetric].index)

    print("Number of columns with skew distribution : {}".format(len(skew_index)))
    return skew_index

In [None]:
# columns with categorical values types
categorical_columns = X_train.columns[10:]
skew_columns = reportSkewness(X_train, categorical_columns)

In [None]:
skew_columns

In [None]:
# Square-root transform every column reported as skewed, plotting the training
# distribution before (left) and after (right) the transform.
for column in skew_columns:
    # Slope is left untouched; presumably because it may hold negative values
    # (see the Slope notes above), for which sqrt is undefined -- TODO confirm
    if(column!='Slope'):
        # set up a side-by-side figure
        f, ax = plt.subplots(nrows=1, ncols=2, figsize = (10, 7))
        # plot before transformation
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases
        sns.distplot(X_train[column], ax = ax[0])
        # apply the same square root transformation to train and validation data
        X_train[column] = np.sqrt(X_train[column])
        X_val[column] = np.sqrt(X_val[column])
        # plot after transformation
        sns.distplot(X_train[column], ax = ax[1])
        plt.show()

## HANDLE OUTLIERS

In [None]:
def handleOutliers(data, target, to_return = False):
    '''
    Drops every row that has an outlier in any column (1.5 * IQR rule)
    and prints the percentage of rows lost

    Args:
        data (DataFrame) : The DataFrame to remove outliers from
        target : target variable, filtered with the same row mask as data
        to_return (bool) :  - Default value False
                            - Whether to return the filtered data and target

    Returns:
        (DataFrame, target) : rows free from outliers and the matching target
                              when to_return is True, otherwise None
    '''
    lower_quartile = data.quantile(0.25)
    upper_quartile = data.quantile(0.75)
    iqr = upper_quartile - lower_quartile

    # fences beyond which a value counts as an outlier
    low_fence = lower_quartile - 1.5*iqr
    high_fence = upper_quartile + 1.5*iqr

    # keep only the rows without a single out-of-fence value
    keep_rows = ~((data < low_fence) | (data > high_fence)).any(axis = 1)
    data_c = data[keep_rows]
    target_c = target[keep_rows]

    # report data loss
    print('Data loss is {}%'.format(((len(data) - len(data_c))/len(data))*100))

    if to_return:
        return (data_c, target_c)

In [None]:
handleOutliers(X_train.drop(bin_columns, axis=1), y_train)

In [None]:
def countOutliers(data, column):
    '''
    Calculates the share of rows that are outliers in the given column
    (1.5 * IQR rule)

    Args:
        data (DataFrame) : The dataset in form of Pandas DataFrame
        column (string) : The column to inspect

    Returns:
        float : percentage of rows that are outliers in the column
    '''
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    iqr = third_quartile - first_quartile

    too_low = values < (first_quartile - 1.5*iqr)
    too_high = values > (third_quartile + 1.5*iqr)

    # share of rows outside the fences, as a percentage
    outlier_rows = len(data[too_low | too_high])
    return (outlier_rows/len(data))*100

In [None]:
def columnWiseOutliers(data):
    '''
    Calculates the percentage of outliers in each column

    Args:
        data (DataFrame) : The dataset in form of Pandas DataFrame

    Returns:
        DataFrame : one row per column ('Col', '%Outliers'), sorted by
                    percentage of outliers in decreasing order
    '''
    # [column name, % outliers] for every column
    per_column = [[name, countOutliers(data, name)] for name in data.columns]
    # highest percentage first
    ranked = sorted(per_column, key = lambda pair: pair[1], reverse = True)
    return pd.DataFrame(ranked, columns=['Col', '%Outliers'])

In [None]:
columnWiseOutliers(X_train.drop(bin_columns, axis =1))

In [None]:
X_trainc, y_train = handleOutliers(X_train.drop(bin_columns, axis =1), y_train, True)

In [None]:
X_train = X_train.loc[X_trainc.index]

In [None]:
del X_trainc

## HANDLE SINGLE VALUED COLUMNS

In [None]:
# columns with single values
single_val_cols = ['Soil_Type7', 'Soil_Type15']

# drop single valued columns
X_train=X_train.drop(single_val_cols, axis=1)
X_val=X_val.drop(single_val_cols, axis=1)

## FEATURE ENGINEERING

In [None]:
# compute Euclidean distance to hydrology
X_train['Dist_To_Hydro'] = ((X_train['Horizontal_Distance_To_Hydrology'])**2 + (X_train['Vertical_Distance_To_Hydrology'])**2)**0.5

# compute Manhattan distance to hydrology
X_train['MDist_To_Hydro'] = X_train['Horizontal_Distance_To_Hydrology']+X_train['Vertical_Distance_To_Hydrology']

# applying other operations
X_train['V_Dist_Hydro_Min_Elev'] = abs(X_train['Vertical_Distance_To_Hydrology']-X_train['Elevation'])

X_train['V_Dist_Hydro_Add_Elev'] = X_train['Vertical_Distance_To_Hydrology']+X_train['Elevation']

X_train['H_Dist_Hydro_Min_Elev'] = abs(X_train['Horizontal_Distance_To_Hydrology']-X_train['Elevation'])

X_train['H_Dist_Hydro_Add_Elev'] = X_train['Horizontal_Distance_To_Hydrology']+X_train['Elevation']

X_train['Slope_Per_Elev'] = X_train['Slope']/X_train['Elevation']

In [None]:
# apply for test data
X_val['Dist_To_Hydro'] = ((X_val['Horizontal_Distance_To_Hydrology'])**2 + (X_val['Vertical_Distance_To_Hydrology'])**2)**0.5

X_val['MDist_To_Hydro'] = X_val['Horizontal_Distance_To_Hydrology']+X_val['Vertical_Distance_To_Hydrology']

X_val['V_Dist_Hydro_Min_Elev'] = abs(X_val['Vertical_Distance_To_Hydrology']-X_val['Elevation'])

X_val['V_Dist_Hydro_Add_Elev'] = X_val['Vertical_Distance_To_Hydrology']+X_val['Elevation']

X_val['H_Dist_Hydro_Min_Elev'] = abs(X_val['Horizontal_Distance_To_Hydrology']-X_val['Elevation'])

X_val['H_Dist_Hydro_Add_Elev'] = X_val['Horizontal_Distance_To_Hydrology']+X_val['Elevation']

X_val['Slope_Per_Elev'] = X_val['Slope']/X_val['Elevation']

In [None]:
# Soil type count
soil_features = [x for x in X_train.columns if x.startswith("Soil_Type")]
X_train["Soil_Type_Count"] = X_train[soil_features].sum(axis=1)
X_val["Soil_Type_Count"] = X_val[soil_features].sum(axis=1)

# Wilderness area count
wilderness_features = [x for x in X_train.columns if x.startswith("Wilderness_Area")]
X_train["Wilderness_Area_Count"] = X_train[wilderness_features].sum(axis=1)
X_val["Wilderness_Area_Count"] = X_val[wilderness_features].sum(axis=1)

## SCALE VALUES

In [None]:
from sklearn.preprocessing import RobustScaler


cols = [
    "Elevation",
    "Aspect",
    "Dist_To_Hydro",
    "MDist_To_Hydro",
    "Soil_Type_Count",
    "Wilderness_Area_Count",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    'V_Dist_Hydro_Min_Elev',
    'V_Dist_Hydro_Add_Elev',
    'H_Dist_Hydro_Min_Elev',
    'H_Dist_Hydro_Add_Elev',
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

scaler = RobustScaler()
X_train[cols] = scaler.fit_transform(X_train[cols])
X_val[cols] = scaler.transform(X_val[cols])

## CLASS IMBALANCE

In [None]:
f, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (15, 12)) # set up plot 

# visualise distribution in Ytrain

sns.countplot(x = y_train, ax = ax[0]) # plot bar plot

# plot a pie chart
ax[1].pie(x = y_train.value_counts().values, labels = y_train.value_counts().index, autopct = "%.2f%%")
ax[1].set_title("Percentage distribution") # set title

# display plots
plt.show()

# MACHINE LEARNING

In [None]:
from sklearn.metrics import accuracy_score

def getScore(clf, X, y):
    '''
    Calculates the accuracy score of "clf" on the given data

    Args:
        clf : Fitted classifier exposing a predict method
        X (Numpy Array) : The independent variables (features)
        y (Numpy Array) : The dependent variable (true labels)

    Returns:
        float : The accuracy of the predictions for X against y
    '''
    predictions = clf.predict(X)
    return accuracy_score(y, predictions)


In [None]:
def printReport(clf, X_train, y_train, X_val, y_val):
    '''
    Prints and returns the score of "clf" on the training and validation data

    Args:
        clf : Fitted classifier
        X_train (Numpy Array) : The independent variables for training
        y_train (Numpy Array) : The dependent variable for training
        X_val (Numpy Array) : The independent variables for validation
        y_val (Numpy Array) : The dependent variable for validation

    Returns:
        (float, float) : Training & Validation score
    '''
    # training score first, validation score second
    scores = (getScore(clf, X_train, y_train), getScore(clf, X_val, y_val))

    print("Training Score : {}\nValidation Score: {}".format(*scores))

    return scores

In [None]:
from xgboost import XGBClassifier

# initialise the XGBoost classifier
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled session and
# is reported as deprecated by newer XGBoost releases -- confirm against the
# installed version
clf_xgb = XGBClassifier(tree_method='gpu_hist', eta = 0.5)

# fit on training data
# NOTE(review): newer XGBoost releases expect class labels encoded as
# 0..n_classes-1; verify the values in y_train (also after outlier removal)
clf_xgb.fit(X_train, y_train)

# accuracy on the training and validation splits, as a (train, val) tuple
xgb_scores = printReport(clf_xgb, X_train, y_train, X_val, y_val)

In [None]:
# preds = clf_xgb.predict(X_val)

In [None]:
# res = pd.DataFrame(test_id, columns = ['Id'])
# res['Cover_Type'] = preds

In [None]:
# res.to_csv("res6.csv", index=False)