# Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

from catboost import Pool, CatBoostClassifier, cv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb


# Reduce memory

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * Signed / unsigned integer columns are narrowed to the smallest integer
      type of the same signedness that can hold the column's minimum and
      maximum (bounds are inclusive).
    * Float columns wider than 32 bits are cast to ``float32`` when their
      value range fits; otherwise they keep their dtype.
    * ``object`` columns are converted to ``category``.
    * Every other dtype (bool, datetime, timedelta, complex, existing
      categoricals, pandas extension dtypes) is left untouched.

    The (shallow) memory usage before and after is printed.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int / uint / float columns can be downcast safely.
        # Checking the dtype kind (instead of the dtype's string prefix) keeps
        # bool and unsigned columns from being turned into floats and keeps
        # datetime / categorical columns from failing on the range comparison.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            limits = np.finfo(np.float32)
            # Never widen (float16 stays float16); NaN/inf bounds fail the
            # comparison and therefore keep the original dtype.
            if col_type.itemsize > 4 and limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            continue

        candidates = signed_types if col_type.kind == 'i' else unsigned_types
        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # Already as narrow as this candidate: nothing to gain.
                break
            limits = np.iinfo(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
def nan_analize(X):
    """Draw a bar chart of the share of missing values in each column of ``X``.

    The plotted value is ``X.isna().sum() / X.shape[0]``, i.e. a fraction in
    [0, 1] per column (the labels call it a percent).

    Args:
        X: DataFrame whose columns are inspected for missing values.

    NOTE(review): relies on ``plt`` (matplotlib.pyplot) and ``sns`` (seaborn)
    being available in the notebook namespace; they are not imported in the
    visible cells.
    """
    nan_df = (X.isna().sum() / X.shape[0]).reset_index()
    nan_df.columns = ['feature', 'nan_percent']

    plt.figure(figsize=(16, 4))
    # x / y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.barplot(x=nan_df['feature'], y=nan_df['nan_percent'])
    plt.title('Percent Missing')
    plt.ylabel('Missing', fontsize=12)
    plt.xlabel('Features', fontsize=12)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
def evaluate_preds(true_values, pred_values, save=False):
    """Print R2, RMSE and MSE for the predictions and plot predicted vs true values.

    Args:
        true_values: ground-truth target values.
        pred_values: predicted values to compare against ``true_values``.
        save: when truthy, the figure is written to
            ``REPORTS_FILE_PATH + 'report.png'`` before it is shown.

    NOTE(review): relies on ``plt``, ``sns`` and (when ``save`` is set)
    ``REPORTS_FILE_PATH`` being defined in the notebook namespace; none of
    them is defined in the visible cells.
    """
    # ``r2`` / ``mse`` were not defined anywhere in the visible cells;
    # presumably they were aliases of these sklearn metrics. Importing them
    # here keeps the helper self-contained.
    from sklearn.metrics import mean_squared_error, r2_score

    r2_value = r2_score(true_values, pred_values)
    # Computed once and reused for both the RMSE and the MSE line.
    mse_value = mean_squared_error(true_values, pred_values)

    print("R2:\t" + str(round(r2_value, 5)) + "\n" +
          "RMSE:\t" + str(round(np.sqrt(mse_value), 3)) + "\n" +
          "MSE:\t" + str(round(mse_value, 3))
         )

    plt.figure(figsize=(8, 8))

    sns.scatterplot(x=pred_values, y=true_values)
    # Diagonal where true_values == pred_values.
    # NOTE(review): the 0..500000 range is hard-coded — confirm it matches the
    # scale of the target being evaluated.
    plt.plot([0, 500000], [0, 500000], linestyle='--', color='black')

    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('True vs Predicted values')

    if save:
        plt.savefig(REPORTS_FILE_PATH + 'report.png')
    plt.show()

In [None]:
def plot_feature_importances(importances, X):
    """Plot normalised feature importances as a bar chart, largest first.

    Args:
        importances: one importance value per column of ``X`` (any array-like).
        X: DataFrame whose column names label the bars.

    NOTE(review): relies on ``plt`` (matplotlib.pyplot) being available in the
    notebook namespace; it is not imported in the visible cells.
    """
    # Coerce to an ndarray so that positional fancy-indexing and ``.sum()``
    # below also work for lists and label-indexed Series.
    importances = np.asarray(importances)

    # Positions of the features sorted by descending importance.
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(20, 6))
    plt.title("Feature importances", fontsize=16)
    # Heights are scaled by the total so that the bars sum to 1.
    plt.bar(range(X.shape[1]), importances[indices] / importances.sum(),
            color="darkblue", align="center")
    plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90, fontsize=14)
    plt.xlim([-1, X.shape[1]])

    plt.tight_layout()
    plt.show()

# Input datasets

In [None]:
# Load the competition files: the sample submission (used later as the
# template for the output file), the training set and the test set.
ans = pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv")
df_train = pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/train.csv")
df_test = pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/test.csv")

In [None]:
# Target column, and the feature matrix (everything except the target),
# with the 'Id' column moved into the index.
y = df_train['Cover_Type']
X = df_train.drop(columns='Cover_Type').set_index('Id')

In [None]:
# Test-set features used for the final submission, indexed by 'Id'.
# NOTE: X_final is the same object as df_test (no copy is made), so the
# in-place set_index below changes df_test as well.
X_final = df_test
X_final.set_index('Id', inplace=True)

In [None]:
# Downcast the training feature columns; reduce_mem_usage modifies the frame
# in place and returns it.
X = reduce_mem_usage(X)

In [None]:
# Apply the same downcasting to the test features.
X_final = reduce_mem_usage(X_final)

# Dataset analysis

In [None]:
# Display the feature names.
X.columns

In [None]:
# Print fixed-point (non-scientific) summary statistics for each of the
# listed features of the training set.
arr = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
for column in arr:
    stats = X[column].describe()
    print(column)
    print(stats.apply(lambda value: '{:f}'.format(value)))
    print()

In [None]:
# Repair out-of-range feature values, applying exactly the same rules to the
# training (X) and test (X_final) frames:
#   * negative Aspect values are shifted up by 360;
#   * negative distances are replaced by their absolute value;
#   * Hillshade values outside 0..255 are wrapped with modulo 256.
# NOTE(review): Aspect values of 360 or more are left as they are, and the
# sign of Vertical_Distance_To_Hydrology is discarded — confirm both are
# intended.
distance_cols = [
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
]
hillshade_cols = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']

for frame in (X, X_final):
    negative_aspect = frame['Aspect'] < 0
    frame.loc[negative_aspect, 'Aspect'] += 360

    for col in distance_cols:
        negative = frame[col] < 0
        frame.loc[negative, col] *= -1

    for col in hillshade_cols:
        # Python/NumPy modulo is non-negative for a positive divisor, so one
        # wrap covers both values below 0 and values of 256 or more.
        out_of_range = (frame[col] < 0) | (frame[col] >= 256)
        frame.loc[out_of_range, col] %= 256

In [None]:
# Print fixed-point summary statistics for every Soil_Type1..Soil_Type40
# column of the training set.
arr = ['Soil_Type{}'.format(number) for number in range(1, 41)]
for column in arr:
    stats = X[column].describe()
    print(column)
    print(stats.apply(lambda value: '{:f}'.format(value)))
    print()

In [None]:
# Summary statistics of the test features after cleaning.
X_final.describe()

In [None]:
# Column dtypes and memory footprint of the training features.
X.info()

In [None]:
# Column dtypes and memory footprint of the test features.
X_final.info()

# Train model

In [None]:
# Hold out 30% of the training data for validation; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Validation model: fitted on the 70% training split only.
model = CatBoostClassifier(
    thread_count=3,
    iterations=500,
    l2_leaf_reg=4.0,
    learning_rate=0.035,
    random_seed=63,
)
model.fit(X_train, y_train)

# Predictions for the held-out split.
y_pred_test = model.predict(X_test)

In [None]:
# Score of the validation model on the held-out split.
model.score(X_test, y_test)

In [None]:
# Final model: same hyper-parameters as the validation model, but fitted on
# the complete training set.
model_ans = CatBoostClassifier(
    thread_count=3,
    iterations=500,
    l2_leaf_reg=4.0,
    learning_rate=0.035,
    random_seed=63,
)
model_ans.fit(X, y)

# NOTE: despite the name, these are predictions for the training data itself
# (this overwrites the earlier hold-out predictions), and the score below is
# measured on the same rows the model was fitted on.
y_pred_test = model_ans.predict(X)
model_ans.score(X, y)

In [None]:
# Predict the target for the test features with the final model and display
# the result.
y_pred_final = model_ans.predict(X_final)
y_pred_final

In [None]:
# Wrap the predictions in a single-column DataFrame named 'target1'.
y_pred_df = pd.DataFrame(y_pred_final, columns=['target1'])
# ans_pd is just a second name for the same frame (no copy).
ans_pd = y_pred_df

In [None]:
# Display the prediction frame.
ans_pd

In [None]:
# Put the predictions into the sample-submission template and write it out
# without the row index.
# NOTE(review): assigning a one-column DataFrame aligns on the row index, so
# this assumes the sample submission rows are in the same order as the test
# set — confirm.
ans['Cover_Type'] = ans_pd
ans.to_csv('submission.csv',index=False)

In [None]:
# Display the final submission frame.
ans