# Don't forget to upvote if this notebook is helpful.


**Objective:** The idea behind this notebook is a simple EDA followed by two models.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from  datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import datetime as datetime
from sklearn.metrics import accuracy_score
from category_encoders.target_encoder import TargetEncoder
warnings.filterwarnings('ignore')

In [None]:
# Load the competition train and test CSV files from the Kaggle input directory.
train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv")

### Explore data

In [None]:
# Preview the first 100 rows of the training frame.
train.head(100)

In [None]:
# Preview the last rows of the training frame.
train.tail()

In [None]:
# List the column labels of the training frame.
train.columns

In [None]:
# Summary statistics, transposed so each feature is a row; show the first 10 features.
train.describe().T.head(10)

### Iterate through all the columns of a dataframe and modify the data type to reduce memory usage

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the saving.

    Rules applied per column:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest signed /
      unsigned integer type whose range contains the column's min and max
      (bounds inclusive).
    * Float columns are cast to ``float16`` if their range fits, else
      ``float32``, else ``float64``. NOTE: the float cast is chosen on range
      only, so it can lose precision.
    * Every other dtype (bool, datetime, timedelta, category, other pandas
      extension dtypes) is left untouched, which also makes the function safe
      to call twice on the same frame.

    Args:
        df: the ``pandas.DataFrame`` to shrink; it is modified in place.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types, narrowest first.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, ...) are not plain numpy
        # dtypes; min()/astype() on them can fail or change semantics.
        if not isinstance(col_type, np.dtype):
            continue

        # dtype.kind: 'i' signed int, 'u' unsigned int, 'f' float.
        # bool ('b'), datetime ('M'), timedelta ('m') etc. are skipped.
        kind = col_type.kind
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind in ('i', 'u'):
            candidates = signed_types if kind == 'i' else unsigned_types
            for target in candidates:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            # No match (e.g. an empty column whose min/max is NaN): keep as is.
        else:
            for target in float_types:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Range does not fit (or min/max is NaN/inf): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the percentage is always finite.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

In [None]:
# Downcast both frames; reduce_mem_usage prints the memory usage before and after.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
# Drop the 'Id' column from both the training and the test frame.
train = train.drop('Id', axis=1)
test=test.drop('Id', axis=1)

In [None]:
# Summary statistics for the first 10 features (without 'count'), with maxima highlighted in dark red.
train.describe().T.drop('count', axis=1).head(10).style.highlight_max(color='darkred')

In [None]:
# Preview only the numeric columns of the training frame.
train.select_dtypes(include=np.number).head()

In [None]:
# print("Mean",np.mean(train))
# print("Standard Deviation", np.std(train))
# print("Min", np.min(train))
# print("Max", np.max(train))

In [None]:
# Summary statistics of the Cover_Type column.
train['Cover_Type'].describe()

In [None]:
# Number of rows for each Cover_Type value.
train.groupby('Cover_Type').size()

In [None]:
# Histogram of the Cover_Type values.
plt.hist(train['Cover_Type'])

In [None]:
# Skewness and kurtosis of the Cover_Type distribution.
print("Skewness: %f" % train['Cover_Type'].skew())
print("Kurtosis: %f" % train['Cover_Type'].kurt())

### Relationship with numerical variables

In [None]:
# Scatter plot of one numeric feature against the target.
var = 'Hillshade_Noon'
data = pd.concat([train['Cover_Type'], train[var]], axis=1)
# Let matplotlib autoscale the y-axis: Cover_Type is a class label (it is fed
# to classifiers later in this notebook), so a fixed 0-800000 range would
# flatten every point onto the bottom edge of the plot.
data.plot.scatter(x=var, y='Cover_Type')

In [None]:
# Pairwise correlation matrix of the training frame, then the v rows whose
# correlation with Cover_Type is largest (signed, not absolute).
corr=train.corr()
v=10
colmn = corr.nlargest(v, 'Cover_Type')
colmn

### Correlation

In [None]:
# Labels of the v columns most correlated with Cover_Type (uses `corr` and `v` from the previous cell).
colmn = corr.nlargest(v, 'Cover_Type')['Cover_Type'].index 
# Correlation matrix restricted to those columns; rows of the transposed array are the variables.
xm = np.corrcoef(train[colmn].values.T)
sns.set(font_scale=1.25)
plt.figure(figsize=(18, 18))
# Annotated heatmap with the column names on both axes.
hm = sns.heatmap(xm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=colmn.values, xticklabels=colmn.values)
plt.show()


### Missing Values
There are no missing values.

In [None]:
# Count missing values per column, sorted so the columns with the most missing values come first.
total= train.isnull().sum().sort_values(ascending=False)
total.head()

In [None]:
# Features: every column except the target.
train_X = train.drop('Cover_Type', axis=1)
# Target: the Cover_Type column.
train_y = train['Cover_Type']

In [None]:
# Peek at the first target values.
train_y.head()

### Split the dataset

In [None]:
# Train/validation split: hold out 22% of the rows, with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.22, random_state=2021 )

In [None]:
# Remove the names of the un-split frames now that the train/validation split exists.
del train, train_X, train_y

In [None]:
# Display the held-out targets.
y_test

### Split categorical and numerical columns

In [None]:
# Partition the feature columns by dtype in a single pass: float columns are
# treated as numerical, every other column as categorical.
float_dtypes = ["float16", "float32", "float64"]
nums_cols = []
catgo_cols = []
for col in X_train.columns:
    if X_train[col].dtype in float_dtypes:
        nums_cols.append(col)
    else:
        catgo_cols.append(col)

In [None]:
# catgo_cols

### Encode features
<!-- Divided categorical and Numerical -->

In [None]:
# Second handle on the competition test frame; the name `test` is deleted and
# later re-bound to the scaled array.
d_test = test

# Fit ONE target encoder over all categorical columns. Creating a new
# single-column encoder per loop iteration and overwriting x / y / z each time
# leaves only the last column of catgo_cols encoded.
enc = TargetEncoder(cols=catgo_cols)
x = enc.fit_transform(X_train, y_train)  # statistics learned from the training split only
y = enc.transform(X_test)                # validation split
z = enc.transform(d_test)                # competition test set


In [None]:
# Remove the `test` name; the frame itself is still referenced by d_test.
del test

### Normalization
<!-- ### Encoder featurs
Divided categorical and Numerical -->

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the encoded training frame `x`, i.e. on the same
# representation it is asked to transform. Fitting on the raw X_train would
# learn centers and scales from values that x / y / z no longer contain.
scaler = RobustScaler()
scaler.fit(x)
train_X = pd.DataFrame(scaler.transform(x))  # training split
test_X = pd.DataFrame(scaler.transform(y))   # validation split
test = pd.DataFrame(scaler.transform(z))     # competition test set

In [None]:
# Remove the names of the intermediate encoded frames.
del x,y,z

In [None]:
# Display the scaled training features.
train_X

### Convert into NumPy arrays
<!-- ### Normalization
### Encoder featurs
Divided categorical and Numerical -->

In [None]:
# Convert the scaled features and the targets to plain NumPy arrays for the models.
train_X = train_X.to_numpy()
y_train =y_train.to_numpy()
y_test =y_test.to_numpy()
test_X =test_X.to_numpy()
test =test.to_numpy()

In [None]:
#after convert to numpy
train_X  

In [None]:
# XGBClassifier = xgb.XGBClassifier(max_depth = 9,
#                                  learning_rate = 0.008,
#                                  n_estimators = 20000,
# #                                  objective = 'binary:logistic',
#                                  objective = 'multi:softmax',
#                                  tree_method = 'gpu_hist',
#                                  booster = 'gbtree',
#                                  gamma = 0.75,
#                                  max_delta_step = 6,
#                                  min_child_weight = 9,
#                                  subsample = 0.8326,
#                                  colsample_bytree = 0.7,
#                                  colsample_bynode= 0.7523,
#                                  colsample_bylevel=0.8231,
#                                  predictor= 'gpu_predictor'
# #                                  n_jobs = -1
#                                  )

# Modeling
Let's check several models first, then take the most accurate model for submission.

### XGBClassifier

In [None]:
from xgboost import XGBClassifier


# Hyper-parameters for a multi-class, GPU-trained gradient-boosted tree model.
# NOTE(review): 'gpu_hist', 'predictor' and passing early_stopping_rounds to
# fit() are version-dependent in XGBoost - confirm against the installed version.
params = {
#             'objective':'binary:logistic',/
            'objective' : 'multi:softmax',
            'tree_method': 'gpu_hist',
            'eval_metric': 'mlogloss',
            'booster' : 'gbtree',
            'subsample' : 0.8326,
            'gamma' : 0.48,
            'max_depth': 7,
            'alpha': 10,
            'learning_rate': .027,
            'n_estimators':3000,
            'predictor': 'gpu_predictor'
        }       
           
          
# Instantiate the classifier. Note that `xgb` is the model instance here, not
# the xgboost module.
xgb = XGBClassifier(**params)
# Train on the scaled training split; the held-out split is the evaluation set
# monitored for early stopping (200 rounds).
xgb.fit(train_X, y_train,
          early_stopping_rounds=200,
          eval_set=[(test_X,y_test)],
          verbose=True)


In [None]:
# Accuracy of the XGBoost model on the held-out split.
preds_valid = xgb.predict(test_X).astype('int')
acc = accuracy_score(y_test,  preds_valid)
print("accuracy score:", acc)

### CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier
# CatBoost with its default hyper-parameters, configured for GPU device '0';
# no evaluation set is passed to fit().
model = CatBoostClassifier( task_type = 'GPU',devices = '0')
model.fit(train_X, y_train)

In [None]:
# Accuracy of the CatBoost model on the held-out split (overwrites preds_valid / acc).
preds_valid = model.predict(test_X).astype('int')
acc = accuracy_score(y_test,  preds_valid)
print("accuracy score:", acc)

### Submission

1. **XGB**

In [None]:
# Load the sample submission, fill Cover_Type with the XGBoost predictions for
# the scaled competition test set, and write it to submission.csv.
sub = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
sub['Cover_Type'] =xgb.predict(test).astype('int')
sub.to_csv("submission.csv",index=False)
sub.head()

2.  **CatBoostClassifier** 

In [None]:
# Reuse the `sub` frame loaded for the XGBoost submission: overwrite Cover_Type
# with the CatBoost predictions and write a second file, cat_submission.csv.
sub['Cover_Type'] =model.predict(test).astype('int')
sub.to_csv("cat_submission.csv",index=False)
sub.head()

More work coming up! If this notebook helps you, please do upvote.