In [None]:
import pandas as pd
# Read the training CSV and preview its first rows.
data = pd.read_csv(
    "../input/tabular-playground-series-dec-2021/train.csv"
)
data.head()

In [None]:
import numpy as np
def reduce_mean_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to smaller dtypes.

    Integer columns (int16/int32/int64) are cast to the narrowest signed
    integer type whose inclusive range contains the column's min and max.
    float64 columns are cast to float32 when their min and max fit within
    float32's range (this can lose precision). Every other column is left
    untouched, including float16/float32 columns, which are never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Ordered narrowest first so the first match is the smallest type that fits.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        if str(col_type)[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN bounds (empty or all-NaN column) and infinities fail these
            # comparisons, so such columns keep their float64 dtype.
            if c_min >= float32_limits.min and c_max <= float32_limits.max:
                df[col] = df[col].astype(np.float32)
        # float16 / float32 columns fall through: casting them could only
        # widen them, which would increase memory use.

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage is decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df


In [None]:
# Downcast the numeric columns, then inspect the resulting dtypes and memory.
data = reduce_mean_usage(data)
data.info()

In [None]:
# Names of every column whose label contains 'Wilderness_Area'.
Wilderness_Area_cols = [
    name
    for name in data.columns
    if 'Wilderness_Area' in name
]
Wilderness_Area_cols

In [None]:
# Pull the wilderness columns out into their own frame.
Wilderness_Area = data.loc[:, Wilderness_Area_cols]
Wilderness_Area.head()

In [None]:
# For each row, take the label of the column holding the row maximum.
# DataFrame.idxmax(axis=1) does this in one vectorised call instead of a
# Python-level lambda per row. On ties it returns the first such column, so a
# row whose values are all equal is labelled with the first column.
df_n = Wilderness_Area.idxmax(axis=1)
df_n.head()

In [None]:
# Wrap the per-row labels in a one-column frame named 'Wilderness'.
Wilderness_Area = pd.DataFrame({'Wilderness': df_n})
Wilderness_Area.head()

In [None]:
# Row count per wilderness label.
Wilderness_Area['Wilderness'].value_counts()

In [None]:
# Shorten the labels from 'Wilderness_AreaN' to 'AreaN'.
Wilderness_Area['Wilderness'] = Wilderness_Area['Wilderness'].map(
    {
        'Wilderness_Area1': 'Area1',
        'Wilderness_Area2': 'Area2',
        'Wilderness_Area3': 'Area3',
        'Wilderness_Area4': 'Area4',
    }
)
Wilderness_Area['Wilderness'].value_counts()

In [None]:
# Drop the original wilderness columns using the list collected earlier in
# the notebook (Wilderness_Area_cols) rather than a second hard-coded copy of
# the names, matching how the soil columns are dropped.
data = data.drop(columns=Wilderness_Area_cols)
data.head()

In [None]:
# Append the single 'Wilderness' column, then free the temporaries.
data = pd.concat([data, Wilderness_Area], axis='columns')
del Wilderness_Area, df_n
data.head()

In [None]:
# Names of every column whose label contains 'Soil_Type'.
Soil_Type_cols = [
    name
    for name in data.columns
    if 'Soil_Type' in name
]
Soil_Type_cols

In [None]:
# Pull the soil columns out into their own frame.
Soil_Type = data.loc[:, Soil_Type_cols]
Soil_Type.head()

In [None]:
# For each row, take the label of the column holding the row maximum.
# DataFrame.idxmax(axis=1) does this in one vectorised call instead of a
# Python-level lambda per row. On ties it returns the first such column, so a
# row whose values are all equal is labelled with the first column.
df_n = Soil_Type.idxmax(axis=1)
df_n.head()

In [None]:
# Wrap the per-row labels in a one-column frame named 'Soil'.
Soil_Type = pd.DataFrame({'Soil': df_n})
Soil_Type.head()

In [None]:
# Row count per soil label.
Soil_Type['Soil'].value_counts()

In [None]:
# Strip the literal 'Soil_' prefix from each label.
Soil_Type['Soil'] = Soil_Type['Soil'].str.replace('Soil_', '', regex=False)
Soil_Type['Soil'].value_counts()

In [None]:
# Remove the original soil columns now that they are summarised in one label.
data = data.drop(columns=Soil_Type_cols)
data.head()

In [None]:
# Append the single 'Soil' column, then free the temporaries.
data = pd.concat([data, Soil_Type], axis='columns')
del Soil_Type, df_n
data.head()

In [None]:
# Re-check dtypes and memory after the column changes above.
data.info()

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Share of rows per Cover_Type, in percent, rounded to two decimals.
(data['Cover_Type'].value_counts() * 100 / len(data)).round(2)

In [None]:
# Absolute row count per Cover_Type.
data['Cover_Type'].value_counts()

In [None]:
# Drop Cover_Type classes 3-7 in a single pass instead of five successive
# filters, then draw a 250000-row subsample.
data = data[~data['Cover_Type'].isin([3, 4, 5, 6, 7])]
# random_state makes the draw repeatable across kernel restarts; 42 matches
# the session_id passed to setup() further down the notebook.
data = data.sample(n=250000, random_state=42).reset_index(drop=True)
data.shape

In [None]:
# Class balance of the subsample, in percent.
(data['Cover_Type'].value_counts() * 100 / len(data)).round(2)

In [None]:
# Recode the target: Cover_Type 1 -> 0 and 2 -> 1.
data['Cover_Type'] = data['Cover_Type'].map({1: 0, 2: 1})
(data['Cover_Type'].value_counts() * 100 / len(data)).round(2)

In [None]:
# Distinct target values after recoding.
data['Cover_Type'].unique()

In [None]:
# Global pandas display option: render floats with two decimals from here on.
pd.options.display.float_format = '{:.2f}'.format
data.describe()

In [None]:
# Distinct-value count per column, largest first.
data.nunique().sort_values(ascending=False)

In [None]:
!pip install pycaret[full]

In [None]:
# Import only the pycaret entry points this notebook calls, instead of a
# wildcard import that hides where each name comes from.
from pycaret.classification import (
    create_model,
    finalize_model,
    plot_model,
    predict_model,
    setup,
)

In [None]:
# Configure the pycaret experiment; each option is passed exactly as before.
forest = setup(
    data=data,
    target="Cover_Type",
    session_id=42,
    normalize=True,
    train_size=0.8,  # train on 80% of the available data
    handle_unknown_categorical=True,
    remove_multicollinearity=True,  # drop one of two highly correlated features
    ignore_low_variance=True,  # drop categorical features with insignificant variance
    ignore_features=['Id'],
    categorical_features=['Wilderness'],
    high_cardinality_features=['Soil'],
    combine_rare_levels=True,
    fix_imbalance=True,
    unknown_categorical_method='most_frequent',
    transformation=True,
    silent=True,
)

In [None]:
# Train the 'xgboost' estimator through pycaret and display the fitted model.
model_xgboost = create_model('xgboost')
model_xgboost

In [None]:
# Confusion matrix computed on the training data.
plot_model(
    model_xgboost,
    plot='confusion_matrix',
    use_train_data=True,
)

In [None]:
# Confusion matrix with plot_model's default data (use_train_data not set).
plot_model(model_xgboost, plot='confusion_matrix')

In [None]:
# Classification report computed on the training data.
plot_model(
    model_xgboost,
    plot='class_report',
    use_train_data=True,
)

In [None]:
# Classification report with plot_model's default data (use_train_data not set).
plot_model(model_xgboost, plot='class_report')

In [None]:
# Run predict_model on the trained model without passing any data.
# NOTE(review): presumably this scores pycaret's hold-out split — confirm.
predict_model(model_xgboost)

In [None]:
# Finalize the model; this finalized object is what predicts on the test file.
# NOTE(review): presumably finalize_model refits on all data passed to setup() — confirm.
final_xgboost = finalize_model(model_xgboost)
final_xgboost

In [None]:
# Read the test CSV and preview its first rows.
test = pd.read_csv(
    "../input/tabular-playground-series-dec-2021/test.csv"
)
test.head()

In [None]:
# Downcast the numeric columns of the test frame, then inspect its dtypes.
test = reduce_mean_usage(test)
test.info()

In [None]:
# Names of every test column whose label contains 'Wilderness_Area'.
Wilderness_Area_cols = [
    name
    for name in test.columns
    if 'Wilderness_Area' in name
]
Wilderness_Area_cols

In [None]:
# Pull the wilderness columns of the test frame out into their own frame.
Wilderness_Area = test.loc[:, Wilderness_Area_cols]
Wilderness_Area.head()

In [None]:
# For each row, take the label of the column holding the row maximum.
# DataFrame.idxmax(axis=1) does this in one vectorised call instead of a
# Python-level lambda per row. On ties it returns the first such column, so a
# row whose values are all equal is labelled with the first column.
df_n = Wilderness_Area.idxmax(axis=1)
df_n.head()

In [None]:
# Wrap the per-row labels in a one-column frame named 'Wilderness'.
Wilderness_Area = pd.DataFrame({'Wilderness': df_n})
Wilderness_Area.head()

In [None]:
# Row count per wilderness label in the test frame.
Wilderness_Area['Wilderness'].value_counts()

In [None]:
# Shorten the labels from 'Wilderness_AreaN' to 'AreaN', as done for training.
Wilderness_Area['Wilderness'] = Wilderness_Area['Wilderness'].map(
    {
        'Wilderness_Area1': 'Area1',
        'Wilderness_Area2': 'Area2',
        'Wilderness_Area3': 'Area3',
        'Wilderness_Area4': 'Area4',
    }
)
Wilderness_Area['Wilderness'].value_counts()

In [None]:
# Drop the original wilderness columns using the list collected from the test
# frame (Wilderness_Area_cols) rather than a second hard-coded copy of the
# names, matching how the soil columns are dropped.
test = test.drop(columns=Wilderness_Area_cols)
test.head()

In [None]:
# Append the single 'Wilderness' column to the test frame.
test = pd.concat([test, Wilderness_Area], axis='columns')
test.head()

In [None]:
# Names of every test column whose label contains 'Soil_Type'.
Soil_Type_cols = [
    name
    for name in test.columns
    if 'Soil_Type' in name
]
Soil_Type_cols

In [None]:
# Pull the soil columns of the test frame out into their own frame.
Soil_Type = test.loc[:, Soil_Type_cols]
Soil_Type.head()

In [None]:
# For each row, take the label of the column holding the row maximum.
# DataFrame.idxmax(axis=1) does this in one vectorised call instead of a
# Python-level lambda per row. On ties it returns the first such column, so a
# row whose values are all equal is labelled with the first column.
df_n = Soil_Type.idxmax(axis=1)
df_n.head()

In [None]:
# Wrap the per-row labels in a one-column frame named 'Soil'.
Soil_Type = pd.DataFrame({'Soil': df_n})
Soil_Type.head()

In [None]:
# Strip the literal 'Soil_' prefix from each label, as done for training.
Soil_Type['Soil'] = Soil_Type['Soil'].str.replace('Soil_', '', regex=False)
Soil_Type['Soil'].value_counts()

In [None]:
# Remove the original soil columns from the test frame.
test = test.drop(columns=Soil_Type_cols)
test.head()

In [None]:
# Append the single 'Soil' column to the test frame.
test = pd.concat([test, Soil_Type], axis='columns')
test.head()

In [None]:
# Predict on the prepared test frame with the finalized model.
predictions = predict_model(final_xgboost, data=test)
predictions.head()

In [None]:
# Keep only the identifier and the predicted label. The explicit copy makes
# this an independent frame, so the later assignment to its 'Label' column
# does not raise SettingWithCopyWarning.
predictions = predictions[['Id', 'Label']].copy()
predictions.head()

In [None]:
# Row count per predicted label.
predictions['Label'].value_counts()

In [None]:
# Map labels 0/1 back to Cover_Type codes 1/2, the inverse of the recoding
# applied to the training target.
predictions['Label'] = predictions['Label'].map({0: 1, 1: 2})
predictions['Label'].value_counts()

In [None]:
# Rename the two columns positionally; the second one becomes 'Cover_Type'.
predictions.columns = ['Id', 'Cover_Type']
predictions.head()

In [None]:
# Write the predictions to CSV without the index column.
predictions.to_csv('./predictions.csv',index=False)