# Tabular Playground Series - Dec 2021

In [None]:
#import libraries
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# reduce memory usage function
# credits : Guillaume Martin (https://www.kaggle.com/gemartin/load-data-reduce-memory-usage/notebook)

def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``.
    * Signed and unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's minimum
      and maximum (bounds inclusive).
    * Float columns are cast to ``float16`` or ``float32`` when their value
      range fits. This trades precision for memory.
    * Every other dtype (bool, datetime, category, pandas extension
      dtypes, ...) is left untouched.

    Columns whose min/max cannot be compared against the candidate ranges
    (for example empty or all-NaN columns) keep their dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Candidate dtypes per NumPy dtype kind, narrowest first.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == 'object':
            df[col] = df[col].astype('category')
            continue

        # Dispatch on the NumPy dtype kind rather than on the dtype's name:
        # a name-prefix test would route 'uint8' or 'bool' columns into the
        # float branch. Extension dtypes have no NumPy kind and are skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind in int_candidates:
            candidates, limits = int_candidates[kind], np.iinfo
        elif kind == 'f':
            candidates, limits = float_candidates, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: the extreme values of a type are representable.
            # NaN min/max (empty or all-NaN column) fail both comparisons,
            # so such a column is never cast.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    return df

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
df = pd.read_csv(
    "../input/tabular-playground-series-dec-2021/train.csv"
)
df_test = pd.read_csv(
    "../input/tabular-playground-series-dec-2021/test.csv"
)

In [None]:
# Shrink both frames; reduce_memory_usage modifies its argument in place and
# returns that same object, so rebinding keeps the names pointing at it.
df = reduce_memory_usage(df)
df_test = reduce_memory_usage(df_test)

### EDA

In [None]:
# Preview the first 5 rows of the training frame (head() defaults to n=5).
df.head()

In [None]:
# Concise summary of the training frame: columns, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics with one row per column, ordered from the largest
# to the smallest standard deviation.
(
    df.describe()
    .transpose()
    .sort_values(by='std', ascending=False)
)

In [None]:
# For every column, report whether it contains at least one missing value.
df.isna().any()

In [None]:
# Target distribution: bar chart of the number of training rows per Cover_Type value.
# The trailing semicolon keeps the notebook from echoing the returned plot object.
sns.countplot(x="Cover_Type",data=df,palette='Blues_r');


In [None]:
# Exact number of training rows per Cover_Type value (value_counts sorts by count, largest first).
df["Cover_Type"].value_counts()

### Preprocessing

In [None]:
# predictors: every training column except the row id, the target and two
# soil-type columns (presumably uninformative - confirm against the EDA above)
X = df.drop(columns=['Id', 'Cover_Type', 'Soil_Type7', 'Soil_Type15'])

# target
y = df['Cover_Type']

# test data: drop the same id and soil-type columns as for X
test = df_test.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=15,
)


### XGBClassifier

In [None]:
# Fit a gradient-boosted tree classifier on the training split.
# tree_method='gpu_hist' selects XGBoost's GPU histogram algorithm, so this
# cell presumably needs a GPU-enabled runtime - confirm before re-running.
# NOTE(review): newer XGBoost releases appear to deprecate 'gpu_hist' and to
# expect class labels starting at 0 - confirm against the installed version.
from xgboost import XGBClassifier
clf=XGBClassifier(learning_rate=0.3, tree_method='gpu_hist', random_state=6)
clf.fit(X_train,y_train)

In [None]:
# Predict a class for every row of the held-out validation split.
y_pred=clf.predict(X_val)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of validation rows whose predicted class matches the true class.
accuracy_score(y_true=y_val, y_pred=y_pred)

In [None]:
# Predict a class for every row of the prepared test frame
# (the test frame had the same id and soil-type columns dropped as X).
pred=clf.predict(test)

In [None]:
# Fill the sample-submission template with the test predictions, write it
# out without the index column, and preview the first rows.
prediction = pd.read_csv(
    '../input/tabular-playground-series-dec-2021/sample_submission.csv'
)
prediction['Cover_Type'] = pred
prediction.to_csv('submission.csv', index=False)
prediction.head()

### Thank You