In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')


import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

#You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
#You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Keep the competition folder in one place so the three reads cannot drift apart.
DATA_DIR='/kaggle/input/tabular-playground-series-dec-2021'

sample=pd.read_csv(os.path.join(DATA_DIR,'sample_submission.csv'))
train=pd.read_csv(os.path.join(DATA_DIR,'train.csv'))
test=pd.read_csv(os.path.join(DATA_DIR,'test.csv'))

**Train Data**

In [None]:
# Summary statistics, one row per column, ordered by mean and colour-graded.
(
    train.describe()
    .T
    .iloc[1:]  # skip the first row of the transposed summary (presumably 'Id' - verify)
    .sort_values(by='mean', ascending=False)
    .style.background_gradient()
)

In [None]:
# Drop the identifier column by name. errors='ignore' keeps this cell re-runnable
# once `train` no longer has an 'Id' column.
train=train.drop(columns='Id',errors='ignore')

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

> Check for any null values

**Reduce Memory Usage**

In [None]:
#This function is adapted from a reference implementation for reducing memory usage
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Every column whose dtype is one of int16/int32/int64/float16/float32/float64
    is inspected, and its minimum and maximum decide the target dtype. Integer
    columns are tried against int8, int16, int32 and int64 in that order; the
    remaining numeric columns against float16 and float32, falling back to
    float64. The range checks are inclusive, so a column spanning exactly
    -128..127 is stored as int8.

    The frame is modified in place and also returned.

    Note: float16 keeps only about three significant decimal digits, so float
    columns can lose precision when they are narrowed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the memory usage after optimization and the
        percentage saved; when false, print nothing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with narrowed column dtypes.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type whose (inclusive) range contains the column.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max is NaN: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is:{:.1f} MB'.format(end_mem))
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(saved_pct))
    return df

In [None]:
# Downcast the numeric columns of both frames to shrink their memory footprint.
train=reduce_mem_usage(train)
test=reduce_mem_usage(test)

In [None]:
# Re-inspect the dtypes and memory usage now that the columns have been downcast.
train.info()

In [None]:
# Display the training frame.
train

In [None]:
import plotly.express as px

# Aggregate to one row per class first: handing every training row to plotly
# makes the figure needlessly heavy, while the resulting sector sizes are the same.
cover_counts=train['Cover_Type'].value_counts()
px.pie(names=cover_counts.index,values=cover_counts.values,title='Cover_Type Distributions')

> Cover Type 5 has a share of just 0.000025% in the pie chart, so the classes are extremely imbalanced

**Continuous variables**

In [None]:

# Histograms of the first ten columns of `train`, drawn on a 5x2 grid of axes.
fig, axes = plt.subplots(5, 2, figsize=(20, 20))
for panel, feature in zip(axes.flat, train.columns[:10]):
    sns.histplot(data=train, x=train[feature], color='green', ax=panel)
    panel.set_xlabel(feature, color='green')

plt.show();

* **Aspect values should lie in the range 0-360, but some values are negative and some are greater than 360.**
* **Some Hillshade values are negative, so those need to be checked as well.**

In [None]:
plt.figure(figsize=(30,20))
# Compute the correlation matrix once and reuse it for the mask and the plot.
corr=train.corr()
# Hide the upper triangle (diagonal included); the matrix is symmetric.
mask=np.triu(np.ones_like(corr))
sns.heatmap(corr,cmap='coolwarm',mask=mask);

**Correcting the values of the features**

In [None]:
def feat(df):
    """Force the Hillshade and Aspect columns of ``df`` into their valid ranges.

    The three hillshade columns ('Hillshade_9am', 'Hillshade_Noon',
    'Hillshade_3pm') are clipped to 0..255, and 'Aspect' is wrapped into
    0 <= Aspect < 360. Missing values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns above; it is modified in place.

    Returns
    -------
    None
    """
    # All three hillshade columns get the same treatment on both ends.
    for col in ('Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'):
        df[col] = df[col].clip(lower=0, upper=255)
    # The modulo wraps any number of full turns in either direction and always
    # yields a non-negative result, including for fractional angles.
    df['Aspect'] = df['Aspect'] % 360


In [None]:
# Apply the range corrections to both frames; feat modifies its argument in place.
feat(train)
feat(test)

**Model**

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:


# Separate the feature matrix from the target column (dropped by name).
X=train.drop(columns='Cover_Type')
Y=train['Cover_Type']

In [None]:
# Fix the seed so the hold-out split, and the tuning results that depend on it,
# are the same on every run.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.32,random_state=42)

In [None]:
# Hold-out accuracy for each candidate ensemble size (100, 200, ..., 900).
acc=[]
for n_trees in range(100,1000,100):
    xgb=XGBClassifier(
        n_estimators=n_trees,
        n_jobs=-1,
        booster='gbtree',
        predictor='gpu_predictor',
        tree_method='gpu_hist',
    )
    xgb.fit(X_train,Y_train)
    acc.append(accuracy_score(Y_test,xgb.predict(X_test)))
    

In [None]:
# Plot hold-out accuracy against the number of trees, with labelled axes so the
# figure can be read on its own.
acc_ax=sns.lineplot(x=range(100,1000,100),y=acc)
acc_ax.set(xlabel='n_estimators',ylabel='accuracy on hold-out split');

> Based on the accuracy curve above, `n_estimators=700` looks like a good choice

In [None]:
# Final classifier, configured like the sweep above but with 700 trees.
xgb=XGBClassifier(
    n_estimators=700,
    n_jobs=-1,
    booster='gbtree',
    predictor='gpu_predictor',
    tree_method='gpu_hist',
)

In [None]:
# Fit the 700-tree model on the training part of the split.
xgb.fit(X_train,Y_train)

In [None]:
# Predict labels for the hold-out part of the split.
pred=xgb.predict(X_test)

In [None]:
# Accuracy on the hold-out split.
# NOTE(review): the same split was used above to choose n_estimators, so this
# figure is likely optimistic.
accuracy_score(Y_test,pred)

**Test data**

In [None]:
# Drop the identifier column by name. errors='ignore' keeps this cell re-runnable
# once `test` no longer has an 'Id' column.
test=test.drop(columns='Id',errors='ignore')


In [None]:
# Predict labels for the competition test frame with the fitted model.
testpredict=xgb.predict(test)

In [None]:
# Pair the ids from the sample submission with the test predictions.
# assumes `sample` rows are in the same order as the rows of `test` - TODO confirm
submission=pd.DataFrame({'Id':sample['Id'],'Cover_Type':testpredict})

In [None]:
# Display the submission frame before it is written to disk.
submission

In [None]:
# to_csv returns None when given a path, so the result must not be assigned
# back to `submission`; doing so would throw the frame away.
submission.to_csv('submission.csv',index=False)

In [None]:
# Read the written file back and preview it, so this cell checks what was
# actually saved rather than the in-memory variable.
pd.read_csv('submission.csv').head()
