In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's train and test tables into DataFrames.
train_csv_path = '../input/tabular-playground-series-dec-2021/train.csv'
test_csv_path = '../input/tabular-playground-series-dec-2021/test.csv'
train, test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [None]:
# Function to reduce memory

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's minimum
      and maximum (bounds inclusive).
    * Float columns wider than 32 bits are cast to ``float32`` when their
      range fits; ``float16`` is deliberately not used.
    * ``object`` columns become ``category``.
    * Every other dtype (bool, datetime, category, extension dtypes, ...) is
      left untouched, because range-based downcasting is meaningless or
      unsafe for them.

    A one-line before/after memory summary is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float dtypes can be downcast by value range.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 fits int8.
                # An empty column yields NaN min/max, so no candidate matches
                # and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    # Never widen a column that is already narrow enough.
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif col_type.itemsize > 4:
            limits = np.finfo(np.float32)
            # Columns holding inf/NaN-only data fail this test and stay as they are.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, decrease))
    return df

In [None]:
# Shrink both frames (train first, then test) and rebind the names to the results.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

In [None]:
# Show the column labels of the training frame.
train.columns

In [None]:
# Pairwise correlation between the columns of the training frame.
train.corr()

In [None]:
# Summary statistics for the columns of the training frame.
train.describe()

In [None]:
# Replace missing values in train with the mean of their column.
column_means = train.mean()
train.fillna(value=column_means, inplace=True)

In [None]:
# Print the frequency table of every training column.
for column_name in train.columns:
    counts = train[column_name].value_counts()
    print("Column name is:", column_name, "and it value is:", counts)
# The frequency tables show which columns have no nulls but hold only 0's;
# such columns carry no information for the models.

Therefore, Soil_Type7 and Soil_Type15 can be removed because they contain only 0's.


In [None]:
# Remove the two soil indicator columns from train, in place.
train.drop(columns=['Soil_Type7', 'Soil_Type15'], inplace=True)


In [None]:
#sns barplots plot the means
f, ax = plt.subplots(5, 2, figsize=(25, 15))

# One panel per feature; ax.flat walks the 5x2 grid row by row, which is the
# same left-to-right, top-to-bottom placement as listing the panels by hand.
mean_plot_columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
for feature, panel in zip(mean_plot_columns, ax.flat):
    sns.barplot(x=train.Cover_Type, y=feature, data=train, ci=None, ax=panel)

We will create categorical columns for Wilderness Area and Soil Type so that these can be used for analysis.

In [None]:
# Names of all training columns whose label contains 'Wilderness_Area'
Wild_cols = list(filter(lambda name: 'Wilderness_Area' in name, train.columns))
Wild_cols

In [None]:
# Keep the Wilderness_Area indicator columns of train in their own frame
Wild_data = train.loc[:, Wild_cols]

In [None]:
# NOTE(review): this rebinds Wild_data to the *test* frame's indicator columns,
# discarding the train selection made just before; any later cell that reads
# Wild_data therefore sees test rows.
Wild_data=test[Wild_cols]

In [None]:
# Collapse the one-hot Wilderness_Area indicators of *train* into one label
# column holding the name of the (first) column with the largest value per row.
# Select from train directly: Wild_data has been rebound to the test frame by
# this point, so reading it here would label train rows with test data.
train['Wilderness_Area'] = train[Wild_cols].idxmax(axis=1)

In [None]:
# Collapse the one-hot Wilderness_Area indicators of *test* into one label
# column. Select from test explicitly so the result does not depend on which
# frame Wild_data happens to point at, and use the vectorized idxmax instead
# of a row-by-row apply.
test['Wilderness_Area'] = test[Wild_cols].idxmax(axis=1)

In [None]:
# Drop the four one-hot Wilderness_Area columns from train, in place.
train_wilderness_columns = ['Wilderness_Area{}'.format(n) for n in range(1, 5)]
train.drop(columns=train_wilderness_columns, inplace=True)

In [None]:
# Drop the four one-hot Wilderness_Area columns from test, in place.
test_wilderness_columns = ['Wilderness_Area{}'.format(n) for n in range(1, 5)]
test.drop(columns=test_wilderness_columns, inplace=True)

In [None]:
# Frequency of each Wilderness_Area label in train.
train.Wilderness_Area.value_counts()

In [None]:
# Frequency of each Wilderness_Area label in test.
test.Wilderness_Area.value_counts()

In [None]:
# Names of the training columns whose label contains 'Soil_Type', and the
# matching slice of train.
soil_col = list(filter(lambda name: 'Soil_Type' in name, train.columns))
soil_data = train.loc[:, soil_col]

In [None]:
# Collapse the one-hot soil indicators of train into one label column holding
# the name of the (first) column with the largest value per row. The
# vectorized DataFrame.idxmax gives the same labels as a row-wise apply
# without a Python call per row.
train['Soil_data'] = train[soil_col].idxmax(axis=1)

In [None]:
# Collapse the one-hot soil indicators of *test* into one label column.
# The labels must come from test's own rows: soil_data is a slice of train,
# so using it here would copy train's soil labels onto test by index position.
test['Soil_data'] = test[soil_col].idxmax(axis=1)

In [None]:
# Drop the one-hot soil columns Soil_Type1..Soil_Type40 from train, skipping
# 7 and 15, which an earlier cell already removed from train.
train_soil_columns = ['Soil_Type{}'.format(n) for n in range(1, 41) if n not in (7, 15)]
train.drop(train_soil_columns, axis=1, inplace=True)

In [None]:
# Drop all forty one-hot soil columns Soil_Type1..Soil_Type40 from test.
test_soil_columns = ['Soil_Type{}'.format(n) for n in range(1, 41)]
test.drop(test_soil_columns, axis=1, inplace=True)

In [None]:
# Frequency of each Soil_data label in train.
train.Soil_data.value_counts()

In [None]:
# Preview the first rows of train after the column reshaping.
train.head()

In [None]:
# Dummy-encode train with pandas.get_dummies.
# NOTE(review): train and test are encoded separately, so their dummy columns
# only line up if both frames contain the same label values — verify.
train=pd.get_dummies(train)

In [None]:
# Dummy-encode test with pandas.get_dummies.
# NOTE(review): train and test are encoded separately, so their dummy columns
# only line up if both frames contain the same label values — verify.
test=pd.get_dummies(test)

In [None]:


# Separate the target from the predictors (the Id column is not a feature),
# then hold out 40% of the rows for evaluation with a fixed seed.
y = train['Cover_Type']
X = train.drop(columns=['Id', 'Cover_Type'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
 



# KNN

In [None]:
# K-nearest-neighbours classifier: 7 neighbours, Euclidean distance,
# fitted on the training split.
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=7)
knn.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the fitted KNN model.
y_pred_knn = knn.predict(X_test)

In [None]:
# Wrap the predictions in a one-column DataFrame and count each predicted class.
# Note that y_pred_knn is rebound to the DataFrame, so later cells receive the
# DataFrame rather than the raw prediction array.
y_pred_knn=pd.DataFrame(y_pred_knn)
y_pred_knn.value_counts()

In [None]:
# Score the KNN model on the held-out split.
knn.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels versus the KNN predictions on the held-out split.
confusion_matrix(y_test, y_pred_knn)

# DECISION TREE CLASSIFICATION

In [None]:
# make predictions using Decision Tree for classification

from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [None]:
# Define the decision-tree model. A fixed random_state makes the fitted tree,
# and therefore the reported accuracy, reproducible from run to run; 42
# matches the seed used for the train/test split.
model = DecisionTreeClassifier(random_state=42)


In [None]:
# fit the model on the training split only (X_test / y_test stay held out)
model.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the fitted decision tree.
y_pred = model.predict(X_test)


In [None]:
# Share of held-out rows the decision tree classified correctly.
tree_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", tree_accuracy)

The Decision Tree Classifier is much more accurate than KNN. Hence, of these two, we prefer the Decision Tree model.

In [None]:
import xgboost as xgb

# Wrap the training split (features and labels) in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)


In [None]:
# Training parameters passed to xgb.train.
params = {
    'max_depth': 3,
    'objective': 'multi:softmax',  # multiclass objective; presumably predict() then returns class labels, not probabilities — confirm in the XGBoost docs
    # presumably Cover_Type labels run 1..7, so 8 leaves room for an unused class 0 — TODO confirm
    'num_class': 8,
    # NOTE(review): may not be recognised by current XGBoost releases — verify
    'n_gpus': 0
}

In [None]:
# Train a booster on dtrain; the number of boosting rounds is left at the library default.
bst = xgb.train(params, dtrain)

In [None]:
# Wrap the held-out features (no labels) in a DMatrix for prediction.
dtest = xgb.DMatrix(data=X_test)

In [None]:
 # Predict the held-out split with the trained booster.
 # NOTE(review): this cell is indented by one stray space; it only runs where the
 # notebook front end dedents the cell.
 pred = bst.predict(dtest)

In [None]:
from sklearn.metrics import classification_report

# Text report of classification metrics for the XGBoost predictions on the held-out split.
print(classification_report(y_test, pred))

In [None]:
# Confusion matrix of true labels versus the XGBoost predictions on the held-out split.
confusion_matrix(y_test, pred)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier #Random Forest
# Random Forest with 100 trees. A fixed random_state makes the forest, and
# therefore both the reported accuracy and the final submission, reproducible;
# 42 matches the seed used for the train/test split.
model4=RandomForestClassifier(n_estimators=100, random_state=42)
model4.fit(X_train,y_train.values.ravel())
RF_prediction=model4.predict(X_test)
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the conventional order keeps this call consistent with the other cells.
print('The accuracy of the Random Forests model is \t\t',metrics.accuracy_score(y_test,RF_prediction))

In [None]:
# Build the prediction matrix with exactly the columns, and column order, the
# model was fitted on. The identifier column is named 'Id' (not 'id'), and it
# is left in `test` because the submission index is read from test.Id later.
# reindex also supplies an all-zero column for any dummy that exists in the
# training features but not in test, and ignores columns the model never saw.
test_features = test.reindex(columns=X.columns, fill_value=0)
pred_final = model4.predict(test_features)

In [None]:
# Put the final predictions in a DataFrame indexed by the test frame's Id column.
# This requires `test` to still contain Id at this point.
y_pred_new=pd.DataFrame(pred_final)
y_pred_new.index=test.Id

In [None]:
# Give the single prediction column (default label 0) its submission name.
y_pred_new.rename(columns={0: 'Cover_Type'}, inplace=True)

In [None]:
# Store the predicted class as an integer column.
y_pred_new['Cover_Type']=y_pred_new['Cover_Type'].astype(int)

In [None]:
# Write the predictions, together with their index, to submission.csv in the working directory.
y_pred_new.to_csv('submission.csv')

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn import metrics
# Gaussian Naive Bayes: fit on the training split, then report accuracy on the
# held-out split.
model2 = GaussianNB()
model2.fit(X_train, y_train)
NB_prediction = model2.predict(X_test)
nb_accuracy = metrics.accuracy_score(NB_prediction, y_test)
print('The accuracy of the NaiveBayes model is\t\t\t', nb_accuracy)

From these results, we can see that the Random Forest Classifier performs best among all the models tried.