# Imports

In [None]:
# Data Manipulation and Linear Algebra
import pandas as pd
import numpy as np

# Plots
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from catboost import CatBoostClassifier

#ignore warning messages 
import warnings
warnings.filterwarnings('ignore')

# Preprocessing and Analysis

## Getting the Data

In [None]:
# Directory that holds the competition CSV files
Base_Path = "../input/tabular-playground-series-dec-2021/"

# Read train.csv and test.csv from Base_Path, in that order
train, test = (pd.read_csv(f"{Base_Path}{split_name}.csv") for split_name in ("train", "test"))

In [None]:
# Report the row / column counts of both datasets
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape

print(f'''
Training Data
    Rows    : {train_rows}
    Columns : {train_cols}

Testing Data
    Rows    : {test_rows}
    Columns : {test_cols}
''')

The last column in the training data is our target variable, so excluding it we have 55 columns in our data.

## Separating Target Variable from our Training Data

In [None]:
# Pull the label column out on its own; everything else becomes the feature table
train_target = train["Cover_Type"]
train_features = train.drop(columns="Cover_Type")

In [None]:
# Tally the feature columns by dtype family (numeric vs. object)
numeric_col_count = train_features.select_dtypes(include=np.number).shape[1]
object_col_count = train_features.select_dtypes(include=['object']).shape[1]

print(f'''
Count of Numeric Columns : {numeric_col_count}
Count of Object Columns  : {object_col_count}
''')

 - All the columns in our data are of numeric type, so we don't have to encode any object-type data.

## Checking for Null Values

In [None]:
# Number of columns holding at least one missing value, per dataset
train_null_cols = int(train_features.isnull().any().sum())
test_null_cols = int(test.isnull().any().sum())

print(f'''
Count of Columns with Null Values
    Training Data : {train_null_cols}
    Testing Data  : {test_null_cols}
''')

In [None]:
# Summary statistics, transposed so that each feature is shown as a row
train_features.describe().transpose()

 - "Soil_Type7" and "Soil_Type15" contain only a single value (0) for all records, so we drop those columns.
 - We also drop the "Id" column because it is just a secondary index and is of no use as a feature.

In [None]:
# The identifier and the two single-valued soil-type columns are removed
# from both datasets so that they keep the same column layout.
cols_to_drop = ["Id", "Soil_Type7", "Soil_Type15"]
train_features = train_features.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

## Finding Categorical Columns

 - From the table returned by the .describe() method we can see that the first 10 columns, i.e. from "Elevation" to "Horizontal_Distance_To_Fire_Points", are continuous columns, whereas the others are categorical columns containing only 0 and 1.

In [None]:
# Positional split of the feature columns: the first ten vs. the remainder
feature_names = train_features.columns
cont_cols, cate_cols = feature_names[:10], feature_names[10:]

print(f'''
List of Continious Columns :
    {cont_cols}

List of Categorical Columns :
    {cate_cols}
''')

# EDA

## CountPlot for Target Variable

In [None]:
# Bar chart of how many training rows fall into each Cover_Type class.
plt.figure(figsize=(10, 6), dpi=80)
# The Series must be passed as x=: in current seaborn releases the first
# positional argument of countplot is `data`, so a positional Series is not
# counted per class.
sns.countplot(x=train_target)
plt.xlabel("Cover Type", fontsize=14)
plt.ylabel("")
plt.title("Cover Type Value Count", fontdict={"fontweight": "bold", "fontsize": 16})
plt.show()

## KDE Plot (Distribution Plot) for the Continuous Variables

In [None]:
# One density panel per continuous feature, with the train and test curves
# overlaid so the two distributions can be compared directly.
fig, axes = plt.subplots(2, 5, figsize=(25, 10))

# axes.flat walks the 2x5 grid row by row, pairing each panel with a column
for ax, col_name in zip(axes.flat, cont_cols):
    # Data is passed as x= so it is never taken as seaborn's positional `data`
    sns.kdeplot(x=train_features[col_name], ax=ax, color="#5BDE54", label='Train data')
    sns.kdeplot(x=test[col_name], ax=ax, color="#DE5454", label='Test data')

    ax.set_xlabel(col_name.capitalize(), fontsize=8, fontweight='bold')
    ax.set_ylabel('')
    # The curve labels are only visible once a legend is drawn
    ax.legend()

plt.show()

## Countplot for Categorical Variables

In [None]:
# Count plots for every categorical column, train and test drawn on the same
# panel. The grid has 9 x 5 panels; any panel without a column is hidden.
fig, axes = plt.subplots(9, 5, figsize=(25, 50))
panels = axes.ravel()

# zip stops at the shorter of (panels, columns), so no hard-coded column count
for ax, col_name in zip(panels, cate_cols):
    # Pass the Series as x=: positionally it would be taken as `data` by
    # current seaborn releases and the per-value counts would not be drawn.
    sns.countplot(x=train_features[col_name], ax=ax, color="#5BDE54", label='Train data')
    sns.countplot(x=test[col_name], ax=ax, color="#DE5454", label='Test data')

    ax.set_title(f"{col_name.capitalize()} Count Plot", fontdict={"fontweight": "bold"})
    ax.set_xlabel("")
    ax.set_ylabel("")

# Hide the leftover empty panels at the end of the grid
for unused_ax in panels[len(cate_cols):]:
    unused_ax.set_visible(False)

plt.show()

## Correlation Heatmap for Continuous Variables

In [None]:
# Correlation matrix of the continuous features together with the target,
# rendered as an annotated heatmap.
temp_data = pd.concat((train_features[cont_cols], train_target), axis="columns")
corr_matrix = temp_data.corr()

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(data=corr_matrix, cmap="viridis", annot=True)
heatmap_ax.set_title(
    "Coorelation Heatmap - Continious Variables",
    fontdict={"fontsize": 14, "fontweight": "bold"},
)
plt.show()

# Feature Engineering

In [None]:
# Row-wise total of the categorical columns, computed separately per dataset
cat_sum = train_features[cate_cols].sum(axis="columns")
cat_sum_val = test[cate_cols].sum(axis="columns")

# Attach the total as "Cat_Sum" and discard the columns it was built from
train_features = train_features.assign(Cat_Sum=cat_sum).drop(columns=cate_cols)
test = test.assign(Cat_Sum=cat_sum_val).drop(columns=cate_cols)

In [None]:
# Add per-row summary statistics of the continuous columns to both datasets.
# The statistics are always taken over cont_cols only, so the columns added
# here never feed into one another.
for frame in (train_features, test):
    continuous_part = frame[cont_cols]
    frame["mean"] = continuous_part.mean(axis=1)
    frame["std"] = continuous_part.std(axis=1)
    frame["min"] = continuous_part.min(axis=1)
    frame["max"] = continuous_part.max(axis=1)

In [None]:
# Display the training features after feature engineering
train_features

In [None]:
# Display the test features after feature engineering
test

# Modeling

## Scaling the Continuous Variables

In [None]:
# Learn the scaling statistics from the training features, then apply the same
# fitted transform to both datasets.
# NOTE(review): the fit uses every training row, including the rows that the
# later train_test_split holds out for validation — confirm this is acceptable.
standardscaler = StandardScaler()
standardscaler.fit(train_features)

scaled_data_features = standardscaler.transform(train_features)
scaled_val_data = standardscaler.transform(test)

In [None]:
# Display the scaled training feature array
scaled_data_features

In [None]:
# Display the scaled test feature array
scaled_val_data

## Train Test Split

In [None]:
# Hold out 20% of the training rows for validation.
# random_state is fixed so the split (and therefore the reported metrics)
# is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_data_features,
    train_target,
    test_size=0.2,
    random_state=42,
)

X_train.shape, X_test.shape

In [None]:
# CatBoost settings: multiclass objective, GPU training, silent mode
catb_params = dict(
    objective="MultiClass",
    task_type="GPU",
    silent=True,
)

# Build the classifier from the settings above and fit it on the training split
catboostclassifier = CatBoostClassifier(**catb_params)
catboostclassifier.fit(X_train, y_train, verbose=False)

# Predict on the held-out split and print the classification report
y_pred = catboostclassifier.predict(X_test)
validation_report = classification_report(y_test, y_pred)
print(validation_report)

# Submission

## Getting Sample Submission File for Ids

In [None]:
# The sample submission file supplies the Id column for the final output
sample_submission = pd.read_csv(f"{Base_Path}sample_submission.csv")

## Getting Predictions

In [None]:
# Run the trained classifier on the scaled test features
pred = catboostclassifier.predict(scaled_val_data)

In [None]:
# Build the submission table in a single step: Ids come from the sample
# submission file, labels from the model predictions.
# np.ravel flattens the predictions to 1-D first; presumably the multiclass
# predictions arrive as an (n, 1) column vector, and ravel leaves an array
# that is already 1-D unchanged.
submission_df = pd.DataFrame({
    "Id": sample_submission["Id"],
    "Cover_Type": np.ravel(pred),
})

# Saving the Submission file as csv file
submission_df.to_csv("submission.csv", index=False)