In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training CSV into a DataFrame
train_csv_path = '/kaggle/input/tabular-playground-series-dec-2021/train.csv'
train_df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Concise summary of the training DataFrame at a glance

train_df.info()

In [None]:
# Dimensions of the training data as (rows, columns)
train_df.shape

In [None]:
# Load the competition's test CSV into a DataFrame
test_csv_path = '/kaggle/input/tabular-playground-series-dec-2021/test.csv'
test_df = pd.read_csv(test_csv_path)

In [None]:
# Preview the first rows of the test data
test_df.head()

In [None]:
# Dimensions of the test data as (rows, columns)
test_df.shape

In [None]:
def checkMissingData(df):
    """Summarise the missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``'Total'`` (number of
        null cells) and ``'%'`` (share of null cells in percent, rounded to one
        decimal). Rows are ordered by ``'Total'`` descending; columns with the
        same count keep the order they have in ``df``.
    """
    # Scan for nulls once and reuse the counts for both output columns.
    null_counts = df.isnull().sum()
    row_count = len(df)

    if row_count:
        null_percent = (null_counts / row_count * 100).round(1)
    else:
        # An empty frame has nothing missing; avoid 0/0 turning into NaN.
        null_percent = null_counts.astype(float)

    missing_data = pd.DataFrame({'Total': null_counts, '%': null_percent})
    # One stable sort keeps both columns aligned and makes tie order deterministic.
    return missing_data.sort_values(by='Total', ascending=False, kind='mergesort')

In [None]:
# Tabulate missing-value counts and percentages for each column of the training data
checkMissingData(train_df)

In [None]:
# Tabulate missing-value counts and percentages for each column of the test data
checkMissingData(test_df)

In [None]:
def getFeatureDf(df):
    """Return the model-input columns of ``df``.

    Every column is kept, in its original order, except 'Cover_Type' and 'Id';
    either of the two may be absent from ``df``.
    """
    excluded = ('Cover_Type', 'Id')
    kept_columns = list(filter(lambda name: name not in excluded, df.columns))
    return df[kept_columns]

In [None]:
# Training feature matrix: every column of train_df except 'Cover_Type' and 'Id'
feature = getFeatureDf(train_df)
feature.head()

In [None]:
# Test feature matrix: every column of test_df except 'Cover_Type' and 'Id'
feature_test = getFeatureDf(test_df)
feature_test.head()

In [None]:
# Dimensions of the test feature matrix as (rows, columns)
feature_test.shape

In [None]:
# Labels to predict: the 'Cover_Type' column of the training data
target = train_df['Cover_Type']
target.head()

In [None]:
# Take independent copies of the features and labels for modelling

X = feature.copy()  # all feature columns (copy() is deep by default)
y = target.copy()   # the matching target/labels

# Dimensions of the input data, then of the output data
print(X.shape, y.shape, sep='\n')

In [None]:
# Hold out part of the training data to estimate model accuracy; the best
# model is later refit for submission.csv.
from sklearn.model_selection import train_test_split

# test_size=0.1 keeps 10% of the rows for evaluation; random_state fixes the shuffle
split_parts = train_test_split(X, y, test_size=0.1, random_state=1)
X_train, X_test, y_train, y_test = split_parts
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Training using Decision Tree Classifier 
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit a decision tree that scores candidate splits by Gini impurity.
# random_state is pinned (same seed as the train/hold-out split) because
# scikit-learn permutes the features at every split, so equally good splits
# could otherwise be chosen differently from run to run.
giniDTCClassifier = DecisionTreeClassifier(criterion='gini', random_state=1)
giniDTCClassifier.fit(X_train, y_train)

# Print the fitted estimator's repr to inspect its settings
print(giniDTCClassifier)

In [None]:
# Fit a second decision tree that scores candidate splits by information gain (entropy).
# random_state is pinned (same seed as the train/hold-out split) because
# scikit-learn permutes the features at every split, so equally good splits
# could otherwise be chosen differently from run to run.
entrophyDTCClassifier = DecisionTreeClassifier(criterion='entropy', random_state=1)
entrophyDTCClassifier.fit(X_train, y_train)

In [None]:
# Predict the hold-out labels with each fitted tree: Gini first, then Entropy
y_pred_1, y_pred_2 = (
    fitted_tree.predict(X_test)
    for fitted_tree in (giniDTCClassifier, entrophyDTCClassifier)
)

In [None]:
from sklearn.metrics import accuracy_score #importing accuracy_score function from sklearn.metrics package

# Get Accuracy for Model
def getAccuracy(message,y_test,y_pred):
    """Print and return the accuracy of ``y_pred`` against ``y_test`` in percent.

    ``message`` is printed first, immediately followed by the percentage and a
    trailing "%" sign. The returned value is the same percentage that was printed.
    """
    percent_correct = accuracy_score(y_test, y_pred) * 100
    print("".join((message, str(percent_correct), "%")))
    return percent_correct

In [None]:
# Accuracy of the Gini tree on the hold-out split ("Test Data" here means X_test / y_test)
acc1 = getAccuracy("Accuracy for Decision Tree Classifier Gini model on Test Data: ",y_test,y_pred_1)

# Accuracy of the Entropy tree on the same hold-out split
acc2 = getAccuracy("Accuracy for Decision Tree Classifier Entropy model on Test Data: ",y_test,y_pred_2)

In [None]:
from sklearn.metrics import confusion_matrix

# Print one confusion matrix per model for the hold-out split
# (rows are the true labels, columns the predicted labels).
for heading, predicted in (
    ('Confusion Matrix for Gini Model: ', y_pred_1),
    ('Confusion Matrix for Entropy Model: ', y_pred_2),
):
    print(heading)
    print(confusion_matrix(y_test, predicted))

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Print the balanced accuracy (in percent) of each model on the hold-out split
for heading, predicted in (
    ('Balanced accuracy for gini Model', y_pred_1),
    ('Balanced accuracy for entropy Model', y_pred_2),
):
    print(heading, balanced_accuracy_score(y_test, predicted)*100, '%')

In [None]:
# Train the submission model with whichever criterion scored the higher
# hold-out accuracy above; a tie keeps 'gini'.
criterion = 'entropy' if acc2 > acc1 else 'gini'

# Refit on the full training data. random_state is pinned so the submitted
# model is reproducible: scikit-learn permutes the features at every split, so
# equally good splits could otherwise be chosen differently from run to run.
classifier = DecisionTreeClassifier(criterion=criterion, random_state=1)
classifier.fit(X, y)

# Print the fitted estimator's repr to inspect its settings
print(classifier)

In [None]:
# Predict labels for the test features with the final classifier

predictions = classifier.predict(feature_test)  

In [None]:
# Dimensions of the competition test data as (rows, columns)
test_df.shape

In [None]:
# Dimensions of the hold-out split taken from the training data (not the competition test set)
X_test.shape

In [None]:
# Pair each test Id with its predicted Cover_Type and write the submission file
submission_columns = {'Id': test_df['Id'], 'Cover_Type': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")