In [None]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from matplotlib.pyplot import figure
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import statistics
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
import os

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
testData = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')
trainData = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')

# Keep the test Ids aside: the submission file is built from them.
out1 = testData.loc[:, 'Id']

In [None]:
# Preview the first five training rows.
trainData.head(n=5)

In [None]:
# Preview the first five test rows.
testData.head(n=5)

In [None]:
# Display the column labels of the training frame.
trainData.columns

In [None]:
# Total number of missing cells in the training data
# (the same check is run on the test data in the next cell).
trainData.isna().sum().sum()

In [None]:
# Total number of missing cells in the test data.
testData.isna().sum().sum()

In [None]:
# Class-imbalance check: one histogram per Cover_Type group drawn on a shared
# axis, followed by the exact per-class row counts as the cell output.
trainData.groupby("Cover_Type")["Cover_Type"].hist()
trainData.Cover_Type.value_counts()

In [None]:
## Cover type 5 has just 1 training sample, which is of very little help to us,
## so every row of that class is dropped.
# A boolean mask with reassignment replaces drop(..., inplace=True): it does not
# depend on index labels being unique and avoids in-place mutation.
trainData = trainData[trainData['Cover_Type'] != 5]

# EDA 

In [None]:
# Divide the training data into continuous features and one-hot encoded
# indicator features (each kept together with the target) for further EDA.
continuous_cols = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
    'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
]

# Wilderness_Area1..4 followed by Soil_Type1..40, generated instead of
# hand-typed so a single misspelt name cannot slip in.
onehot_cols = (
    [f'Wilderness_Area{i}' for i in range(1, 5)]
    + [f'Soil_Type{i}' for i in range(1, 41)]
)

trainDataSet1 = trainData[continuous_cols + ['Cover_Type']]
trainDataSet2 = trainData[onehot_cols + ['Cover_Type']]

In [None]:
# One box plot per continuous feature, grouped by Cover_Type.
# The target itself is skipped: plotting Cover_Type against Cover_Type
# carries no information.
for column in trainDataSet1.columns.drop('Cover_Type'):
    fig, ax = plt.subplots()
    sns.boxplot(x=trainDataSet1['Cover_Type'], y=trainDataSet1[column], ax=ax)
    # Title each figure so it can be read on its own when skimming the output.
    ax.set_title(f'{column} by Cover_Type')

In [None]:
# Pairwise correlation matrices for, in order: the continuous subset,
# the one-hot subset, and the full training frame.
corr_matrix1, corr_matrix2, corr_matrix3 = (
    frame.corr() for frame in (trainDataSet1, trainDataSet2, trainData)
)

In [None]:
# Correlation heatmap of the continuous features (cell values not annotated).
dataplot1 = sns.heatmap(data=corr_matrix1, annot=False, cmap="YlGnBu")

In [None]:
# Correlation heatmap of the one-hot encoded features (cell values not annotated).
dataplot2 = sns.heatmap(data=corr_matrix2, annot=False, cmap="YlGnBu")

In [None]:
# Correlation heatmap over every column of the training frame (cell values not annotated).
dataplot3 = sns.heatmap(data=corr_matrix3, annot=False, cmap="YlGnBu")

In [None]:
## Remove 'Id', 'Soil_Type7' and 'Soil_Type15' from both the training and the
## test data so the two frames keep the same feature columns.

trainData = trainData.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15'])
testData = testData.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15'])

In [None]:
# Polynomial terms of Elevation: square first, then cube, on both frames.
trainData['Elevation_sq'] = trainData['Elevation'] ** 2
testData['Elevation_sq'] = testData['Elevation'] ** 2
trainData['Elevation_cube'] = trainData['Elevation'] ** 3
testData['Elevation_cube'] = testData['Elevation'] ** 3

In [None]:
## Wrap Aspect (an angle in degrees) into the half-open range [0, 360).
# Vectorized modulo replaces the per-row lambda. Unlike a single +/-360 shift it
# wraps values of any magnitude, and it maps 360 to 0 (the same direction).
trainData['Aspect_2'] = trainData.Aspect % 360
testData['Aspect_2'] = testData.Aspect % 360


In [None]:
# Floor Slope at zero: negative values become 0, everything else is unchanged.
# Series.clip is vectorized, so no Python lambda runs per row.
# NOTE: clip leaves NaN as NaN (the lambda turned it into 0); the null check
# earlier in the notebook is what would reveal whether any NaN exist.
trainData['Slope_wo_negative'] = trainData.Slope.clip(lower=0)
testData['Slope_wo_negative'] = testData.Slope.clip(lower=0)

In [None]:
# Tangent of the slope angle. np.deg2rad converts degrees to radians with the
# exact value of pi instead of the 3.14 approximation.
# NOTE(review): this uses the raw Slope column, not Slope_wo_negative — confirm
# that negative slopes are meant to produce negative tangents.
trainData['slope_tan'] = np.tan(np.deg2rad(trainData.Slope))
testData['slope_tan'] = np.tan(np.deg2rad(testData.Slope))

In [None]:
# Upper cap applied to both clipped distance features below.
max_threshold = 5000

wilderness_cols = ['Wilderness_Area1', 'Wilderness_Area2',
                   'Wilderness_Area3', 'Wilderness_Area4']
hillshade_cols = ['Hillshade_9am', 'Hillshade_3pm', 'Hillshade_Noon']

# Apply identical feature engineering to both frames so their columns (and
# column order) stay aligned. Every step is vectorized; the previous row-wise
# agg(..., axis=1) and lambda maps ran Python code once per row.
for frame in (trainData, testData):
    # Number of wilderness-area indicators set on each row.
    frame['Wilderness_freq'] = frame[wilderness_cols].sum(axis=1)

    # Row-wise mean and spread of the three hillshade readings.
    frame['Hillshade_avg'] = frame[hillshade_cols].mean(axis=1)
    # Sample standard deviation (ddof=1), called explicitly so the result does
    # not depend on how a given pandas version dispatches agg(np.std).
    # NOTE(review): confirm ddof=1 matches the values the model was tuned on.
    frame['Hillshade_std'] = frame[hillshade_cols].std(axis=1)

    # Distances: negative values are made positive, then capped at max_threshold.
    frame['hor_dist_road'] = (
        frame['Horizontal_Distance_To_Roadways'].abs().clip(upper=max_threshold)
    )
    frame['hor_dist_fire'] = (
        frame['Horizontal_Distance_To_Fire_Points'].abs().clip(upper=max_threshold)
    )





In [None]:
# Prepare the modelling inputs: X holds every column except the target,
# y holds the Cover_Type labels.
y = trainData.loc[:, 'Cover_Type']
X = trainData.drop(columns='Cover_Type')

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits them.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. Integer columns are cast
    to the first of int8, int16, int32, int64 whose range contains the column's
    min and max. Float columns are cast to float16 or float32 when the min and
    max fit that type's finite range, otherwise to float64.

    The range checks are inclusive, so a column whose extreme value equals a
    dtype limit (e.g. max == 127) still gets that dtype.

    Note that the float checks look at range only: casting to float16/float32
    rounds values to that type's precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the memory usage after reduction and the percentage
        saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_memory = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Smallest integer type first. If min/max are NaN (empty column)
            # no comparison succeeds and the column keeps its dtype.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            if np.finfo(np.float16).min <= c_min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif np.finfo(np.float32).min <= c_min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached when min/max are NaN (empty or all-NaN column).
                df[col] = df[col].astype(np.float64)
    end_memory = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f"Memory usage of dataframe after reduction {end_memory} MB")
        print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

In [None]:
# Downcast the training features first, then the test features.
X, testData = (reduce_mem_usage(frame) for frame in (X, testData))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Random forest with a fixed seed for reproducibility. n_jobs=-1 lets
# scikit-learn use every available core for fit and predict; it only controls
# parallelism, not the model's hyper-parameters.
tps_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [None]:
# Train the forest on the engineered features and their Cover_Type labels.
tps_clf.fit(X=X, y=y)

In [None]:
# Drop the names of the training frame, its EDA subsets and the correlation
# matrices; none of them is referenced again below.
del (
    trainData,
    trainDataSet1,
    trainDataSet2,
    corr_matrix1,
    corr_matrix2,
    corr_matrix3,
)

In [None]:
# Predict a Cover_Type for every test row.
y_pred = tps_clf.predict(X=testData)

In [None]:
# Assemble the submission: the saved test Ids next to the predicted labels,
# joined column-wise on the index, then written without the index column.
out1 = pd.DataFrame(data=out1, columns=['Id'])
y_pred = pd.DataFrame(data=y_pred, columns=['Cover_Type'])
frame3 = pd.concat(objs=[out1, y_pred], axis=1)
frame3.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Preview the first five rows of the feature matrix the model was trained on.
X.head(n=5)

In [None]:
# Rank the features by the fitted forest's importance scores and show the top 30.
imp_values = tps_clf.feature_importances_

imp_df = pd.DataFrame(
    {'Feature_Name': list(X.columns), 'Values': imp_values}
).sort_values(by='Values', ascending=False)

imp_df.head(n=30)